Example #1
def test_lahman_batting_teams(lahman_batting, lahman_teams):
    """Verify Lahman batting aggregated to the team level matches Lahman teams."""
    exclude = ['lg_id', 'team_id', 'year', 'g']
    key = ['team_id', 'year']
    cols = (set(lahman_batting.columns)
            & set(lahman_teams.columns)) - set(exclude)
    cols = list(cols)
    assert len(cols) == 12

    # work-around for Pandas 1.0.1 bugs
    # sum does not up-cast for nullable integer types
    # select_dtypes does not distinguish between nullable and non-nullable int types
    idx = lahman_batting[cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()])
    for col in lahman_batting[cols].columns[idx]:
        lahman_batting[col] = lahman_batting[col].astype('Int32')

    idx = lahman_teams[cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()])
    for col in lahman_teams[cols].columns[idx]:
        lahman_teams[col] = lahman_teams[col].astype('Int32')

    b = lahman_batting[key + cols].groupby(key).agg('sum').reset_index()

    t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True)

    # ensure the dtypes are the same
    for col in t.columns:
        if not col == 'team_id' and not col == 'year':
            b[col] = b[col].astype('int')
            t[col] = t[col].astype('int')

    assert b[cols].equals(t[cols])
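A minimal sketch of that work-around in isolation (made-up values; assumes only pandas): widening a narrow nullable unsigned column to Int32 before summing avoids the Pandas 1.0.x aggregation bug noted in the comments above.

import pandas as pd

s = pd.Series([200, 100, None], dtype=pd.UInt8Dtype())
print(s.astype('Int32').sum())  # 300 -- NA is skipped, result exceeds uint8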
Example #2
    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """
        Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaNs in their grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A new DF with a new column - "score".
        """
        new_data = self.data.copy()
        new_data["score"] = pd.NA

        for index, row in new_data.iterrows():
            grades = row[['q1', 'q2', 'q3', 'q4', 'q5']].astype(float)
            # count actual NaN values, not occurrences of the string "nan"
            if grades.isna().sum() > maximal_nans_per_sub:
                new_data.loc[index, "score"] = pd.NA
            else:
                # mean skips NaNs; round down per the docstring
                new_data.loc[index, "score"] = np.uint8(np.floor(grades.mean()))

        new_data["score"] = new_data["score"].astype(pd.UInt8Dtype())
        return new_data
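A made-up five-question grade sheet to exercise the scoring rule (column names q1..q5 follow the snippet): the second subject has two NaNs, so with the default threshold of 1 its score becomes NA.

import numpy as np
import pandas as pd

grades = pd.DataFrame({'q1': [10.0, np.nan], 'q2': [7.0, np.nan],
                       'q3': [8.5, 9.0], 'q4': [9.0, 9.0], 'q5': [6.0, 9.0]})
score = np.floor(grades.mean(axis=1)).astype(pd.UInt8Dtype())
score[grades.isna().sum(axis=1) > 1] = pd.NA
print(score.tolist())  # [8, <NA>] -- mean 8.1 floors to 8; two NaNs force NA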
Example #3
    def coerce_not_float_cols_nans(cls, self):
        """Coerce cols with floats and nans to the correct integer dtype."""
        cols = self.not_float_cols_nans

        # use inclusive bounds: 255 fits uint8, -128 fits int8, and so on
        for col in cols:
            col_min = self.df[col].min()
            col_max = self.df[col].max()
            if col_min >= 0:
                if col_max <= 255:
                    self.df[col] = self.df[col].astype(pd.UInt8Dtype())
                elif col_max <= 65535:
                    self.df[col] = self.df[col].astype(pd.UInt16Dtype())
                elif col_max <= 4294967295:
                    self.df[col] = self.df[col].astype(pd.UInt32Dtype())
            else:
                if col_min >= -128 and col_max <= 127:
                    self.df[col] = self.df[col].astype(pd.Int8Dtype())
                elif col_min >= -32768 and col_max <= 32767:
                    self.df[col] = self.df[col].astype(pd.Int16Dtype())
                elif col_min >= -2147483648 and col_max <= 2147483647:
                    self.df[col] = self.df[col].astype(pd.Int32Dtype())
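A quick check of the boundary logic (made-up values; assumes pandas >= 1.0): 255 still fits a nullable unsigned byte, so a float column with NaNs in that range can be cast straight to UInt8.

import pandas as pd

s = pd.Series([0.0, 17.0, 255.0, None])
print(s.astype(pd.UInt8Dtype()).dtype)  # UInt8, with the NaN held as <NA>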
Example #4
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
Example #5
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
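Assuming pyarrow is installed, the helper above resolves Arrow types to their pandas extension equivalents and returns None for anything unmapped:

import pyarrow as pa

print(pyarrow2pandas_extension(pa.uint8()))    # UInt8
print(pyarrow2pandas_extension(pa.float64()))  # None -- floats are not mapped here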
Example #6
    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
       """Calculates the average score of a subject and adds a new "score" column
       with it.

       If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
       score should be NA. Otherwise, the score is simply the mean of the other grades.
       The datatype of score is UInt8, and the floating point raw numbers should be
       rounded down.

       Parameters
       ----------
       maximal_nans_per_sub : int, optional
           Number of allowed NaNs per subject before giving a NA score.

       Returns
       -------
       pd.DataFrame
           A new DF with a new column - "score".
       """
       df = self.data.copy()
       grades = df.iloc[:,-5:]
       df["score"] = grades.mean(axis = 1).apply(np.floor).astype(pd.UInt8Dtype())
       null_grades = pd.isnull(grades).sum(axis = 1)
       df.loc[null_grades > maximal_nans_per_sub, "score"] = pd.NA
       return df
Example #7
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # note: dict.update returns None, so build the map first, then extend it
        self._BOOL_MAP_DICT = {i: True for i in BOOL_STRINGS_TRUE}
        self._BOOL_MAP_DICT.update({i: False for i in BOOL_STRINGS_FALSE})
        self._DTYPE_BOOL_BASE = np.bool_
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
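The limit lists above come straight from np.iinfo; for example, the uint8 and int8 bounds that decide between the unsigned and signed nullable dtypes:

import numpy as np

print(np.iinfo(np.uint8).min, np.iinfo(np.uint8).max)  # 0 255
print(np.iinfo(np.int8).min, np.iinfo(np.int8).max)    # -128 127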
Example #8
def add_freq_notes_df(sid, ssfs_df):
    real_freqs = {freq: freq * sid.freq_scaler for freq in ssfs_df['freq1'].unique() if pd.notna(freq)}
    closest_notes = {real_freq: closest_midi(real_freq)[1] for real_freq in real_freqs.values()}
    freq_map = [(freq, real_freq, closest_notes[real_freq]) for freq, real_freq in real_freqs.items()]
    freq_map.extend([(pd.NA, pd.NA, pd.NA)])
    freq_notes_df = pd.DataFrame.from_records(freq_map, columns=['freq1', 'real_freq', 'closest_note']).astype(pd.Float64Dtype())
    freq_notes_df['freq1'] = freq_notes_df['freq1'].astype(pd.UInt16Dtype())
    freq_notes_df['closest_note'] = freq_notes_df['closest_note'].astype(pd.UInt8Dtype())
    return set_sid_dtype(ssfs_df).merge(freq_notes_df, how='left', on='freq1')
Example #9
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
Example #10
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed)"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #11
def set_sid_dtype(df):
    # Note: assigning to df.dtype on a DataFrame does not change any dtypes
    # (it only attaches an attribute), so unmatched columns keep their dtype.
    for col in df.columns:
        if col.startswith('freq') or col.startswith(
                'pwduty') or col == 'fltcoff':
            col_type = pd.UInt16Dtype()
        elif col[-1].isdigit() or col.startswith('flt'):
            col_type = pd.UInt8Dtype()
        else:
            continue
        df[col] = df[col].astype(col_type)
    return df
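A hypothetical frame to exercise the rules above (column names invented to match the prefixes): freq1 widens to UInt16, atk1 to UInt8, and unmatched columns keep their dtype.

import pandas as pd

df = pd.DataFrame({'freq1': [440, 880], 'atk1': [0, 15], 'label': ['a', 'b']})
print(set_sid_dtype(df).dtypes)  # freq1 UInt16, atk1 UInt8, label object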
Example #12
 def test_numeric_dtypes(self):
     dtypes = [
         bool,
         np.byte,
         np.ubyte,
         np.short,
         np.ushort,
         np.single,
         np.int32,
         np.intc,
         np.half,
         np.float16,
         np.double,
         np.float64,
         pd.StringDtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".xml", "xml"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         with tmpfile(suffix) as path:
             for dtype in dtypes:
                 try:
                     df = Ind2Col2.convert(Ind2Col2(
                         sample_data_ind2_col2())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Example #13
def test_lahman_retro_fielding_data(fielding, lahman_fielding):
    """Compare Aggregated Lahman fielding per position data to
    Aggregated Retrosheet fielding per position data."""
    # find the common columns
    f_cols = set(lahman_fielding.columns) & set(fielding.columns)
    f_cols -= {'player_id', 'pos', 'team_id', 'year'}
    f_cols = list(f_cols)

    # work-around for Pandas 1.0.1 bugs
    # sum does not up-cast for nullable integer types
    # select_dtypes does not distinguish between nullable and non-nullable int types
    idx = lahman_fielding[f_cols].dtypes.isin(
        [pd.UInt8Dtype(), pd.UInt16Dtype()])
    for col in lahman_fielding[f_cols].columns[idx]:
        lahman_fielding[col] = lahman_fielding[col].astype('Int32')

    l_sums = lahman_fielding.groupby('pos')[f_cols].agg('sum')
    l_sums.sort_index(inplace=True)

    # there are 7 fielding attributes and 7 fielding positions in Lahman
    assert l_sums.shape == (7, 7)

    r_sums = fielding.groupby('pos')[f_cols].agg('sum').astype('int')

    # Lahman uses OF for sum of LF, CF, RF
    r_sums.loc['OF'] = r_sums.loc['LF'] + r_sums.loc['CF'] + r_sums.loc['RF']
    r_sums = r_sums.drop(['LF', 'CF', 'RF'])
    r_sums.sort_index(inplace=True)

    # there are now 7 fielding attributes and 7 fielding positions in Retrosheet sums
    assert r_sums.shape == (7, 7)

    # the indexes and columns should now be the same
    assert l_sums.index.equals(r_sums.index)
    assert l_sums.columns.equals(r_sums.columns)

    filt = fielding['pos'].isin(['LF', 'CF', 'RF'])
    r_of = fielding[filt]

    # account for outfielders who played more than 1 outfield position in the same game
    total_dups = r_of.duplicated(subset=['player_id', 'game_id'],
                                 keep=False).sum()
    counted_dups = r_of.duplicated(subset=['player_id', 'game_id'],
                                   keep='first').sum()
    r_sums.loc['OF', 'g'] -= (total_dups - counted_dups)

    rel_accuracy = l_sums / r_sums

    # relative accuracy is within 0.8% for all 49 aggregated values
    assert (np.abs(1.0 - rel_accuracy) < 0.008).all().all()
Example #14
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #15
 def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
     df = self.data
     values = []
     for index, row in df.iterrows():
         count = 0
         for i in range(1,6):
             if math.isnan(row['q{0}'.format(i)]):
                 count +=1
         if count > maximal_nans_per_sub:
             values.append(None)
         else:
             values.append(int(np.nanmean([row['q1'],row['q2'],row['q3'],row['q4'],row['q5']])))
     df['score'] = values
     df['score'] = df['score'].astype(pd.UInt8Dtype())
     self.data = df
     return self.data
Example #16
 def test_numeric_nullable_dtypes(self):
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Example #17
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(required=True,
                                                           unique=None,
                                                           min_=min_,
                                                           max_=max_)
        yield with_nullable(
            True, check_dtype), "integer", constraints, expected_pandas_type
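For instance, the (0, 240) range above maps to pd.UInt8Dtype() because the nullable unsigned byte spans 0..255 (made-up values; assumes only pandas):

import pandas as pd

s = pd.Series([0, 240, None], dtype=pd.UInt8Dtype())
print(s.dtype, s.max())  # UInt8 240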
Example #18
    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaNs in their grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A new DF with a new column - "score".
        """

        data_df = self.read_data()  # put the data into a DataFrame
        questions_ans = data_df.loc[:, 'q1':'q5']  # the five grade columns
        number_of_nan = questions_ans.isnull().sum(axis=1).tolist()
        floored_mean_score = np.floor(questions_ans.mean(axis=1))

        for idx, num in enumerate(number_of_nan):
            if num > maximal_nans_per_sub:
                floored_mean_score[idx] = None

        data_df['score'] = floored_mean_score.astype(pd.UInt8Dtype())

        return data_df
Example #19
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
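Assuming pandas and pyarrow are imported as at the top of this snippet, the mapping can be probed directly:

print(pyarrow_dtypes_to_pandas_dtypes[pa.uint8()])  # UInt8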
Example #20
    sliced_sr[3] = None
    assert sliced_sr.memory_usage() == 80

    sr = cudf.Series(["hello world", "rapids ai", "abc", "z"])
    assert sr.memory_usage() == 44

    assert sr[3:].memory_usage() == 9  # z
    assert sr[:1].memory_usage() == 19  # hello world


@pytest.mark.parametrize(
    "sr,expected_psr",
    [
        (
            cudf.Series([1, 2, None, 3], dtype="uint8"),
            pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
        ),
        (
            cudf.Series([23, None, None, 32], dtype="uint16"),
            pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
        ),
        (
            cudf.Series([None, 123, None, 1], dtype="uint32"),
            pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
        ),
        (
            cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
            pd.Series([234, 2323, 23432, None, None, 224],
                      dtype=pd.UInt64Dtype()),
        ),
        (
Example #21
def pandas_type_casting(df):
    import numpy as np
    import pandas as pd
    global n
    # df = pd.read_csv("users-isprep.zip")
    old = df.memory_usage() / 1024 / 1024

    # numeric cols: record each column's min/max (get_cols_names, defined
    # elsewhere, reads the global `n` and fills the global dtype map `d`
    # for columns whose values fit the given range)
    number_cols = list(df.select_dtypes("number").columns)
    n = df[number_cols].fillna(0).agg([min, max]).T.add_suffix("_")

    get_cols_names(0, 255, pd.UInt8Dtype())
    get_cols_names(256, 65535, pd.UInt16Dtype())
    get_cols_names(65536, 4294967295, pd.UInt32Dtype())

    get_cols_names(-128, 127, pd.Int8Dtype())
    get_cols_names(-32768, 32767, pd.Int16Dtype())
    get_cols_names(-2147483648, 2147483647, pd.Int32Dtype())

    # date and categorical columns
    categorical_cols = list(df.select_dtypes("O").columns)
    date_cols = []
    for i in categorical_cols:
        x = df[i][~df[i].isna()].head()
        try:
            pd.to_datetime(x)
            date_cols.append(i)
        except Exception:
            pass
    categorical_cols = [i for i in categorical_cols if i not in date_cols]

    # mark low-cardinality object columns (< 5% unique) as category
    c = df[categorical_cols].apply(lambda x: x.nunique() / len(df) * 100)
    for i in c[c < 5].index:
        d[i] = "category"

    # del df
    # df = pd.read_csv("users-isprep.zip", parse_dates=date_cols, dtype=d)
    for i in d:
        df[i] = df[i].astype(d[i])
    new = df.memory_usage() / 1024 / 1024

    m = pd.DataFrame({"new": new,
                      "old": old,
                      "Improvement": old - new})
    m['Dtype'] = [None] + list(df[list(new.index.drop("Index"))].dtypes.astype(str).values)

    c = df[categorical_cols].apply(lambda x: x.nunique() / len(df) * 100)

    m["nunique"] = None
    m.loc[c.index, "nunique"] = list(df[c.index].apply(lambda x: x.nunique() / len(df) * 100).values)

    print("Before :", round(m.old.sum()))
    print("After  :", round(m.new.sum()))
    print("Diff   :", round(m.Improvement.sum()))
    print("Diff % :", round(m.Improvement.sum() / m.old.sum(), 2))

    print("\n\nImprovement:")
    print(m.groupby("Dtype").Improvement.agg([min, max, sum, np.mean, np.median, "count"]))

    print("\n\nDetailed Summary:")
    print(m.to_string())
    return df
Example #22
    'float16': (parquet_thrift.Type.FLOAT, None, 16),
}

revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64
}

pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    pd.BooleanDtype(): np.bool_
}


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """ Get appropriate typecodes for column dtype

    Data conversion do not happen here, see convert().

    The user is expected to transform their data into the appropriate dtype
    before saving to parquet, we will not make any assumptions for them.
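A round-trip sanity check for the typemap above (assumes numpy and pandas are imported as in the snippet): extension-dtype instances hash equal, so a fresh instance works as a lookup key.

assert pdoptional_to_numpy_typemap[pd.UInt8Dtype()] is np.uint8
assert pdoptional_to_numpy_typemap[pd.BooleanDtype()] is np.bool_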
Example #23
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool_: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

# Default mapping between SQL types and python types
# for values
_SQL_TO_PYTHON_SCALARS = {
    "DOUBLE": np.float64,
    "FLOAT": np.float32,
    "DECIMAL": np.float32,
    "BIGINT": np.int64,
    "INTEGER": np.int32,
Example #24
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
Example #25
class UINT8(UINT16):
    """Semantic representation of a :class:`pandas.UInt8Dtype`."""

    type = pd.UInt8Dtype()
    bit_width: int = 8
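Given the class definition above, the wrapped pandas dtype and its width stay reachable as class attributes:

print(UINT8.type)       # UInt8
print(UINT8.bit_width)  # 8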
Example #26
    """Semantic representation of a :class:`pandas.UInt32Dtype`."""

    type = pd.UInt32Dtype()
    bit_width: int = 32


@Engine.register_dtype(equivalents=[pd.UInt16Dtype, pd.UInt16Dtype()])
@immutable
class UINT16(UINT32):
    """Semantic representation of a :class:`pandas.UInt16Dtype`."""

    type = pd.UInt16Dtype()
    bit_width: int = 16


@Engine.register_dtype(equivalents=[pd.UInt8Dtype, pd.UInt8Dtype()])
@immutable
class UINT8(UINT16):
    """Semantic representation of a :class:`pandas.UInt8Dtype`."""

    type = pd.UInt8Dtype()
    bit_width: int = 8


###############################################################################
# float
###############################################################################

_register_numpy_numbers(
    builtin_name="float",
    pandera_name="Float",
Example #27
    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
Example #28
    parquet_thrift.ConvertedType.INT_8: np.dtype("int8"),
    parquet_thrift.ConvertedType.INT_16: np.dtype("int16"),
    parquet_thrift.ConvertedType.INT_32: np.dtype('int32'),
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}
nullable = {
    np.dtype('int8'): pd.Int8Dtype(),
    np.dtype('int16'): pd.Int16Dtype(),
    np.dtype('int32'): pd.Int32Dtype(),
    np.dtype('int64'): pd.Int64Dtype(),
    np.dtype('uint8'): pd.UInt8Dtype(),
    np.dtype('uint16'): pd.UInt16Dtype(),
    np.dtype('uint32'): pd.UInt32Dtype(),
    np.dtype('uint64'): pd.UInt64Dtype(),
    np.dtype('bool'): pd.BooleanDtype()
}
pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
    "UInt16": pd.UInt16Dtype(),
    "UInt32": pd.UInt32Dtype(),
    "UInt64": pd.UInt64Dtype(),
    "boolean": pd.BooleanDtype()
Example #29
    "int64": "Int64",
    "int32": "Int32",
    "int16": "Int16",
    "int8": "Int8",
    "datetime64[D]": "Date",
    "datetime64[ns]": "DateTime",
}

PD2CH = keymap(np.dtype, MAPPING)

PD_INT_TYPES = [
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.Int64Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype(),
    pd.UInt32Dtype(),
    pd.UInt64Dtype(),
]

for typ in PD_INT_TYPES:
    PD2CH[typ] = f"Nullable({typ.name})"
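# For example, pd.UInt8Dtype().name == "UInt8", so the loop above yields
# PD2CH[pd.UInt8Dtype()] == "Nullable(UInt8)".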

CH2PD = itemmap(reversed, MAPPING)
CH2PD["Null"] = "object"
CH2PD["Nothing"] = "object"

NULLABLE_COLS = [
    "Float64",
    "Float32",
Example #30
        ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], {}, {},),
    ],
)
def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
    pyarrow_data = pa.array(data, **pyarrow_kwargs)
    cudf_from_pyarrow = as_column(pyarrow_data)
    expected = as_column(data, **cudf_kwargs)
    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]