Python UInt8Dtype Beispiele, pandas.UInt8Dtype Python Beispiele

Beispiel #1

0

Datei anzeigen

def test_lahman_batting_teams(lahman_batting, lahman_teams):
    """Check that Lahman batting, summed per team and year, equals Lahman teams.

    Only the columns present in both frames (minus the key and a few
    excluded columns) are compared; exactly 12 such columns are expected.
    Note that narrow unsigned columns of both input frames are re-cast to
    ``Int32`` in place.
    """
    key = ['team_id', 'year']
    not_compared = {'lg_id', 'team_id', 'year', 'g'}
    shared = set(lahman_batting.columns) & set(lahman_teams.columns)
    cols = list(shared - not_compared)
    assert len(cols) == 12

    # work-around for Pandas 1.0.1 bugs
    # sum does not up-cast for nullable integer types
    # select_dtypes does not distinguish between nullable and non-nullable int types
    for frame in (lahman_batting, lahman_teams):
        is_narrow = frame[cols].dtypes.isin(
            [pd.UInt8Dtype(), pd.UInt16Dtype()])
        for col in frame[cols].columns[is_narrow]:
            frame[col] = frame[col].astype('Int32')

    # one row per (team_id, year) on both sides, in the same order
    b = lahman_batting[key + cols].groupby(key).agg('sum').reset_index()
    t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True)

    # ensure the dtypes are the same
    for col in t.columns:
        if col in ('team_id', 'year'):
            continue
        b[col] = b[col].astype('int')
        t[col] = t[col].astype('int')

    assert b[cols].equals(t[cols])

Beispiel #2

0

Datei anzeigen

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """
        Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        The grades are read from the columns "q1" to "q5" and converted to
        float before counting, so real NaN values and the string "nan" are
        both treated as missing. ``self.data`` is not modified; the result is
        built on a copy.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A new DF with a new column - "score".
        """
        question_cols = ['q1', 'q2', 'q3', 'q4', 'q5']
        new_data = self.data.copy()

        grades = new_data[question_cols].astype(float)
        too_many_nans = grades.isna().sum(axis=1) > maximal_nans_per_sub

        # mean() skips NaN grades; floor implements "rounded down"
        floored_mean = np.floor(grades.mean(axis=1))

        # blank out disqualified rows while still float, then cast once so the
        # NaNs become pd.NA in the nullable UInt8 column
        new_data["score"] = floored_mean.where(~too_many_nans).astype(
            pd.UInt8Dtype())
        return new_data

Beispiel #3

0

Datei anzeigen

    def coerce_not_float_cols_nans(cls, self):
        """Coerce cols with floats and nans to the correct integer dtype.

        For every column named in ``self.not_float_cols_nans`` the column's
        minimum and maximum are read from ``self.df`` and the column is cast
        to the narrowest pandas nullable integer dtype (up to 32 bits) whose
        inclusive range contains both. Non-negative columns get an unsigned
        dtype, all others a signed one. A column that fits no candidate
        (including a column whose min/max are NaN) is left unchanged.
        """
        # (nullable dtype, smallest representable value, largest representable value)
        unsigned_candidates = (
            (pd.UInt8Dtype(), 0, 2 ** 8 - 1),
            (pd.UInt16Dtype(), 0, 2 ** 16 - 1),
            (pd.UInt32Dtype(), 0, 2 ** 32 - 1),
        )
        signed_candidates = (
            (pd.Int8Dtype(), -2 ** 7, 2 ** 7 - 1),
            (pd.Int16Dtype(), -2 ** 15, 2 ** 15 - 1),
            (pd.Int32Dtype(), -2 ** 31, 2 ** 31 - 1),
        )

        for col in self.not_float_cols_nans:
            col_min = self.df[col].min()
            col_max = self.df[col].max()

            # NaN compares False, so an all-NaN column falls to the signed
            # candidates and then matches none of them
            if col_min >= 0:
                candidates = unsigned_candidates
            else:
                candidates = signed_candidates

            for dtype, lowest, highest in candidates:
                # inclusive bounds: a value equal to the dtype limit still fits
                if lowest <= col_min and col_max <= highest:
                    self.df[col] = self.df[col].astype(dtype)
                    break

Beispiel #4

0

Datei anzeigen

def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    """Return the dtype to use for each ``IntegerType`` member.

    With ``use_extension_types`` false the values are NumPy scalar types;
    otherwise they are pandas nullable integer dtype instances. The 24-bit
    members are mapped onto 32-bit dtypes in both tables.
    """
    if not use_extension_types:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }

    return {
        IntegerType.INT8: pd.Int8Dtype(),
        IntegerType.UINT8: pd.UInt8Dtype(),
        IntegerType.INT16: pd.Int16Dtype(),
        IntegerType.UINT16: pd.UInt16Dtype(),
        IntegerType.INT24: pd.Int32Dtype(),
        IntegerType.UINT24: pd.Int32Dtype(),
        IntegerType.INT32: pd.Int32Dtype(),
        IntegerType.UINT32: pd.UInt32Dtype(),
        IntegerType.INT64: pd.Int64Dtype(),
        IntegerType.UINT64: pd.UInt64Dtype(),
    }

Beispiel #5

0

Datei anzeigen

Datei: _data_types.py Projekt: telegit/aws-data-wrangler

def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Translate a pyarrow data type into a pandas extension dtype.

    Returns a fresh pandas nullable dtype instance for the integer, boolean
    and string pyarrow types, and ``None`` for every other type.
    """
    # checked in order; the first predicate that accepts ``dtype`` wins
    conversions = (
        (pa.types.is_int8, pd.Int8Dtype),
        (pa.types.is_int16, pd.Int16Dtype),
        (pa.types.is_int32, pd.Int32Dtype),
        (pa.types.is_int64, pd.Int64Dtype),
        (pa.types.is_uint8, pd.UInt8Dtype),
        (pa.types.is_uint16, pd.UInt16Dtype),
        (pa.types.is_uint32, pd.UInt32Dtype),
        (pa.types.is_uint64, pd.UInt64Dtype),
        (pa.types.is_boolean, pd.BooleanDtype),
        (pa.types.is_string, pd.StringDtype),
    )
    for matches, make_dtype in conversions:
        if matches(dtype):
            return make_dtype()
    return None

Beispiel #6

0

Datei anzeigen

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Add a floored mean "score" column computed from the last five columns.

        The grades are taken positionally: the last five columns of
        ``self.data``. A subject with more than ``maximal_nans_per_sub``
        missing grades gets NA; otherwise the score is the mean of the
        available grades, rounded down. The column dtype is UInt8.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A copy of the data with the extra "score" column.
        """
        scored = self.data.copy()
        last_five = scored.iloc[:, -5:]

        row_means = last_five.mean(axis=1)
        scored["score"] = row_means.apply(np.floor).astype(pd.UInt8Dtype())

        missing_per_row = last_five.isnull().sum(axis=1)
        disqualified = missing_per_row > maximal_nans_per_sub
        scored.loc[disqualified, "score"] = pd.NA
        return scored

Beispiel #7

0

Datei anzeigen

    def __init__(self, pandas_obj):
        """Validate ``pandas_obj``, keep a reference to it and set up dtype tables.

        Besides storing the object, this defines the constants used for
        dtype inference: the recognised null markers, the string-to-bool
        map, and, per numeric family, the base NumPy types, the pandas
        nullable counterparts and their value limits.
        """
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # Build the map in a single expression: dict.update() mutates in place
        # and returns None, so chaining it would store None here.
        self._BOOL_MAP_DICT = {
            **{i: True for i in BOOL_STRINGS_TRUE},
            **{i: False for i in BOOL_STRINGS_FALSE},
        }
        # np.bool_ is the NumPy boolean scalar type; the bare np.bool alias
        # was removed in NumPy 1.24.
        self._DTYPE_BOOL_BASE = np.bool_
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()

Beispiel #8

0

Datei anzeigen

Datei: ssf.py Projekt: anarkiwi/desidulate

def add_freq_notes_df(sid, ssfs_df):
    """Left-join per-frequency note information onto ``ssfs_df`` via 'freq1'.

    For every distinct non-null 'freq1' value the scaled frequency
    (``freq * sid.freq_scaler``) and the second element returned by
    ``closest_midi`` for it are collected into a lookup frame, together with
    one all-NA row. The lookup frame is then merged onto the dtype-normalised
    ``ssfs_df``.
    """
    scaled_by_freq = {
        raw: raw * sid.freq_scaler
        for raw in ssfs_df['freq1'].unique()
        if pd.notna(raw)
    }
    note_by_scaled = {
        scaled: closest_midi(scaled)[1]
        for scaled in scaled_by_freq.values()
    }
    records = [
        (raw, scaled, note_by_scaled[scaled])
        for raw, scaled in scaled_by_freq.items()
    ]
    # extra all-NA row so that a missing 'freq1' also has a lookup entry
    records.append((pd.NA, pd.NA, pd.NA))

    notes_df = pd.DataFrame.from_records(
        records, columns=['freq1', 'real_freq', 'closest_note'])
    notes_df = notes_df.astype(pd.Float64Dtype())
    notes_df['freq1'] = notes_df['freq1'].astype(pd.UInt16Dtype())
    notes_df['closest_note'] = notes_df['closest_note'].astype(
        pd.UInt8Dtype())

    typed_ssfs = set_sid_dtype(ssfs_df)
    return typed_ssfs.merge(notes_df, how='left', on='freq1')

Beispiel #9

0

Datei anzeigen

Datei: pandas_test.py Projekt: eganjs/data-science-types

def test_intdtypes() -> None:
    """Instantiate every pandas nullable integer dtype once, signed then unsigned."""
    constructors = (
        pd.Int8Dtype,
        pd.Int16Dtype,
        pd.Int32Dtype,
        pd.Int64Dtype,
        pd.UInt8Dtype,
        pd.UInt16Dtype,
        pd.UInt32Dtype,
        pd.UInt64Dtype,
    )
    for make_dtype in constructors:
        make_dtype()

Beispiel #10

0

Datei anzeigen

Datei: parquet.py Projekt: zjkanjie/pandas

    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        """Read a parquet file through ``self.api`` and return the pandas result.

        Parameters
        ----------
        path
            Location passed to ``_get_path_or_handle``.
        columns : optional
            Forwarded to ``read_table`` as ``columns``.
        use_nullable_dtypes : bool, default False
            When true, a ``types_mapper`` is passed to ``to_pandas`` that maps
            the integer, boolean and string types of ``self.api`` to pandas
            nullable dtypes.
        storage_options : optional
            Forwarded to ``_get_path_or_handle``.
        **kwargs
            Forwarded to ``read_table``; ``use_pandas_metadata`` is forced to
            True and ``filesystem`` is replaced by the resolved filesystem.

        Raises
        ------
        ValueError
            If ``use_nullable_dtypes`` is requested and the version of
            ``self.api`` is older than 0.16.
        """
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                # types not in the mapping yield None from .get
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                # message previously lacked the closing parenthesis
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed)"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            # close any handle opened above, whether or not the read succeeded
            if handles is not None:
                handles.close()

Beispiel #11

0

Datei anzeigen

Datei: sidlib.py Projekt: anarkiwi/desidulate

def set_sid_dtype(df):
    """Cast columns of ``df`` to nullable unsigned dtypes chosen by column name.

    Columns starting with 'freq' or 'pwduty', and the column 'fltcoff',
    become UInt16. Of the remaining columns, those whose name ends in a
    digit or starts with 'flt' become UInt8. All other columns are left
    alone. The frame is modified in place and returned.
    """
    # NOTE(review): this stores an attribute named ``dtype`` on the frame; it
    # does not look like it converts any column - confirm intent.
    df.dtype = pd.UInt64Dtype()
    for col in df.columns:
        is_wide = col.startswith(('freq', 'pwduty')) or col == 'fltcoff'
        if is_wide:
            df[col] = df[col].astype(pd.UInt16Dtype())
        elif col[-1].isdigit() or col.startswith('flt'):
            df[col] = df[col].astype(pd.UInt8Dtype())
    return df

Beispiel #12

0

Datei anzeigen

 def test_numeric_dtypes(self):
     """Round-trip the sample frame through every file format for each dtype.

     For each (suffix, format) pair a temporary path is created; for each
     dtype the sample frame is cast, written with ``to_<format>`` and read
     back with ``Ind2Col2.read_<format>``. Index and column names are
     checked before writing and after reading.
     """
     dtypes = [
         bool,
         np.byte,
         np.ubyte,
         np.short,
         np.ushort,
         np.single,
         np.int32,
         np.intc,
         np.half,
         np.float16,
         np.double,
         np.float64,
         pd.StringDtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     formats = [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".xml", "xml"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]
     expected_index = ["qqq", "rrr"]
     expected_columns = ["abc", "xyz"]
     for suffix, fn in formats:
         # one temporary path per format, reused for every dtype
         with tmpfile(suffix) as path:
             for dtype in dtypes:
                 try:
                     typed = Ind2Col2(sample_data_ind2_col2())
                     df = Ind2Col2.convert(typed).astype(dtype)
                     assert list(df.index.names) == expected_index
                     assert list(df.columns) == expected_columns
                     write = getattr(df, "to_" + fn)
                     write(path)
                     read = getattr(Ind2Col2, "read_" + fn)
                     df2 = read(path)
                     assert list(df2.index.names) == expected_index
                     assert list(df2.columns) == expected_columns
                 except Exception:
                     # record which combination failed, then propagate
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise

Beispiel #13

0

Datei anzeigen

def test_lahman_retro_fielding_data(fielding, lahman_fielding):
    """Check per-position Lahman fielding totals against Retrosheet totals.

    Both sources are summed per position over their common stat columns.
    The Retrosheet outfield positions are folded into 'OF' to match Lahman,
    and every aggregated value must agree to within 0.8%.
    """
    # find the common columns
    id_cols = {'player_id', 'pos', 'team_id', 'year'}
    common = set(lahman_fielding.columns) & set(fielding.columns)
    f_cols = list(common - id_cols)

    # work-around for Pandas 1.0.1 bugs
    # sum does not up-cast for nullable integer types
    # select_dtypes does not distinguish between nullable and non-nullable int types
    is_narrow = lahman_fielding[f_cols].dtypes.isin(
        [pd.UInt8Dtype(), pd.UInt16Dtype()])
    for col in lahman_fielding[f_cols].columns[is_narrow]:
        lahman_fielding[col] = lahman_fielding[col].astype('Int32')

    l_sums = lahman_fielding.groupby('pos')[f_cols].agg('sum')
    l_sums = l_sums.sort_index()

    # there are 7 fielding attributes and 7 fielding positions in Lahman
    assert l_sums.shape == (7, 7)

    r_sums = fielding.groupby('pos')[f_cols].agg('sum').astype('int')

    # Lahman uses OF for sum of LF, CF, RF
    outfield = ['LF', 'CF', 'RF']
    r_sums.loc['OF'] = r_sums.loc['LF'] + r_sums.loc['CF'] + r_sums.loc['RF']
    r_sums = r_sums.drop(outfield)
    r_sums = r_sums.sort_index()

    # there are now 7 fielding attributes and 7 fielding positions in Retrosheet sums
    assert r_sums.shape == (7, 7)

    # the indexes and columns should now be the same
    assert l_sums.index.equals(r_sums.index)
    assert l_sums.columns.equals(r_sums.columns)

    r_of = fielding[fielding['pos'].isin(outfield)]

    # account for outfielders who played more than 1 outfield position in the same game
    same_game = ['player_id', 'game_id']
    total_dups = r_of.duplicated(subset=same_game, keep=False).sum()
    counted_dups = r_of.duplicated(subset=same_game, keep='first').sum()
    r_sums.loc['OF', 'g'] -= (total_dups - counted_dups)

    rel_accuracy = l_sums / r_sums

    # relative accuracy is within 0.8% for all 49 aggregated values
    assert (np.abs(1.0 - rel_accuracy) < 0.008).all().all()

Beispiel #14

0

Datei anzeigen

Datei: parquet.py Projekt: tnir/pandas

    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        """Read a parquet file through ``self.api`` and return it as a DataFrame.

        ``use_pandas_metadata`` is always forced to True in ``kwargs``. When
        ``use_nullable_dtypes`` is true, a ``types_mapper`` is handed to
        ``to_pandas`` that maps the integer, boolean, string and float types
        of ``self.api`` to pandas nullable dtypes. ``path`` and
        ``storage_options`` are resolved by ``_get_path_or_handle``; the
        remaining ``kwargs`` are forwarded to ``read_table``.
        """
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            # types missing from this table yield None from mapping.get
            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        # the "array" data manager needs split blocks and a manager conversion
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        # the caller-supplied filesystem (if any) is replaced by the resolved one
        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            # close any handle opened above, whether or not the read succeeded
            if handles is not None:
                handles.close()

Beispiel #15

0

Datei anzeigen

Datei: hw5.py Projekt: InbarFischer/hw5

 def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
     """Add a UInt8 "score" column holding each row's truncated mean grade.

     The grades are read from columns "q1" to "q5". A row with more than
     ``maximal_nans_per_sub`` NaN grades gets a missing score; otherwise the
     score is the NaN-ignoring mean converted with ``int``. The column is
     written onto ``self.data`` itself, which is also the return value.
     """
     question_cols = ['q{0}'.format(i) for i in range(1, 6)]
     frame = self.data
     scores = []
     for _, record in frame.iterrows():
         answers = [record[name] for name in question_cols]
         missing = sum(1 for answer in answers if math.isnan(answer))
         if missing > maximal_nans_per_sub:
             scores.append(None)
             continue
         scores.append(int(np.nanmean(answers)))
     frame['score'] = scores
     frame['score'] = frame['score'].astype(pd.UInt8Dtype())
     self.data = frame
     return self.data

Beispiel #16

0

Datei anzeigen

 def test_numeric_nullable_dtypes(self):
     """Round-trip the NA-containing sample frame for each nullable dtype.

     For every (suffix, format) pair and every dtype, the sample frame is
     cast, written to a fresh temporary path with ``to_<format>`` and read
     back with ``Ind2Col2.read_<format>``. Index and column names are
     checked before writing and after reading.
     """
     # each dtype appears once; a repeated entry only repeats identical I/O
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     # record which combination failed, then propagate
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise

Beispiel #17

0

Datei anzeigen

Datei: test_dtype.py Projekt: cwegrzyn/records-mover

def test_to_pandas_dtype_integer_nullable():
    """Yield one dtype check per (min, max) integer range.

    Each yielded tuple holds the checker built by
    ``with_nullable(True, check_dtype)``, the field type "integer", the
    constraints for the range and the dtype expected for it.
    """
    expected_by_range = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for bounds, expected_pandas_type in expected_by_range.items():
        lower, upper = bounds
        constraints = RecordsSchemaFieldIntegerConstraints(
            required=True, unique=None, min_=lower, max_=upper)
        checker = with_nullable(True, check_dtype)
        yield checker, "integer", constraints, expected_pandas_type

Beispiel #18

0

Datei anzeigen

Datei: hw5.py Projekt: Stas23k/hw5

    def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
        """Calculates the average score of a subject and adds a new "score" column
        with it.

        If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
        score should be NA. Otherwise, the score is simply the mean of the other grades.
        The datatype of score is UInt8, and the floating point raw numbers should be
        rounded down.

        The grades are the columns "q1" through "q5" of the frame returned by
        ``self.read_data()``. Rows are selected with a boolean mask, so the
        result is correct for any index, not only a default 0..n-1 index.

        Parameters
        ----------
        maximal_nans_per_sub : int, optional
            Number of allowed NaNs per subject before giving a NA score.

        Returns
        -------
        pd.DataFrame
            A new DF with a new column - "score".
        """
        data_df = self.read_data()  # Put the data into a data frame
        questions_ans = data_df.loc[:, 'q1':'q5']  # The wanted columns

        too_many_nans = questions_ans.isnull().sum(axis=1) > maximal_nans_per_sub
        # mean() skips NaN grades; floor implements "rounded down"
        floored_mean_score = np.floor(questions_ans.mean(axis=1))

        # Mask by row rather than by enumerate() position: a positional
        # integer used as a label would hit the wrong row (or append a new
        # one) whenever the index is not 0..n-1.
        floored_mean_score = floored_mean_score.where(~too_many_nans)

        data_df['score'] = floored_mean_score.astype(pd.UInt8Dtype())

        return data_df

Beispiel #19

0

Datei anzeigen

Datei: utils.py Projekt: wphicks/cudf

# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

# Lookup table from pyarrow data type instances to pandas nullable extension
# dtype instances (unsigned and signed integers, boolean, string).
pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)

Beispiel #20

0

Datei anzeigen

Datei: test_series.py Projekt: miguelusque/cudf

    sliced_sr[3] = None
    assert sliced_sr.memory_usage() == 80

    sr = cudf.Series(["hello world", "rapids ai", "abc", "z"])
    assert sr.memory_usage() == 44

    assert sr[3:].memory_usage() == 9  # z
    assert sr[:1].memory_usage() == 19  # hello world


@pytest.mark.parametrize(
    "sr,expected_psr",
    [
        (
            cudf.Series([1, 2, None, 3], dtype="uint8"),
            pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
        ),
        (
            cudf.Series([23, None, None, 32], dtype="uint16"),
            pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
        ),
        (
            cudf.Series([None, 123, None, 1], dtype="uint32"),
            pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
        ),
        (
            cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
            pd.Series([234, 2323, 23432, None, None, 224],
                      dtype=pd.UInt64Dtype()),
        ),
        (

Beispiel #21

0

Datei anzeigen

def pandas_type_casting(df):
    """Cast columns of ``df`` to the dtypes recorded in ``d`` and print a memory report.

    The per-column min/max of the numeric columns (NaN filled with 0) is
    stored in the module-level global ``n``, after which ``get_cols_names``
    is called once per integer range. Object columns whose first non-null
    values parse as datetimes are treated as date columns; the remaining
    object columns with fewer than 5% distinct values are registered in
    ``d`` as "category". Every column listed in ``d`` is then cast, and a
    before/after memory summary (in MiB) is printed.

    NOTE(review): ``d`` and ``get_cols_names`` are not defined in this
    function - presumably module-level names that ``get_cols_names`` fills
    using ``n``; confirm against the rest of the module.

    Returns the modified ``df``.
    """
    import numpy as np
    import pandas as pd
    global n
    # df = pd.read_csv("users-isprep.zip")
    old = df.memory_usage() / 1024/1024

    #numeric cols
    number_cols = list(df.select_dtypes("number").columns)
    n = df[number_cols].fillna(0).agg([min,max]).T.add_suffix("_")

    get_cols_names(0, 255, pd.UInt8Dtype())
    get_cols_names(256, 65535, pd.UInt16Dtype())
    get_cols_names(65536, 4294967295, pd.UInt32Dtype())

    get_cols_names(-128, 127, pd.Int8Dtype())
    get_cols_names(-32768, 32767, pd.Int16Dtype())
    get_cols_names(-2147483648, 2147483647, pd.Int32Dtype())

    # date and catagorical datacols
    catagoriacal_cols = list(df.select_dtypes("O").columns)
    date_cols = []
    for i in catagoriacal_cols:
        x = df[i][~df[i].isna()].head()
        try:
            pd.to_datetime(x)
            date_cols.append(i)
        except Exception:
            # Not parseable as dates: keep it as a categorical candidate.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            pass
    catagoriacal_cols = [i for i in catagoriacal_cols if i not in date_cols]

    # percentage of distinct values per remaining object column
    c = df[catagoriacal_cols].apply(lambda x:x.nunique()/len(df)*100)
    for i in c[c<5].index:
        d[i] = "category"

    # del df
    # df = pd.read_csv("users-isprep.zip", parse_dates=date_cols, dtype=d)
    for i in d:
        df[i] = df[i].astype(d[i])
    new = df.memory_usage() / 1024/1024

    m = pd.DataFrame({"new" : new,
                      "old" : old,
                      "Imporovement" : old - new})
    # the first row of memory_usage() is the index, which has no column dtype
    m['Dtype'] = [None] + list(df[list(new.index.drop("Index"))].dtypes.astype(str).values)

    c = df[catagoriacal_cols].apply(lambda x:x.nunique()/len(df)*100)

    m["nunique"] = None
    m.loc[c.index, "nunique"] = list(df[c.index].apply(lambda x:x.nunique() / len(df) * 100).values)

    print("Before :", round(m.old.sum()))
    print("After  :", round(m.new.sum()))
    print("Diff   :", round(m.Imporovement.sum()))
    print("Diff % :", round(m.Imporovement.sum()/m.old.sum(), 2))

    print("\n\nImprovement:")
    print(m.groupby("Dtype").Imporovement.agg([min, max, sum, np.mean, np.median, "count"]))

    print("\n\nDetailed Summary:")
    print(m.to_string())
    return df

Beispiel #22

0

Datei anzeigen

    'float16': (parquet_thrift.Type.FLOAT, None, 16),
}

# Maps parquet_thrift.Type members to the NumPy scalar type of the same
# width (32/64-bit integers, single/double precision floats).
revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64
}

# Maps each pandas nullable (optional) dtype instance to the NumPy scalar
# type with the same width and signedness.
pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    # np.bool_ is the NumPy boolean scalar type; the bare np.bool alias was
    # removed in NumPy 1.24 and raises AttributeError there.
    pd.BooleanDtype(): np.bool_
}


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """ Get appropriate typecodes for column dtype

    Data conversion do not happen here, see convert().

    The user is expected to transform their data into the appropriate dtype
    before saving to parquet, we will not make any assumptions for them.

Beispiel #23

0

Datei anzeigen

    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

# Default mapping between SQL types and python types
# for values
_SQL_TO_PYTHON_SCALARS = {
    "DOUBLE": np.float64,
    "FLOAT": np.float32,
    "DECIMAL": np.float32,
    "BIGINT": np.int64,
    "INTEGER": np.int32,

Beispiel #24

0

Datei anzeigen

Datei: dtypes.py Projekt: mikest18/cudf

    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

# Lookup table from NumPy dtype instances to pandas nullable extension dtype
# instances; the NumPy "object" dtype is mapped to the pandas string dtype.
cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),

Beispiel #25

0

Datei anzeigen

Datei: pandas_engine.py Projekt: pandera-dev/pandera

class UINT8(UINT16):
    """Semantic representation of a :class:`pandas.UInt8Dtype`.

    Subclasses ``UINT16`` and overrides the wrapped pandas dtype and the bit
    width.
    """

    # pandas nullable dtype instance wrapped by this class
    type = pd.UInt8Dtype()
    # width of the integer in bits
    bit_width: int = 8

Beispiel #26

0

Datei anzeigen

Datei: pandas_engine.py Projekt: pandera-dev/pandera

    """Semantic representation of a :class:`pandas.UInt32Dtype`."""

    type = pd.UInt32Dtype()
    bit_width: int = 32


# Registered with the engine under both the pandas dtype class and an
# instance of it.
@Engine.register_dtype(equivalents=[pd.UInt16Dtype, pd.UInt16Dtype()])
@immutable
class UINT16(UINT32):
    """Semantic representation of a :class:`pandas.UInt16Dtype`.

    Subclasses ``UINT32`` and overrides the wrapped pandas dtype and the bit
    width.
    """

    # pandas nullable dtype instance wrapped by this class
    type = pd.UInt16Dtype()
    # width of the integer in bits
    bit_width: int = 16


# Registered with the engine under both the pandas dtype class and an
# instance of it.
@Engine.register_dtype(equivalents=[pd.UInt8Dtype, pd.UInt8Dtype()])
@immutable
class UINT8(UINT16):
    """Semantic representation of a :class:`pandas.UInt8Dtype`.

    Subclasses ``UINT16`` and overrides the wrapped pandas dtype and the bit
    width.
    """

    # pandas nullable dtype instance wrapped by this class
    type = pd.UInt8Dtype()
    # width of the integer in bits
    bit_width: int = 8


# ###############################################################################
# # float
# ###############################################################################

_register_numpy_numbers(
    builtin_name="float",
    pandera_name="Float",

Beispiel #27

0

Datei anzeigen

    np.longlong: pa.int64(),
    np.int32: pa.int32(),
    np.int16: pa.int16(),
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

# Lookup from a NumPy dtype to the pandas extension dtype of the same kind
# and width; the NumPy "object" dtype is paired with pandas' string dtype.
cudf_dtypes_to_pandas_dtypes = {
    np.dtype(numpy_name): pandas_dtype
    for numpy_name, pandas_dtype in (
        ("uint8", pd.UInt8Dtype()),
        ("uint16", pd.UInt16Dtype()),
        ("uint32", pd.UInt32Dtype()),
        ("uint64", pd.UInt64Dtype()),
        ("int8", pd.Int8Dtype()),
        ("int16", pd.Int16Dtype()),
        ("int32", pd.Int32Dtype()),
        ("int64", pd.Int64Dtype()),
        ("bool_", pd.BooleanDtype()),
        ("object", pd.StringDtype()),
    )
}

# String names of the numeric dtypes, grouped by kind.
SIGNED_INTEGER_TYPES = {
    "int8",
    "int16",
    "int32",
    "int64",
}
UNSIGNED_TYPES = {
    "uint8",
    "uint16",
    "uint32",
    "uint64",
}
FLOAT_TYPES = {
    "float32",
    "float64",
}
# Every integer dtype name, regardless of signedness.
INTEGER_TYPES = SIGNED_INTEGER_TYPES.union(UNSIGNED_TYPES)

Beispiel #28

0

Datei anzeigen

    parquet_thrift.ConvertedType.INT_8: np.dtype("int8"),
    parquet_thrift.ConvertedType.INT_16: np.dtype("int16"),
    parquet_thrift.ConvertedType.INT_32: np.dtype('int32'),
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}
# Lookup from a NumPy integer/bool dtype to the pandas extension dtype of
# the same kind and width.
nullable = {
    np.dtype(numpy_name): pandas_dtype
    for numpy_name, pandas_dtype in (
        ('int8', pd.Int8Dtype()),
        ('int16', pd.Int16Dtype()),
        ('int32', pd.Int32Dtype()),
        ('int64', pd.Int64Dtype()),
        ('uint8', pd.UInt8Dtype()),
        ('uint16', pd.UInt16Dtype()),
        ('uint32', pd.UInt32Dtype()),
        ('uint64', pd.UInt64Dtype()),
        ('bool', pd.BooleanDtype()),
    )
}
pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
    "UInt16": pd.UInt16Dtype(),
    "UInt32": pd.UInt32Dtype(),
    "UInt64": pd.UInt64Dtype(),
    "boolean": pd.BooleanDtype()

Beispiel #29

0

Datei anzeigen

    "int64": "Int64",
    "int32": "Int32",
    "int16": "Int16",
    "int8": "Int8",
    "datetime64[D]": "Date",
    "datetime64[ns]": "DateTime",
}

# Derived from MAPPING via keymap with np.dtype; presumably a copy of MAPPING
# whose keys have been converted to np.dtype objects.
# NOTE(review): keymap/itemmap look like the toolz dict helpers — confirm
# against this module's imports.
PD2CH = keymap(np.dtype, MAPPING)

# pandas integer extension dtypes, signed and unsigned, 8 to 64 bits wide.
PD_INT_TYPES = [
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.Int64Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype(),
    pd.UInt32Dtype(),
    pd.UInt64Dtype(),
]

# Add one PD2CH entry per dtype above, valued "Nullable(<dtype name>)".
for typ in PD_INT_TYPES:
    PD2CH[typ] = f"Nullable({typ.name})"

# Derived from MAPPING by passing its items through reversed; presumably the
# value -> key inverse of MAPPING — TODO confirm.
CH2PD = itemmap(reversed, MAPPING)
# Both "Null" and "Nothing" are mapped to the "object" dtype name.
CH2PD["Null"] = "object"
CH2PD["Nothing"] = "object"

NULLABLE_COLS = [
    "Float64",
    "Float32",

Beispiel #30

0

Datei anzeigen

Datei: test_column.py Projekt: rongou/cudf

        ([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], {}, {},),
    ],
)
def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
    """Check that a column built from a pyarrow array matches one built from the raw data."""
    # Column under test: convert ``data`` to a pyarrow array first.
    arrow_array = pa.array(data, **pyarrow_kwargs)
    column_via_arrow = as_column(arrow_array)
    # Reference column: built straight from ``data``.
    reference_column = as_column(data, **cudf_kwargs)
    # Wrap both columns in Series before comparing.
    actual_series = cudf.Series(column_via_arrow)
    reference_series = cudf.Series(reference_column)
    assert_eq(actual_series, reference_series)


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]