Example #1
    def test_df2wav(self):
        sid = get_sid(pal=True)
        test_wav = os.path.join(self.tmpdir.name, 'test.wav')
        transformer = sox.Transformer()

        for i in range(1, 5):
            test_raw_freq = int(i * 2048)
            test_real_freq = sid.real_sid_freq(test_raw_freq)
            df = pd.DataFrame([{
                'hashid': 1,
                'count': 1,
                'clock': 0,
                'freq1': test_raw_freq,
                'sus1': 15,
                'gate1': 1,
                'tri1': 1,
                'vol': 15
            }, {
                'hashid': 1,
                'count': 1,
                'clock': 1e6 * 10,
                'gate1': 0
            }],
                              dtype=pd.UInt64Dtype()).set_index('clock')
            df = df.fillna(method='ffill').astype(pd.UInt64Dtype())
            write_wav(test_wav, sid, state2samples(df, sid))
            power_df = pd.DataFrame(transformer.power_spectrum(test_wav),
                                    columns=['freq', 'val'])
            val_max = power_df['val'].max()
            freq_max = power_df[power_df['val'] == val_max].iloc[0]['freq']
            freq_diff = abs(freq_max - test_real_freq)
            self.assertLessEqual(freq_diff, 3)
Example #2
    def test_reductions_2d_axis0(self, data, method, request):
        if not hasattr(data, method):
            pytest.skip("test is not applicable for this type/dtype")

        arr2d = data.reshape(1, -1)

        kwargs = {}
        if method == "std":
            # pass ddof=0 so we get all-zero std instead of all-NA std
            kwargs["ddof"] = 0

        try:
            result = getattr(arr2d, method)(axis=0, **kwargs)
        except Exception as err:
            try:
                getattr(data, method)()
            except Exception as err2:
                assert type(err) == type(err2)
                return
            else:
                raise AssertionError("Both reductions should raise or neither")

        if method in ["mean", "median", "sum", "prod"]:
            # std and var are not dtype-preserving
            expected = data
            if method in ["sum", "prod"] and data.dtype.kind in "iub":
                # FIXME: kludge
                if data.dtype.kind in ["i", "b"]:
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.Int32Dtype()
                        else:
                            dtype = pd.Int64Dtype()
                    else:
                        dtype = pd.Int64Dtype()
                elif data.dtype.kind == "u":
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.UInt32Dtype()
                        else:
                            dtype = pd.UInt64Dtype()
                    else:
                        dtype = pd.UInt64Dtype()

                expected = data.astype(dtype)
                if data.dtype.kind == "b" and method in ["sum", "prod"]:
                    # We get IntegerArray instead of BooleanArray
                    pass
                else:
                    assert type(expected) == type(data), type(expected)
                assert dtype == expected.dtype

            self.assert_extension_array_equal(result, expected)
        elif method == "std":
            self.assert_extension_array_equal(result, data - data)
Example #3
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
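A minimal usage sketch (not part of the snippet above; assumes pyarrow is imported as pa and pandas as pd): because the converter maps a pa.DataType to either a nullable extension dtype or None, it can be passed directly as the types_mapper callback of pyarrow's Table.to_pandas.

import pyarrow as pa

table = pa.table({"a": pa.array([1, None, 3], type=pa.int32())})
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)
print(df.dtypes)  # a: Int32 (nullable, so the None survives as pd.NA)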
Example #4
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
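A hedged usage sketch (IntegerType members taken from the snippet above, pandas imported as pd): resolve the mapping once, then look up a logical type to get a concrete dtype.

mapping = integer_type_mapping(use_extension_types=True)
dtype = mapping[IntegerType.UINT16]            # pd.UInt16Dtype()
series = pd.Series([1, None, 3], dtype=dtype)  # None is stored as pd.NA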
Example #5
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
Example #6
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # build the mapping in two steps; dict.update() mutates in place and
        # returns None, so it cannot be chained onto the comprehension
        self._BOOL_MAP_DICT = {i: True for i in BOOL_STRINGS_TRUE}
        self._BOOL_MAP_DICT.update({i: False for i in BOOL_STRINGS_FALSE})
        self._DTYPE_BOOL_BASE = np.bool_
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
Example #7
    def test_notest_ssf(self):
        df = pd.DataFrame(
            [{'hashid': 1, 'count': 1, 'clock': 0, 'freq1': 1024,
              'pwduty1': 0, 'atk1': 0, 'dec1': 0, 'sus1': 15, 'rel1': 0,
              'gate1': 1, 'sync1': 0, 'ring1': 0, 'test1': 0, 'tri1': 1,
              'saw1': 0, 'pulse1': 0, 'noise1': 0, 'flt1': 0, 'fltres': 0,
              'fltcoff': 0, 'fltlo': 0, 'fltband': 0, 'flthi': 0, 'vol': 15},
             {'hashid': 1, 'count': 1, 'clock': 1e5, 'gate1': 0}],
            dtype=pd.UInt64Dtype())
        s = self._df2ssf(df, percussion=True)
        self.assertEqual(s.waveforms, {'tri'})
        self.assertEqual(s.midi_pitches, (35,))
        self.assertEqual(s.total_duration, 98525)
        self.assertEqual(s.midi_notes, ((0, 35, 98525, 127, 60.134765625),))
Example #8
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #9
def set_sid_dtype(df):
    # default every column to nullable UInt64, then downcast the known
    # register columns below
    df = df.astype(pd.UInt64Dtype())
    for col in df.columns:
        if col.startswith('freq') or col.startswith(
                'pwduty') or col == 'fltcoff':
            col_type = pd.UInt16Dtype()
        elif col[-1].isdigit() or col.startswith('flt'):
            col_type = pd.UInt8Dtype()
        else:
            continue
        df[col] = df[col].astype(col_type)
    return df
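An illustrative call (column names invented to exercise each branch): 'freq1' matches the 16-bit prefix rule, 'gate1' ends in a digit so it is downcast to 8 bits, and 'clock' falls through unchanged.

df = pd.DataFrame({'freq1': [1024], 'gate1': [1], 'clock': [0]},
                  dtype=pd.UInt64Dtype())
df = set_sid_dtype(df)
print(df.dtypes)  # freq1: UInt16, gate1: UInt8, clock: UInt64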
Example #10
    def test_numeric_dtypes(self):
        dtypes = [
            bool,
            np.byte,
            np.ubyte,
            np.short,
            np.ushort,
            np.single,
            np.int32,
            np.intc,
            np.half,
            np.float16,
            np.double,
            np.float64,
            pd.StringDtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
        ]
        for suffix, fn in [
            (".snappy", "parquet"),
            (".feather", "feather"),
            (".xml", "xml"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
        ]:
            with tmpfile(suffix) as path:
                for dtype in dtypes:
                    try:
                        df = Ind2Col2.convert(Ind2Col2(
                            sample_data_ind2_col2())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise
Example #11
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #12
def treenode_df(response: List[List[Any]]) -> pd.DataFrame:
    return lol_to_df(
        response,
        ["id", "parent", "user", "x", "y", "z", "radius", "confidence"],
        [
            np.uint64,
            pd.UInt64Dtype(),
            np.uint64,
            np.float64,
            np.float64,
            np.float64,
            np.float64,
            np.uint8,
        ],
    )
Example #13
    def test_numeric_nullable_dtypes(self):
        dtypes = [
            pd.StringDtype(),
            pd.BooleanDtype(),
            pd.Float64Dtype(),
            pd.Float32Dtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
            pd.StringDtype(),
        ]
        # TODO: Re-add (".xml", "xml"),
        # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
        for suffix, fn in [
            (".snappy", "parquet"),
            (".feather", "feather"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
        ]:
            # TODO: include xml
            for dtype in dtypes:
                with tmpfile(suffix) as path:
                    try:
                        df = Ind2Col2.convert(
                            Ind2Col2(
                                sample_data_ind2_col2_pd_na())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise
Example #14
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(required=True,
                                                           unique=None,
                                                           min_=min_,
                                                           max_=max_)
        yield with_nullable(
            True, check_dtype), "integer", constraints, expected_pandas_type
Example #15
revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64
}

pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    pd.BooleanDtype(): np.bool_
}
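A small sketch of how such a map is typically consumed (helper name invented): nullable pandas dtype instances hash and compare equal, so a plain dict lookup with a fallback degrades them to their numpy base types.

def to_numpy_type(dtype):
    # invented helper: fall back to the input when it is already numpy-native
    return pdoptional_to_numpy_typemap.get(dtype, dtype)

assert to_numpy_type(pd.UInt16Dtype()) is np.uint16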


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """ Get appropriate typecodes for column dtype

    Data conversion does not happen here; see convert().

    The user is expected to transform their data into the appropriate dtype
    before saving to parquet; we will not make any assumptions for them.

    Known types that cannot be represented (must first be converted to another
    type or to raw binary): float128, complex
Example #16
from dask_sql.java import SqlTypeName

# Default mapping between python types and SQL types
_PYTHON_TO_SQL = {
    np.float64: SqlTypeName.DOUBLE,
    np.float32: SqlTypeName.FLOAT,
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}
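For orientation (a sketch, not part of dask_sql's API): the dict is keyed on a mix of numpy scalar types and pandas extension-dtype instances; since extension dtypes hash equal across instances, a lookup with a freshly constructed dtype works.

sql_type = _PYTHON_TO_SQL[pd.Int64Dtype()]  # SqlTypeName.BIGINT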

# Default mapping between SQL types and python types
# for values
Example #17
class MicrosoftAcademicGraph:

    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + ".txt"

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            nullable = fieldtype.endswith("?")
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == "DateTime":
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(
            filepath_or_buffer=self.get_full_path(stream_name),
            parse_dates=date_columns,
            low_memory=False,
            names=column_name,
            dtype=column_type,
            date_parser=self.date_parse_func,
            sep="\t",
        )

    # date parse function (sample value: 6/24/2016 12:00:00 AM); %I is the
    # 12-hour directive, required for the %p (AM/PM) marker to take effect
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format="%m/%d/%Y %I:%M:%S %p", errors="coerce"
    )

    # convert input datatype to Pandas datatype
    datatypedict = {
        "int": pd.Int32Dtype(),
        "uint": pd.UInt32Dtype(),
        "long": pd.Int64Dtype(),
        "ulong": pd.UInt64Dtype(),
        "float": np.float32,
        "string": np.string_,
        "DateTime": np.string_,
    }

    # define stream dictionary
    streams = {
        "Affiliations": [
            "AffiliationId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "GridId:string",
            "OfficialPage:string",
            "WikiPage:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "Latitude:float?",
            "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "Authors": [
            "AuthorId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "LastKnownAffiliationId:long?",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "ConferenceInstances": [
            "ConferenceInstanceId:long",
            "NormalizedName:string",
            "DisplayName:string",
            "ConferenceSeriesId:long",
            "Location:string",
            "OfficialUrl:string",
            "StartDate:DateTime?",
            "EndDate:DateTime?",
            "AbstractRegistrationDate:DateTime?",
            "SubmissionDeadlineDate:DateTime?",
            "NotificationDueDate:DateTime?",
            "FinalVersionDueDate:DateTime?",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "Latitude:float?",
            "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "ConferenceSeries": [
            "ConferenceSeriesId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "EntityRelatedEntities": [
            "EntityId:long",
            "EntityType:string",
            "RelatedEntityId:long",
            "RelatedEntityType:string",
            "RelatedType:int",
            "Score:float",
        ],
        "FieldOfStudyChildren":
        ["FieldOfStudyId:long", "ChildFieldOfStudyId:long"],
        "FieldOfStudyExtendedAttributes": [
            "FieldOfStudyId:long",
            "AttributeType:int",
            "AttributeValue:string",
        ],
        "FieldsOfStudy": [
            "FieldOfStudyId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "MainType:string",
            "Level:int",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "Journals": [
            "JournalId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "Issn:string",
            "Publisher:string",
            "Webpage:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "PaperAbstractsInvertedIndex":
        ["PaperId:long", "IndexedAbstract:string"],
        "PaperAuthorAffiliations": [
            "PaperId:long",
            "AuthorId:long",
            "AffiliationId:long?",
            "AuthorSequenceNumber:uint",
            "OriginalAuthor:string",
            "OriginalAffiliation:string",
        ],
        "PaperCitationContexts": [
            "PaperId:long",
            "PaperReferenceId:long",
            "CitationContext:string",
        ],
        "PaperExtendedAttributes": [
            "PaperId:long",
            "AttributeType:int",
            "AttributeValue:string",
        ],
        "PaperFieldsOfStudy":
        ["PaperId:long", "FieldOfStudyId:long", "Score:float"],
        "PaperRecommendations": [
            "PaperId:long",
            "RecommendedPaperId:long",
            "Score:float",
        ],
        "PaperReferences": ["PaperId:long", "PaperReferenceId:long"],
        "PaperResources": [
            "PaperId:long",
            "ResourceType:int",
            "ResourceUrl:string",
            "SourceUrl:string",
            "RelationshipType:int",
        ],
        "PaperUrls": [
            "PaperId:long",
            "SourceType:int?",
            "SourceUrl:string",
            "LanguageCode:string",
        ],
        "Papers": [
            "PaperId:long",
            "Rank:uint",
            "Doi:string",
            "DocType:string",
            "PaperTitle:string",
            "OriginalTitle:string",
            "BookTitle:string",
            "Year:int?",
            "Date:DateTime?",
            "OnlineDate:DateTime?",
            "Publisher:string",
            "JournalId:long?",
            "ConferenceSeriesId:long?",
            "ConferenceInstanceId:long?",
            "Volume:string",
            "Issue:string",
            "FirstPage:string",
            "LastPage:string",
            "ReferenceCount:long",
            "CitationCount:long",
            "EstimatedCitation:long",
            "OriginalVenue:string",
            "FamilyId:long?",
            "CreatedDate:DateTime",
        ],
        "RelatedFieldOfStudy": [
            "FieldOfStudyId1:long",
            "Type1:string",
            "FieldOfStudyId2:long",
            "Type2:string",
            "Rank:float",
        ],
    }
Example #18
class MicrosoftAcademicGraph:

    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + '.txt'

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(':')
            nullable = fieldtype.endswith('?')
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == 'DateTime':
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(':')
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(filepath_or_buffer=self.get_full_path(stream_name),
                           parse_dates=date_columns,
                           low_memory=False,
                           names=column_name,
                           dtype=column_type,
                           date_parser=self.date_parse_func,
                           sep='\t')

    # date parse function (sample value: 6/24/2016 12:00:00 AM); %I is the
    # 12-hour directive, required for the %p (AM/PM) marker to take effect
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format='%m/%d/%Y %I:%M:%S %p', errors='coerce'
    )

    # convert input datatype to Pandas datatype
    datatypedict = {
        'int': pd.Int32Dtype(),
        'uint': pd.UInt32Dtype(),
        'long': pd.Int64Dtype(),
        'ulong': pd.UInt64Dtype(),
        'float': np.float32,
        'string': np.string_,
        'DateTime': np.string_,
    }

    # define stream dictionary
    streams = {
        'Affiliations': [
            'AffiliationId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'GridId:string', 'OfficialPage:string',
            'WikiPage:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'Latitude:float?', 'Longitude:float?',
            'CreatedDate:DateTime'
        ],
        'Authors': [
            'AuthorId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'LastKnownAffiliationId:long?',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'CreatedDate:DateTime'
        ],
        'ConferenceInstances': [
            'ConferenceInstanceId:long', 'NormalizedName:string',
            'DisplayName:string', 'ConferenceSeriesId:long', 'Location:string',
            'OfficialUrl:string', 'StartDate:DateTime?', 'EndDate:DateTime?',
            'AbstractRegistrationDate:DateTime?',
            'SubmissionDeadlineDate:DateTime?',
            'NotificationDueDate:DateTime?', 'FinalVersionDueDate:DateTime?',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'Latitude:float?', 'Longitude:float?', 'CreatedDate:DateTime'
        ],
        'ConferenceSeries': [
            'ConferenceSeriesId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'CreatedDate:DateTime'
        ],
        'EntityRelatedEntities': [
            'EntityId:long', 'EntityType:string', 'RelatedEntityId:long',
            'RelatedEntityType:string', 'RelatedType:int', 'Score:float'
        ],
        'FieldOfStudyChildren':
        ['FieldOfStudyId:long', 'ChildFieldOfStudyId:long'],
        'FieldOfStudyExtendedAttributes':
        ['FieldOfStudyId:long', 'AttributeType:int', 'AttributeValue:string'],
        'FieldsOfStudy': [
            'FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'MainType:string', 'Level:int',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'CreatedDate:DateTime'
        ],
        'Journals': [
            'JournalId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'Issn:string', 'Publisher:string',
            'Webpage:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'CreatedDate:DateTime'
        ],
        'PaperAbstractsInvertedIndex':
        ['PaperId:long', 'IndexedAbstract:string'],
        'PaperAuthorAffiliations': [
            'PaperId:long', 'AuthorId:long', 'AffiliationId:long?',
            'AuthorSequenceNumber:uint', 'OriginalAuthor:string',
            'OriginalAffiliation:string'
        ],
        'PaperCitationContexts':
        ['PaperId:long', 'PaperReferenceId:long', 'CitationContext:string'],
        'PaperExtendedAttributes':
        ['PaperId:long', 'AttributeType:int', 'AttributeValue:string'],
        'PaperFieldsOfStudy':
        ['PaperId:long', 'FieldOfStudyId:long', 'Score:float'],
        'PaperRecommendations':
        ['PaperId:long', 'RecommendedPaperId:long', 'Score:float'],
        'PaperReferences': ['PaperId:long', 'PaperReferenceId:long'],
        'PaperResources': [
            'PaperId:long', 'ResourceType:int', 'ResourceUrl:string',
            'SourceUrl:string', 'RelationshipType:int'
        ],
        'PaperUrls': [
            'PaperId:long', 'SourceType:int?', 'SourceUrl:string',
            'LanguageCode:string'
        ],
        'Papers': [
            'PaperId:long', 'Rank:uint', 'Doi:string', 'DocType:string',
            'PaperTitle:string', 'OriginalTitle:string', 'BookTitle:string',
            'Year:int?', 'Date:DateTime?', 'OnlineDate:DateTime?',
            'Publisher:string', 'JournalId:long?', 'ConferenceSeriesId:long?',
            'ConferenceInstanceId:long?', 'Volume:string', 'Issue:string',
            'FirstPage:string', 'LastPage:string', 'ReferenceCount:long',
            'CitationCount:long', 'EstimatedCitation:long',
            'OriginalVenue:string', 'FamilyId:long?', 'CreatedDate:DateTime'
        ],
        'RelatedFieldOfStudy': [
            'FieldOfStudyId1:long', 'Type1:string', 'FieldOfStudyId2:long',
            'Type2:string', 'Rank:float'
        ],
    }
Example #19
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
Example #20
 (
     cudf.Series([1, 2, None, 3], dtype="uint8"),
     pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
 ),
 (
     cudf.Series([23, None, None, 32], dtype="uint16"),
     pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
 ),
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series([234, 2323, 23432, None, None, 224],
               dtype=pd.UInt64Dtype()),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series([11, None, 22, 33, None, 2, None, 3],
               dtype=pd.Int32Dtype()),
 ),
 (
Example #21
def generate_problems_df(runs_df,
                         probe_runs_df=None,
                         problem_paths=None,
                         problem_base_path=None,
                         custom_runs_df=None):
    if problem_paths is None:
        problem_paths = runs_df.problem_path
    elif problem_base_path is not None:
        problem_paths = [
            os.path.relpath(problem_path, problem_base_path)
            for problem_path in problem_paths
        ]
    problem_paths = pd.Index(problem_paths).drop_duplicates()
    problems_df = pd.DataFrame(index=problem_paths)
    problems_df.index.name = 'problem_path'
    # Merge probe run results into `problems_df`
    if probe_runs_df is not None:
        problems_df = problems_df.join(probe_runs_df[[
            'problem_path', 'predicates_count', 'functions_count',
            'clauses_count'
        ]].drop_duplicates('problem_path').set_index('problem_path'),
                                       rsuffix='probe')
    # Random solve run stats
    problem_groups = runs_df.groupby(['problem_path'])
    problems_df = problems_df.join(problem_groups.size().astype(
        pd.UInt64Dtype()).to_frame('n_total'))
    problems_df = problems_df.join(
        runs_df[runs_df.status == 'completed'].groupby([
            'problem_path'
        ]).size().astype(pd.UInt64Dtype()).to_frame('n_completed'))
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).size().astype(pd.UInt64Dtype()).to_frame('n_exit_0'))
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 1].groupby(
        ['problem_path']).size().astype(pd.UInt64Dtype()).to_frame('n_exit_1'))
    if 'termination_reason' in runs_df:
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Refutation'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_refutation'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Satisfiable'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_satisfiable'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Time limit'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_time_limit'))
    problems_df.fillna(
        {
            'n_total': 0,
            'n_completed': 0,
            'n_exit_0': 0,
            'n_exit_1': 0,
            'n_refutation': 0,
            'n_satisfiable': 0,
            'n_time_limit': 0
        },
        inplace=True)

    def variation(a):
        if (a == 0).all():
            # We need to handle this special case explicitly because `scipy.stats.variation` raises an exception on it.
            return 0
        res = scipy.stats.variation(a.astype(float), nan_policy='omit')
        if isinstance(res, np.ma.core.MaskedConstant):
            # The input array contains all nans.
            return np.nan
        return res

    agg_functions = [np.mean, np.std, variation, np.min, np.max]
    # Aggregate time measurements across successful runs
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).agg({
            field_name: agg_functions
            for field_name in [
                'time_elapsed_process', 'time_elapsed_vampire',
                'saturation_iterations'
            ]
        }))
    # Count unique numbers of saturation iterations across successful runs
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).agg({
            'saturation_iterations': ['nunique']
        }).astype(pd.UInt64Dtype()))
    # Aggregate memory across all runs
    problems_df = problems_df.join(
        runs_df.groupby(['problem_path']).agg({'memory_used': agg_functions}))
    if custom_runs_df is not None:
        for name, value in custom_runs_df.groupby(['name']):
            value = value.set_index('problem_path')
            value = value[['exit_code', 'saturation_iterations']]
            # https://stackoverflow.com/a/40225796/4054250
            value.columns = pd.MultiIndex.from_product([[name], value.columns])
            problems_df = problems_df.join(value, rsuffix=name)
    problems_df.sort_index(inplace=True)
    return problems_df
Example #22
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
Example #23
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}
nullable = {
    np.dtype('int8'): pd.Int8Dtype(),
    np.dtype('int16'): pd.Int16Dtype(),
    np.dtype('int32'): pd.Int32Dtype(),
    np.dtype('int64'): pd.Int64Dtype(),
    np.dtype('uint8'): pd.UInt8Dtype(),
    np.dtype('uint16'): pd.UInt16Dtype(),
    np.dtype('uint32'): pd.UInt32Dtype(),
    np.dtype('uint64'): pd.UInt64Dtype(),
    np.dtype('bool'): pd.BooleanDtype()
}
pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
    "UInt16": pd.UInt16Dtype(),
    "UInt32": pd.UInt32Dtype(),
    "UInt64": pd.UInt64Dtype(),
    "boolean": pd.BooleanDtype()
}

Example #24
def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
    pyarrow_data = pa.array(data, **pyarrow_kwargs)
    cudf_from_pyarrow = as_column(pyarrow_data)
    expected = as_column(data, **cudf_kwargs)
    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]
Example #25
 (
     cudf.Series([1, 2, None, 3], dtype="uint8"),
     pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
 ),
 (
     cudf.Series([23, None, None, 32], dtype="uint16"),
     pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
 ),
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series(
         [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()
     ),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series(
         [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()
     ),
Example #26
import datetime

import numpy as np
import pandas as pd

from .librdata import Writer
from .custom_errors import PyreadrError


# configuration

int_types = {np.dtype('int32'), np.dtype('int16'), np.dtype('int8'), np.dtype('uint8'), np.dtype('uint16'),
             np.int32, np.int16, np.int8, np.uint8, np.uint16}
int_mixed_types = {pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype()}
float_types = {np.dtype('int64'), np.dtype('uint64'), np.dtype('uint32'), np.dtype('float'),
               np.int64, np.uint64, np.uint32, float, pd.Int64Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()}
datetime_types = {datetime.datetime, np.datetime64}

pyreadr_to_librdata_types = {"INTEGER": "INTEGER", "NUMERIC": "NUMERIC",
                        "LOGICAL": "LOGICAL", "CHARACTER": "CHARACTER",
                        "OBJECT": "CHARACTER", "DATE": "CHARACTER",
                        "DATETIME":"CHARACTER"}
                        
librdata_min_integer = -2147483648


def get_pyreadr_column_types(df):
    """
    From a pandas data frame, get an OrderedDict with column name as key
    and pyreadr column type as value, plus a list of booleans indicating
    whether each column has missing values (np.nan).
Example #27
    type = pd.Int8Dtype()
    bit_width: int = 8


###############################################################################
# unsigned integer
###############################################################################

_register_numpy_numbers(
    builtin_name="uint",
    pandera_name="UInt",
    sizes=[64, 32, 16, 8],
)


@Engine.register_dtype(equivalents=[pd.UInt64Dtype, pd.UInt64Dtype()])
@immutable
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64


@Engine.register_dtype(equivalents=[pd.UInt32Dtype, pd.UInt32Dtype()])
@immutable
class UINT32(UINT64):
    """Semantic representation of a :class:`pandas.UInt32Dtype`."""

    type = pd.UInt32Dtype()
    bit_width: int = 32
Example #28
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64
Example #29
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
Example #30
    "int8": "Int8",
    "datetime64[D]": "Date",
    "datetime64[ns]": "DateTime",
}

PD2CH = keymap(np.dtype, MAPPING)

PD_INT_TYPES = [
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.Int64Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype(),
    pd.UInt32Dtype(),
    pd.UInt64Dtype(),
]

for typ in PD_INT_TYPES:
    PD2CH[typ] = f"Nullable({typ.name})"

CH2PD = itemmap(reversed, MAPPING)
CH2PD["Null"] = "object"
CH2PD["Nothing"] = "object"

NULLABLE_COLS = [
    "Float64",
    "Float32",
    "String",
]