Code example #1
File: test_sidwav.py (project: anarkiwi/desidulate)
    def test_df2wav(self):
        sid = get_sid(pal=True)
        test_wav = os.path.join(self.tmpdir.name, 'test.wav')
        transformer = sox.Transformer()

        for i in range(1, 5):
            test_raw_freq = int(i * 2048)
            test_real_freq = sid.real_sid_freq(test_raw_freq)
            df = pd.DataFrame([{
                'hashid': 1,
                'count': 1,
                'clock': 0,
                'freq1': test_raw_freq,
                'sus1': 15,
                'gate1': 1,
                'tri1': 1,
                'vol': 15
            }, {
                'hashid': 1,
                'count': 1,
                'clock': 1e6 * 10,
                'gate1': 0
            }],
                              dtype=pd.UInt64Dtype()).set_index('clock')
            df = df.fillna(method='ffill').astype(pd.UInt64Dtype())
            write_wav(test_wav, sid, state2samples(df, sid))
            power_df = pd.DataFrame(transformer.power_spectrum(test_wav),
                                    columns=['freq', 'val'])
            val_max = power_df['val'].max()
            freq_max = power_df[power_df['val'] == val_max].iloc[0]['freq']
            freq_diff = abs(freq_max - test_real_freq)
            self.assertLessEqual(freq_diff, 3)
Code example #2
File: dim2.py (project: Varun270/pandas)
    def test_reductions_2d_axis0(self, data, method, request):
        if not hasattr(data, method):
            pytest.skip("test is not applicable for this type/dtype")

        arr2d = data.reshape(1, -1)

        kwargs = {}
        if method == "std":
            # pass ddof=0 so we get all-zero std instead of all-NA std
            kwargs["ddof"] = 0

        try:
            result = getattr(arr2d, method)(axis=0, **kwargs)
        except Exception as err:
            try:
                getattr(data, method)()
            except Exception as err2:
                assert type(err) == type(err2)
                return
            else:
                raise AssertionError("Both reductions should raise or neither")

        if method in ["mean", "median", "sum", "prod"]:
            # std and var are not dtype-preserving
            expected = data
            if method in ["sum", "prod"] and data.dtype.kind in "iub":
                # FIXME: kludge
                if data.dtype.kind in ["i", "b"]:
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.Int32Dtype()
                        else:
                            dtype = pd.Int64Dtype()
                    else:
                        dtype = pd.Int64Dtype()
                elif data.dtype.kind == "u":
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.UInt32Dtype()
                        else:
                            dtype = pd.UInt64Dtype()
                    else:
                        dtype = pd.UInt64Dtype()

                expected = data.astype(dtype)
                if data.dtype.kind == "b" and method in ["sum", "prod"]:
                    # We get IntegerArray instead of BooleanArray
                    pass
                else:
                    assert type(expected) == type(data), type(expected)
                assert dtype == expected.dtype

            self.assert_extension_array_equal(result, expected)
        elif method == "std":
            self.assert_extension_array_equal(result, data - data)
Code example #3
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
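This function has exactly the signature pyarrow expects for the types_mapper argument of Table.to_pandas (a pa.DataType in, an extension dtype or None out), so it can be passed directly. A minimal usage sketch, with a hypothetical input file:

import pyarrow.parquet as pq

table = pq.read_table("data.parquet")  # hypothetical file
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)  # None falls back to defaults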
Code example #4
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
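Note that both 24-bit cases borrow wider targets; a quick check (assuming numpy as np, as above) confirms they fit:

import numpy as np

assert (1 << 24) - 1 <= np.iinfo(np.int32).max   # UINT24 max: 16_777_215
assert -(1 << 23) >= np.iinfo(np.int32).min      # INT24 min: -8_388_608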
Code example #5
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
Code example #6
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # dict.update() returns None, so merge the two dicts instead
        self._BOOL_MAP_DICT = {
            **{i: True for i in BOOL_STRINGS_TRUE},
            **{i: False for i in BOOL_STRINGS_FALSE},
        }
        self._DTYPE_BOOL_BASE = np.bool_
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
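The parallel _DTYPES_* and _LIMIT_* lists suggest a range scan to pick the narrowest nullable dtype for a value; a hedged sketch of such a helper (method name hypothetical, not part of the original class):

    def _smallest_uint_dtype(self, max_value):
        # walk the parallel limit/dtype lists defined in __init__
        for high, dtype in zip(self._LIMIT_HIGH_UINT, self._DTYPES_UINT_NULLABLE):
            if max_value <= high:
                return dtype
        raise OverflowError(f"{max_value} exceeds uint64")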
Code example #7
File: test_ssf.py (project: anarkiwi/desidulate)
 def test_notest_ssf(self):
     df = pd.DataFrame(
         [{'hashid': 1, 'count': 1, 'clock': 0, 'freq1': 1024, 'pwduty1': 0, 'atk1': 0, 'dec1': 0, 'sus1': 15, 'rel1': 0, 'gate1': 1, 'sync1': 0, 'ring1': 0, 'test1': 0, 'tri1': 1, 'saw1': 0, 'pulse1': 0, 'noise1': 0, 'flt1': 0, 'fltres': 0, 'fltcoff': 0, 'fltlo': 0, 'fltband': 0, 'flthi': 0, 'vol': 15},
          {'hashid': 1, 'count': 1, 'clock': 1e5, 'gate1': 0}], dtype=pd.UInt64Dtype())
     s = self._df2ssf(df, percussion=True)
     self.assertEqual(s.waveforms, {'tri'})
     self.assertEqual(s.midi_pitches, (35,))
     self.assertEqual(s.total_duration, 98525)
     self.assertEqual(s.midi_notes, ((0, 35, 98525, 127, 60.134765625),))
Code example #8
File: parquet.py (project: zjkanjie/pandas)
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Code example #9
File: sidlib.py (project: anarkiwi/desidulate)
def set_sid_dtype(df):
    df.dtype = pd.UInt64Dtype()  # note: sets a plain attribute only; the per-column casts below do the work
    for col in df.columns:
        if col.startswith('freq') or col.startswith(
                'pwduty') or col == 'fltcoff':
            col_type = pd.UInt16Dtype()
        elif col[-1].isdigit() or col.startswith('flt'):
            col_type = pd.UInt8Dtype()
        else:
            continue
        df[col] = df[col].astype(col_type)
    return df
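A minimal usage sketch (column values illustrative): frequency registers downcast to UInt16, per-voice flags to UInt8, and unmatched columns keep their dtype:

import pandas as pd

df = pd.DataFrame({'freq1': [1024], 'gate1': [1], 'vol': [15]},
                  dtype=pd.UInt64Dtype())
df = set_sid_dtype(df)
print(df.dtypes)  # freq1 -> UInt16, gate1 -> UInt8, vol -> UInt64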
Code example #10
 def test_numeric_dtypes(self):
     dtypes = [
         bool,
         np.byte,
         np.ubyte,
         np.short,
         np.ushort,
         np.single,
         np.int32,
         np.intc,
         np.half,
         np.float16,
         np.double,
         np.float64,
         pd.StringDtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".xml", "xml"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         with tmpfile(suffix) as path:
             for dtype in dtypes:
                 try:
                     df = Ind2Col2.convert(Ind2Col2(
                         sample_data_ind2_col2())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Code example #11
File: parquet.py (project: tnir/pandas)
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
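The mapping above is reached through the public API; on pandas versions that expose the flag, missing integers come back as pd.NA under nullable dtypes instead of forcing float64 columns. A sketch (file name hypothetical):

import pandas as pd

df = pd.read_parquet("data.parquet", use_nullable_dtypes=True)
print(df.dtypes)  # e.g. Int64/UInt8/boolean/string rather than float64/object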
Code example #12
def treenode_df(response: List[List[Any]]) -> pd.DataFrame:
    return lol_to_df(
        response,
        ["id", "parent", "user", "x", "y", "z", "radius", "confidence"],
        [
            np.uint64,
            pd.UInt64Dtype(),
            np.uint64,
            np.float64,
            np.float64,
            np.float64,
            np.float64,
            np.uint8,
        ],
    )
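parent is the one nullable column because root treenodes have no parent; a hedged sketch of the intended behavior, assuming lol_to_df applies the listed per-column dtypes (row values illustrative):

rows = [[10, None, 3, 0.0, 0.0, 0.0, -1.0, 5]]  # a root node: parent is None
df = treenode_df(rows)
assert df["parent"].isna().iloc[0]  # None survives as pd.NA under UInt64Dtype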
Code example #13
 def test_numeric_nullable_dtypes(self):
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
         pd.StringDtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Code example #14
File: test_dtype.py (project: cwegrzyn/records-mover)
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(required=True,
                                                           unique=None,
                                                           min_=min_,
                                                           max_=max_)
        yield with_nullable(
            True, check_dtype), "integer", constraints, expected_pandas_type
Code example #15
revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64
}

pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    pd.BooleanDtype(): np.bool_
}


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """ Get appropriate typecodes for column dtype

    Data conversion do not happen here, see convert().

    The user is expected to transform their data into the appropriate dtype
    before saving to parquet, we will not make any assumptions for them.

    Known types that cannot be represented (must be first converted another
    type or to raw binary): float128, complex
Code example #16
from dask_sql.java import SqlTypeName

# Default mapping between python types and SQL types
_PYTHON_TO_SQL = {
    np.float64: SqlTypeName.DOUBLE,
    np.float32: SqlTypeName.FLOAT,
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool_: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

# Default mapping between SQL types and python types
# for values
Code example #17
class MicrosoftAcademicGraph:

    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + ".txt"

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            nullable = fieldtype.endswith("?")
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == "DateTime":
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(
            filepath_or_buffer=self.get_full_path(stream_name),
            parse_dates=date_columns,
            low_memory=False,
            names=column_name,
            dtype=column_type,
            date_parser=self.date_parse_func,
            sep="\t",
        )

    # date parse function
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format="%m/%d/%Y %H:%M:%S %p", errors="coerce"
    )  # 6/24/2016 12:00:00 AM

    # convert input datatype to Pandas datatype
    datatypedict = {
        "int": pd.Int32Dtype(),
        "uint": pd.UInt32Dtype(),
        "long": pd.Int64Dtype(),
        "ulong": pd.UInt64Dtype(),
        "float": np.float32,
        "string": np.string_,
        "DateTime": np.string_,
    }

    # define stream dictionary
    streams = {
        "Affiliations": [
            "AffiliationId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "GridId:string",
            "OfficialPage:string",
            "WikiPage:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "Latitude:float?",
            "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "Authors": [
            "AuthorId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "LastKnownAffiliationId:long?",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "ConferenceInstances": [
            "ConferenceInstanceId:long",
            "NormalizedName:string",
            "DisplayName:string",
            "ConferenceSeriesId:long",
            "Location:string",
            "OfficialUrl:string",
            "StartDate:DateTime?",
            "EndDate:DateTime?",
            "AbstractRegistrationDate:DateTime?",
            "SubmissionDeadlineDate:DateTime?",
            "NotificationDueDate:DateTime?",
            "FinalVersionDueDate:DateTime?",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "Latitude:float?",
            "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "ConferenceSeries": [
            "ConferenceSeriesId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "EntityRelatedEntities": [
            "EntityId:long",
            "EntityType:string",
            "RelatedEntityId:long",
            "RelatedEntityType:string",
            "RelatedType:int",
            "Score:float",
        ],
        "FieldOfStudyChildren":
        ["FieldOfStudyId:long", "ChildFieldOfStudyId:long"],
        "FieldOfStudyExtendedAttributes": [
            "FieldOfStudyId:long",
            "AttributeType:int",
            "AttributeValue:string",
        ],
        "FieldsOfStudy": [
            "FieldOfStudyId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "MainType:string",
            "Level:int",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "Journals": [
            "JournalId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "Issn:string",
            "Publisher:string",
            "Webpage:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "PaperAbstractsInvertedIndex":
        ["PaperId:long", "IndexedAbstract:string"],
        "PaperAuthorAffiliations": [
            "PaperId:long",
            "AuthorId:long",
            "AffiliationId:long?",
            "AuthorSequenceNumber:uint",
            "OriginalAuthor:string",
            "OriginalAffiliation:string",
        ],
        "PaperCitationContexts": [
            "PaperId:long",
            "PaperReferenceId:long",
            "CitationContext:string",
        ],
        "PaperExtendedAttributes": [
            "PaperId:long",
            "AttributeType:int",
            "AttributeValue:string",
        ],
        "PaperFieldsOfStudy":
        ["PaperId:long", "FieldOfStudyId:long", "Score:float"],
        "PaperRecommendations": [
            "PaperId:long",
            "RecommendedPaperId:long",
            "Score:float",
        ],
        "PaperReferences": ["PaperId:long", "PaperReferenceId:long"],
        "PaperResources": [
            "PaperId:long",
            "ResourceType:int",
            "ResourceUrl:string",
            "SourceUrl:string",
            "RelationshipType:int",
        ],
        "PaperUrls": [
            "PaperId:long",
            "SourceType:int?",
            "SourceUrl:string",
            "LanguageCode:string",
        ],
        "Papers": [
            "PaperId:long",
            "Rank:uint",
            "Doi:string",
            "DocType:string",
            "PaperTitle:string",
            "OriginalTitle:string",
            "BookTitle:string",
            "Year:int?",
            "Date:DateTime?",
            "OnlineDate:DateTime?",
            "Publisher:string",
            "JournalId:long?",
            "ConferenceSeriesId:long?",
            "ConferenceInstanceId:long?",
            "Volume:string",
            "Issue:string",
            "FirstPage:string",
            "LastPage:string",
            "ReferenceCount:long",
            "CitationCount:long",
            "EstimatedCitation:long",
            "OriginalVenue:string",
            "FamilyId:long?",
            "CreatedDate:DateTime",
        ],
        "RelatedFieldOfStudy": [
            "FieldOfStudyId1:long",
            "Type1:string",
            "FieldOfStudyId2:long",
            "Type2:string",
            "Rank:float",
        ],
    }
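Putting the class to work takes two lines; a usage sketch (root path hypothetical):

mag = MicrosoftAcademicGraph(root="/data/mag/")
affiliations = mag.get_data_frame("Affiliations")  # reads /data/mag/Affiliations.txt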
Code example #18
class MicrosoftAcademicGraph():

    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + '.txt'

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(':')
            nullable = fieldtype.endswith('?')
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == 'DateTime':
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(':')
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(filepath_or_buffer=self.get_full_path(stream_name),
                           parse_dates=date_columns,
                           low_memory=False,
                           names=column_name,
                           dtype=column_type,
                           date_parser=self.date_parse_func,
                           sep='\t')

    # date parse function
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format='%m/%d/%Y %H:%M:%S %p', errors='coerce'
    )  #6/24/2016 12:00:00 AM

    # convert input datatype to Pandas datatype
    datatypedict = {
        'int': pd.Int32Dtype(),
        'uint': pd.UInt32Dtype(),
        'long': pd.Int64Dtype(),
        'ulong': pd.UInt64Dtype(),
        'float': np.float32,
        'string': np.string_,
        'DateTime': np.string_,
    }

    # define stream dictionary
    streams = {
        'Affiliations': [
            'AffiliationId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'GridId:string', 'OfficialPage:string',
            'WikiPage:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'Latitude:float?', 'Longitude:float?',
            'CreatedDate:DateTime'
        ],
        'Authors': [
            'AuthorId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'LastKnownAffiliationId:long?',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'CreatedDate:DateTime'
        ],
        'ConferenceInstances': [
            'ConferenceInstanceId:long', 'NormalizedName:string',
            'DisplayName:string', 'ConferenceSeriesId:long', 'Location:string',
            'OfficialUrl:string', 'StartDate:DateTime?', 'EndDate:DateTime?',
            'AbstractRegistrationDate:DateTime?',
            'SubmissionDeadlineDate:DateTime?',
            'NotificationDueDate:DateTime?', 'FinalVersionDueDate:DateTime?',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'Latitude:float?', 'Longitude:float?', 'CreatedDate:DateTime'
        ],
        'ConferenceSeries': [
            'ConferenceSeriesId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'CreatedDate:DateTime'
        ],
        'EntityRelatedEntities': [
            'EntityId:long', 'EntityType:string', 'RelatedEntityId:long',
            'RelatedEntityType:string', 'RelatedType:int', 'Score:float'
        ],
        'FieldOfStudyChildren':
        ['FieldOfStudyId:long', 'ChildFieldOfStudyId:long'],
        'FieldOfStudyExtendedAttributes':
        ['FieldOfStudyId:long', 'AttributeType:int', 'AttributeValue:string'],
        'FieldsOfStudy': [
            'FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'MainType:string', 'Level:int',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'CreatedDate:DateTime'
        ],
        'Journals': [
            'JournalId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'Issn:string', 'Publisher:string',
            'Webpage:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'CreatedDate:DateTime'
        ],
        'PaperAbstractsInvertedIndex':
        ['PaperId:long', 'IndexedAbstract:string'],
        'PaperAuthorAffiliations': [
            'PaperId:long', 'AuthorId:long', 'AffiliationId:long?',
            'AuthorSequenceNumber:uint', 'OriginalAuthor:string',
            'OriginalAffiliation:string'
        ],
        'PaperCitationContexts':
        ['PaperId:long', 'PaperReferenceId:long', 'CitationContext:string'],
        'PaperExtendedAttributes':
        ['PaperId:long', 'AttributeType:int', 'AttributeValue:string'],
        'PaperFieldsOfStudy':
        ['PaperId:long', 'FieldOfStudyId:long', 'Score:float'],
        'PaperRecommendations':
        ['PaperId:long', 'RecommendedPaperId:long', 'Score:float'],
        'PaperReferences': ['PaperId:long', 'PaperReferenceId:long'],
        'PaperResources': [
            'PaperId:long', 'ResourceType:int', 'ResourceUrl:string',
            'SourceUrl:string', 'RelationshipType:int'
        ],
        'PaperUrls': [
            'PaperId:long', 'SourceType:int?', 'SourceUrl:string',
            'LanguageCode:string'
        ],
        'Papers': [
            'PaperId:long', 'Rank:uint', 'Doi:string', 'DocType:string',
            'PaperTitle:string', 'OriginalTitle:string', 'BookTitle:string',
            'Year:int?', 'Date:DateTime?', 'OnlineDate:DateTime?',
            'Publisher:string', 'JournalId:long?', 'ConferenceSeriesId:long?',
            'ConferenceInstanceId:long?', 'Volume:string', 'Issue:string',
            'FirstPage:string', 'LastPage:string', 'ReferenceCount:long',
            'CitationCount:long', 'EstimatedCitation:long',
            'OriginalVenue:string', 'FamilyId:long?', 'CreatedDate:DateTime'
        ],
        'RelatedFieldOfStudy': [
            'FieldOfStudyId1:long', 'Type1:string', 'FieldOfStudyId2:long',
            'Type2:string', 'Rank:float'
        ],
    }
Code example #19
File: dtypes.py (project: mikest18/cudf)
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
Code example #20
File: test_series.py (project: miguelusque/cudf)
 (
     cudf.Series([1, 2, None, 3], dtype="uint8"),
     pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
 ),
 (
     cudf.Series([23, None, None, 32], dtype="uint16"),
     pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
 ),
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series([234, 2323, 23432, None, None, 224],
               dtype=pd.UInt64Dtype()),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series([11, None, 22, 33, None, 2, None, 3],
               dtype=pd.Int32Dtype()),
 ),
 (
Code example #21
File: results.py (project: filipbartek/vampire-ml)
def generate_problems_df(runs_df,
                         probe_runs_df=None,
                         problem_paths=None,
                         problem_base_path=None,
                         custom_runs_df=None):
    if problem_paths is None:
        problem_paths = runs_df.problem_path
    elif problem_base_path is not None:
        problem_paths = [
            os.path.relpath(problem_path, problem_base_path)
            for problem_path in problem_paths
        ]
    problem_paths = pd.Index(problem_paths).drop_duplicates()
    problems_df = pd.DataFrame(index=problem_paths)
    problems_df.index.name = 'problem_path'
    # Merge probe run results into `problems_df`
    if probe_runs_df is not None:
        problems_df = problems_df.join(probe_runs_df[[
            'problem_path', 'predicates_count', 'functions_count',
            'clauses_count'
        ]].drop_duplicates('problem_path').set_index('problem_path'),
                                       rsuffix='probe')
    # Random solve run stats
    problem_groups = runs_df.groupby(['problem_path'])
    problems_df = problems_df.join(problem_groups.size().astype(
        pd.UInt64Dtype()).to_frame('n_total'))
    problems_df = problems_df.join(
        runs_df[runs_df.status == 'completed'].groupby([
            'problem_path'
        ]).size().astype(pd.UInt64Dtype()).to_frame('n_completed'))
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).size().astype(pd.UInt64Dtype()).to_frame('n_exit_0'))
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 1].groupby(
        ['problem_path']).size().astype(pd.UInt64Dtype()).to_frame('n_exit_1'))
    if 'termination_reason' in runs_df:
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Refutation'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_refutation'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Satisfiable'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_satisfiable'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Time limit'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_time_limit'))
    problems_df.fillna(
        {
            'n_total': 0,
            'n_completed': 0,
            'n_exit_0': 0,
            'n_exit_1': 0,
            'n_refutation': 0,
            'n_satisfiable': 0,
            'n_time_limit': 0
        },
        inplace=True)

    def variation(a):
        if (a == 0).all():
            # We need to handle this special case explicitly because `scipy.stats.variation` raises an exception on it.
            return 0
        res = scipy.stats.variation(a.astype(np.float64), nan_policy='omit')
        if isinstance(res, np.ma.core.MaskedConstant):
            # The input array contains all nans.
            return np.nan
        return res

    agg_functions = [np.mean, np.std, variation, np.min, np.max]
    # Aggregate time measurements across successful runs
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).agg({
            field_name: agg_functions
            for field_name in [
                'time_elapsed_process', 'time_elapsed_vampire',
                'saturation_iterations'
            ]
        }))
    # Count unique numbers of saturation iterations across successful runs
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).agg({
            'saturation_iterations': ['nunique']
        }).astype(pd.UInt64Dtype()))
    # Aggregate memory across all runs
    problems_df = problems_df.join(
        runs_df.groupby(['problem_path']).agg({'memory_used': agg_functions}))
    if custom_runs_df is not None:
        for name, value in custom_runs_df.groupby(['name']):
            value = value.set_index('problem_path')
            value = value[['exit_code', 'saturation_iterations']]
            # https://stackoverflow.com/a/40225796/4054250
            value.columns = pd.MultiIndex.from_product([[name], value.columns])
            problems_df = problems_df.join(value, rsuffix=name)
    problems_df.sort_index(inplace=True)
    return problems_df
Code example #22
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
Code example #23
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}
nullable = {
    np.dtype('int8'): pd.Int8Dtype(),
    np.dtype('int16'): pd.Int16Dtype(),
    np.dtype('int32'): pd.Int32Dtype(),
    np.dtype('int64'): pd.Int64Dtype(),
    np.dtype('uint8'): pd.UInt8Dtype(),
    np.dtype('uint16'): pd.UInt16Dtype(),
    np.dtype('uint32'): pd.UInt32Dtype(),
    np.dtype('uint64'): pd.UInt64Dtype(),
    np.dtype('bool'): pd.BooleanDtype()
}
pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
    "UInt16": pd.UInt16Dtype(),
    "UInt32": pd.UInt32Dtype(),
    "UInt64": pd.UInt64Dtype(),
    "boolean": pd.BooleanDtype()
}
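The nullable table maps plain numpy dtypes to their pandas nullable counterparts, which makes opting a whole frame into nullable dtypes a one-liner; a hedged sketch:

# assumes an existing DataFrame df; unmapped dtypes pass through unchanged
df = df.astype({col: nullable.get(dt, dt) for col, dt in df.dtypes.items()})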

Code example #24
File: test_column.py (project: rongou/cudf)
def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
    pyarrow_data = pa.array(data, **pyarrow_kwargs)
    cudf_from_pyarrow = as_column(pyarrow_data)
    expected = as_column(data, **cudf_kwargs)
    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]
Code example #25
File: test_series.py (project: zkh2016/cudf)
 (
     cudf.Series([1, 2, None, 3], dtype="uint8"),
     pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
 ),
 (
     cudf.Series([23, None, None, 32], dtype="uint16"),
     pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
 ),
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series(
         [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()
     ),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series(
         [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()
     ),
Code example #26
File: _pyreadr_writer.py (project: ty1539/pyreadr)
import datetime

import numpy as np
import pandas as pd

from .librdata import Writer
from .custom_errors import PyreadrError


# configuration

int_types = {np.dtype('int32'), np.dtype('int16'), np.dtype('int8'), np.dtype('uint8'), np.dtype('uint16'),
             np.int32, np.int16, np.int8, np.uint8, np.uint16}
int_mixed_types = {pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype()}
float_types = {np.dtype('int64'), np.dtype('uint64'), np.dtype('uint32'), np.dtype('float'),
               np.int64, np.uint64, np.uint32, float, pd.Int64Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()}
datetime_types = {datetime.datetime, np.datetime64}

pyreadr_to_librdata_types = {"INTEGER": "INTEGER", "NUMERIC": "NUMERIC",
                        "LOGICAL": "LOGICAL", "CHARACTER": "CHARACTER",
                        "OBJECT": "CHARACTER", "DATE": "CHARACTER",
                        "DATETIME":"CHARACTER"}
                        
librdata_min_integer = -2147483648


def get_pyreadr_column_types(df):
    """
    From a pandas data frame, get an OrderedDict with column name as key
    and pyreadr column type as value, and also a list with boolean 
    values indicating if the column has missing values (np.nan).
Code example #27
File: pandas_engine.py (project: pandera-dev/pandera)
    type = pd.Int8Dtype()
    bit_width: int = 8


###############################################################################
# unsigned integer
###############################################################################

_register_numpy_numbers(
    builtin_name="uint",
    pandera_name="UInt",
    sizes=[64, 32, 16, 8],
)


@Engine.register_dtype(equivalents=[pd.UInt64Dtype, pd.UInt64Dtype()])
@immutable
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64


@Engine.register_dtype(equivalents=[pd.UInt32Dtype, pd.UInt32Dtype()])
@immutable
class UINT32(UINT64):
    """Semantic representation of a :class:`pandas.UInt32Dtype`."""

    type = pd.UInt32Dtype()
    bit_width: int = 32
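The registrations continue down the bit widths in the same shape; a sketch of the next one (pandera defines UINT16 and UINT8 analogously):

@Engine.register_dtype(equivalents=[pd.UInt16Dtype, pd.UInt16Dtype()])
@immutable
class UINT16(UINT32):
    """Semantic representation of a :class:`pandas.UInt16Dtype`."""

    type = pd.UInt16Dtype()
    bit_width: int = 16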
Code example #28
File: pandas_engine.py (project: pandera-dev/pandera)
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64
Code example #29
File: utils.py (project: wphicks/cudf)
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
Code example #30
    "int8": "Int8",
    "datetime64[D]": "Date",
    "datetime64[ns]": "DateTime",
}

PD2CH = keymap(np.dtype, MAPPING)

PD_INT_TYPES = [
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.Int64Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype(),
    pd.UInt32Dtype(),
    pd.UInt64Dtype(),
]

for typ in PD_INT_TYPES:
    PD2CH[typ] = f"Nullable({typ.name})"

CH2PD = itemmap(reversed, MAPPING)
CH2PD["Null"] = "object"
CH2PD["Nothing"] = "object"

NULLABLE_COLS = [
    "Float64",
    "Float32",
    "String",
]
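With the loop above, every nullable integer dtype resolves to a Nullable(...) ClickHouse type via its .name attribute; a quick check:

assert PD2CH[pd.UInt64Dtype()] == "Nullable(UInt64)"  # UInt64Dtype().name == "UInt64"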