Example #1
    def test_df2wav(self):
        sid = get_sid(pal=True)
        test_wav = os.path.join(self.tmpdir.name, 'test.wav')
        transformer = sox.Transformer()

        for i in range(1, 5):
            test_raw_freq = int(i * 2048)
            test_real_freq = sid.real_sid_freq(test_raw_freq)
            df = pd.DataFrame([{
                'hashid': 1,
                'count': 1,
                'clock': 0,
                'freq1': test_raw_freq,
                'sus1': 15,
                'gate1': 1,
                'tri1': 1,
                'vol': 15
            }, {
                'hashid': 1,
                'count': 1,
                'clock': 1e6 * 10,
                'gate1': 0
            }],
                              dtype=pd.UInt64Dtype()).set_index('clock')
            df = df.fillna(method='ffill').astype(pd.UInt64Dtype())
            write_wav(test_wav, sid, state2samples(df, sid))
            power_df = pd.DataFrame(transformer.power_spectrum(test_wav),
                                    columns=['freq', 'val'])
            val_max = power_df['val'].max()
            freq_max = power_df[power_df['val'] == val_max].iloc[0]['freq']
            freq_diff = abs(freq_max - test_real_freq)
            self.assertLessEqual(freq_diff, 3)
Example #2
    def test_reductions_2d_axis0(self, data, method, request):
        if not hasattr(data, method):
            pytest.skip("test is not applicable for this type/dtype")

        arr2d = data.reshape(1, -1)

        kwargs = {}
        if method == "std":
            # pass ddof=0 so we get all-zero std instead of all-NA std
            kwargs["ddof"] = 0

        try:
            result = getattr(arr2d, method)(axis=0, **kwargs)
        except Exception as err:
            try:
                getattr(data, method)()
            except Exception as err2:
                assert type(err) == type(err2)
                return
            else:
                raise AssertionError("Both reductions should raise or neither")

        if method in ["mean", "median", "sum", "prod"]:
            # std and var are not dtype-preserving
            expected = data
            if method in ["sum", "prod"] and data.dtype.kind in "iub":
                # FIXME: kludge
                if data.dtype.kind in ["i", "b"]:
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.Int32Dtype()
                        else:
                            dtype = pd.Int64Dtype()
                    else:
                        dtype = pd.Int64Dtype()
                elif data.dtype.kind == "u":
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.UInt32Dtype()
                        else:
                            dtype = pd.UInt64Dtype()
                    else:
                        dtype = pd.UInt64Dtype()

                expected = data.astype(dtype)
                if data.dtype.kind == "b" and method in ["sum", "prod"]:
                    # We get IntegerArray instead of BooleanArray
                    pass
                else:
                    assert type(expected) == type(data), type(expected)
                assert dtype == expected.dtype

            self.assert_extension_array_equal(result, expected)
        elif method == "std":
            self.assert_extension_array_equal(result, data - data)
Example #3
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
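A minimal usage sketch (not part of the snippet above; assumes pyarrow is imported as pa and pandas as pd): because the converter maps a pa.DataType to either a nullable extension dtype or None, it can be passed directly as the types_mapper callback of pyarrow's Table.to_pandas.

import pyarrow as pa

table = pa.table({"a": pa.array([1, None, 3], type=pa.int32())})
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)
print(df.dtypes)  # a: Int32 (nullable, so the None survives as pd.NA)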
Example #4
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
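A hedged usage sketch (IntegerType members taken from the snippet above, pandas imported as pd): resolve the mapping once, then look up a logical type to get a concrete dtype.

mapping = integer_type_mapping(use_extension_types=True)
dtype = mapping[IntegerType.UINT16]            # pd.UInt16Dtype()
series = pd.Series([1, None, 3], dtype=dtype)  # None is stored as pd.NA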
Example #5
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
Example #6
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # build the mapping in two steps; dict.update() mutates in place and
        # returns None, so it cannot be chained onto the comprehension
        self._BOOL_MAP_DICT = {i: True for i in BOOL_STRINGS_TRUE}
        self._BOOL_MAP_DICT.update({i: False for i in BOOL_STRINGS_FALSE})
        self._DTYPE_BOOL_BASE = np.bool_
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
Example #7
    def test_notest_ssf(self):
        df = pd.DataFrame(
            [{'hashid': 1, 'count': 1, 'clock': 0, 'freq1': 1024,
              'pwduty1': 0, 'atk1': 0, 'dec1': 0, 'sus1': 15, 'rel1': 0,
              'gate1': 1, 'sync1': 0, 'ring1': 0, 'test1': 0, 'tri1': 1,
              'saw1': 0, 'pulse1': 0, 'noise1': 0, 'flt1': 0, 'fltres': 0,
              'fltcoff': 0, 'fltlo': 0, 'fltband': 0, 'flthi': 0, 'vol': 15},
             {'hashid': 1, 'count': 1, 'clock': 1e5, 'gate1': 0}],
            dtype=pd.UInt64Dtype())
        s = self._df2ssf(df, percussion=True)
        self.assertEqual(s.waveforms, {'tri'})
        self.assertEqual(s.midi_pitches, (35,))
        self.assertEqual(s.total_duration, 98525)
        self.assertEqual(s.midi_notes, ((0, 35, 98525, 127, 60.134765625),))
Example #8
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #9
def set_sid_dtype(df):
    # default every column to nullable UInt64, then downcast the known
    # register columns below
    df = df.astype(pd.UInt64Dtype())
    for col in df.columns:
        if col.startswith('freq') or col.startswith(
                'pwduty') or col == 'fltcoff':
            col_type = pd.UInt16Dtype()
        elif col[-1].isdigit() or col.startswith('flt'):
            col_type = pd.UInt8Dtype()
        else:
            continue
        df[col] = df[col].astype(col_type)
    return df
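An illustrative call (column names invented to exercise each branch): 'freq1' matches the 16-bit prefix rule, 'gate1' ends in a digit so it is downcast to 8 bits, and 'clock' falls through unchanged.

df = pd.DataFrame({'freq1': [1024], 'gate1': [1], 'clock': [0]},
                  dtype=pd.UInt64Dtype())
df = set_sid_dtype(df)
print(df.dtypes)  # freq1: UInt16, gate1: UInt8, clock: UInt64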
Example #10
    def test_numeric_dtypes(self):
        dtypes = [
            bool,
            np.byte,
            np.ubyte,
            np.short,
            np.ushort,
            np.single,
            np.int32,
            np.intc,
            np.half,
            np.float16,
            np.double,
            np.float64,
            pd.StringDtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
        ]
        for suffix, fn in [
            (".snappy", "parquet"),
            (".feather", "feather"),
            (".xml", "xml"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
        ]:
            with tmpfile(suffix) as path:
                for dtype in dtypes:
                    try:
                        df = Ind2Col2.convert(Ind2Col2(
                            sample_data_ind2_col2())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise
Example #11
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #12
def treenode_df(response: List[List[Any]]) -> pd.DataFrame:
    return lol_to_df(
        response,
        ["id", "parent", "user", "x", "y", "z", "radius", "confidence"],
        [
            np.uint64,
            pd.UInt64Dtype(),
            np.uint64,
            np.float64,
            np.float64,
            np.float64,
            np.float64,
            np.uint8,
        ],
    )
Example #13
    def test_numeric_nullable_dtypes(self):
        dtypes = [
            pd.StringDtype(),
            pd.BooleanDtype(),
            pd.Float64Dtype(),
            pd.Float32Dtype(),
            pd.Int64Dtype(),
            pd.UInt64Dtype(),
            pd.Int32Dtype(),
            pd.UInt32Dtype(),
            pd.Int16Dtype(),
            pd.UInt16Dtype(),
            pd.Int8Dtype(),
            pd.UInt8Dtype(),
            pd.StringDtype(),
        ]
        # TODO: Re-add (".xml", "xml"),
        # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
        for suffix, fn in [
            (".snappy", "parquet"),
            (".feather", "feather"),
            (".csv", "csv"),
            (".tsv", "tsv"),
            (".json", "json"),
            (".xlsx", "xlsx"),
            (".xls", "xls"),
            (".xlsb", "xlsb"),
            (".ods", "ods"),
            (".pickle", "pickle"),
        ]:
            # TODO: include xml
            for dtype in dtypes:
                with tmpfile(suffix) as path:
                    try:
                        df = Ind2Col2.convert(
                            Ind2Col2(
                                sample_data_ind2_col2_pd_na())).astype(dtype)
                        assert list(df.index.names) == ["qqq", "rrr"]
                        assert list(df.columns) == ["abc", "xyz"]
                        getattr(df, "to_" + fn)(path)
                        df2 = getattr(Ind2Col2, "read_" + fn)(path)
                        assert list(df2.index.names) == ["qqq", "rrr"]
                        assert list(df2.columns) == ["abc", "xyz"]
                    except Exception:
                        logger.error(f"Failed on path {path}, dtype {dtype}")
                        raise
Example #14
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(required=True,
                                                           unique=None,
                                                           min_=min_,
                                                           max_=max_)
        yield with_nullable(
            True, check_dtype), "integer", constraints, expected_pandas_type
Example #15
revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64
}

pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    pd.BooleanDtype(): np.bool_
}
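A small sketch of how such a map is typically consumed (helper name invented): nullable pandas dtype instances hash and compare equal, so a plain dict lookup with a fallback degrades them to their numpy base types.

def to_numpy_type(dtype):
    # invented helper: fall back to the input when it is already numpy-native
    return pdoptional_to_numpy_typemap.get(dtype, dtype)

assert to_numpy_type(pd.UInt16Dtype()) is np.uint16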


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """ Get appropriate typecodes for column dtype

    Data conversion does not happen here; see convert().

    The user is expected to transform their data into the appropriate dtype
    before saving to parquet; we will not make any assumptions for them.

    Known types that cannot be represented (must first be converted to another
    type or to raw binary): float128, complex
Example #16
from dask_sql.java import SqlTypeName

# Default mapping between python types and SQL types
_PYTHON_TO_SQL = {
    np.float64: SqlTypeName.DOUBLE,
    np.float32: SqlTypeName.FLOAT,
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}
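For orientation (a sketch, not part of dask_sql's API): the dict is keyed on a mix of numpy scalar types and pandas extension-dtype instances; since extension dtypes hash equal across instances, a lookup with a freshly constructed dtype works.

sql_type = _PYTHON_TO_SQL[pd.Int64Dtype()]  # SqlTypeName.BIGINT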

# Default mapping between SQL types and python types
# for values
Example #17
class MicrosoftAcademicGraph:

    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + ".txt"

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            nullable = fieldtype.endswith("?")
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == "DateTime":
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(
            filepath_or_buffer=self.get_full_path(stream_name),
            parse_dates=date_columns,
            low_memory=False,
            names=column_name,
            dtype=column_type,
            date_parser=self.date_parse_func,
            sep="\t",
        )

    # date parse function (sample value: 6/24/2016 12:00:00 AM); %I is the
    # 12-hour directive, required for the %p (AM/PM) marker to take effect
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format="%m/%d/%Y %I:%M:%S %p", errors="coerce"
    )

    # convert input datatype to Pandas datatype
    datatypedict = {
        "int": pd.Int32Dtype(),
        "uint": pd.UInt32Dtype(),
        "long": pd.Int64Dtype(),
        "ulong": pd.UInt64Dtype(),
        "float": np.float32,
        "string": np.string_,
        "DateTime": np.string_,
    }

    # define stream dictionary
    streams = {
        "Affiliations": [
            "AffiliationId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "GridId:string",
            "OfficialPage:string",
            "WikiPage:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "Latitude:float?",
            "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "Authors": [
            "AuthorId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "LastKnownAffiliationId:long?",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "ConferenceInstances": [
            "ConferenceInstanceId:long",
            "NormalizedName:string",
            "DisplayName:string",
            "ConferenceSeriesId:long",
            "Location:string",
            "OfficialUrl:string",
            "StartDate:DateTime?",
            "EndDate:DateTime?",
            "AbstractRegistrationDate:DateTime?",
            "SubmissionDeadlineDate:DateTime?",
            "NotificationDueDate:DateTime?",
            "FinalVersionDueDate:DateTime?",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "Latitude:float?",
            "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "ConferenceSeries": [
            "ConferenceSeriesId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "EntityRelatedEntities": [
            "EntityId:long",
            "EntityType:string",
            "RelatedEntityId:long",
            "RelatedEntityType:string",
            "RelatedType:int",
            "Score:float",
        ],
        "FieldOfStudyChildren":
        ["FieldOfStudyId:long", "ChildFieldOfStudyId:long"],
        "FieldOfStudyExtendedAttributes": [
            "FieldOfStudyId:long",
            "AttributeType:int",
            "AttributeValue:string",
        ],
        "FieldsOfStudy": [
            "FieldOfStudyId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "MainType:string",
            "Level:int",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "Journals": [
            "JournalId:long",
            "Rank:uint",
            "NormalizedName:string",
            "DisplayName:string",
            "Issn:string",
            "Publisher:string",
            "Webpage:string",
            "PaperCount:long",
            "PaperFamilyCount:long",
            "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "PaperAbstractsInvertedIndex":
        ["PaperId:long", "IndexedAbstract:string"],
        "PaperAuthorAffiliations": [
            "PaperId:long",
            "AuthorId:long",
            "AffiliationId:long?",
            "AuthorSequenceNumber:uint",
            "OriginalAuthor:string",
            "OriginalAffiliation:string",
        ],
        "PaperCitationContexts": [
            "PaperId:long",
            "PaperReferenceId:long",
            "CitationContext:string",
        ],
        "PaperExtendedAttributes": [
            "PaperId:long",
            "AttributeType:int",
            "AttributeValue:string",
        ],
        "PaperFieldsOfStudy":
        ["PaperId:long", "FieldOfStudyId:long", "Score:float"],
        "PaperRecommendations": [
            "PaperId:long",
            "RecommendedPaperId:long",
            "Score:float",
        ],
        "PaperReferences": ["PaperId:long", "PaperReferenceId:long"],
        "PaperResources": [
            "PaperId:long",
            "ResourceType:int",
            "ResourceUrl:string",
            "SourceUrl:string",
            "RelationshipType:int",
        ],
        "PaperUrls": [
            "PaperId:long",
            "SourceType:int?",
            "SourceUrl:string",
            "LanguageCode:string",
        ],
        "Papers": [
            "PaperId:long",
            "Rank:uint",
            "Doi:string",
            "DocType:string",
            "PaperTitle:string",
            "OriginalTitle:string",
            "BookTitle:string",
            "Year:int?",
            "Date:DateTime?",
            "OnlineDate:DateTime?",
            "Publisher:string",
            "JournalId:long?",
            "ConferenceSeriesId:long?",
            "ConferenceInstanceId:long?",
            "Volume:string",
            "Issue:string",
            "FirstPage:string",
            "LastPage:string",
            "ReferenceCount:long",
            "CitationCount:long",
            "EstimatedCitation:long",
            "OriginalVenue:string",
            "FamilyId:long?",
            "CreatedDate:DateTime",
        ],
        "RelatedFieldOfStudy": [
            "FieldOfStudyId1:long",
            "Type1:string",
            "FieldOfStudyId2:long",
            "Type2:string",
            "Rank:float",
        ],
    }
Example #18
class MicrosoftAcademicGraph:

    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + '.txt'

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(':')
            nullable = fieldtype.endswith('?')
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == 'DateTime':
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(':')
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(filepath_or_buffer=self.get_full_path(stream_name),
                           parse_dates=date_columns,
                           low_memory=False,
                           names=column_name,
                           dtype=column_type,
                           date_parser=self.date_parse_func,
                           sep='\t')

    # date parse function (sample value: 6/24/2016 12:00:00 AM); %I is the
    # 12-hour directive, required for the %p (AM/PM) marker to take effect
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format='%m/%d/%Y %I:%M:%S %p', errors='coerce'
    )

    # convert input datatype to Pandas datatype
    datatypedict = {
        'int': pd.Int32Dtype(),
        'uint': pd.UInt32Dtype(),
        'long': pd.Int64Dtype(),
        'ulong': pd.UInt64Dtype(),
        'float': np.float32,
        'string': np.string_,
        'DateTime': np.string_,
    }

    # define stream dictionary
    streams = {
        'Affiliations': [
            'AffiliationId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'GridId:string', 'OfficialPage:string',
            'WikiPage:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'Latitude:float?', 'Longitude:float?',
            'CreatedDate:DateTime'
        ],
        'Authors': [
            'AuthorId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'LastKnownAffiliationId:long?',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'CreatedDate:DateTime'
        ],
        'ConferenceInstances': [
            'ConferenceInstanceId:long', 'NormalizedName:string',
            'DisplayName:string', 'ConferenceSeriesId:long', 'Location:string',
            'OfficialUrl:string', 'StartDate:DateTime?', 'EndDate:DateTime?',
            'AbstractRegistrationDate:DateTime?',
            'SubmissionDeadlineDate:DateTime?',
            'NotificationDueDate:DateTime?', 'FinalVersionDueDate:DateTime?',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'Latitude:float?', 'Longitude:float?', 'CreatedDate:DateTime'
        ],
        'ConferenceSeries': [
            'ConferenceSeriesId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'CreatedDate:DateTime'
        ],
        'EntityRelatedEntities': [
            'EntityId:long', 'EntityType:string', 'RelatedEntityId:long',
            'RelatedEntityType:string', 'RelatedType:int', 'Score:float'
        ],
        'FieldOfStudyChildren':
        ['FieldOfStudyId:long', 'ChildFieldOfStudyId:long'],
        'FieldOfStudyExtendedAttributes':
        ['FieldOfStudyId:long', 'AttributeType:int', 'AttributeValue:string'],
        'FieldsOfStudy': [
            'FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'MainType:string', 'Level:int',
            'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long',
            'CreatedDate:DateTime'
        ],
        'Journals': [
            'JournalId:long', 'Rank:uint', 'NormalizedName:string',
            'DisplayName:string', 'Issn:string', 'Publisher:string',
            'Webpage:string', 'PaperCount:long', 'PaperFamilyCount:long',
            'CitationCount:long', 'CreatedDate:DateTime'
        ],
        'PaperAbstractsInvertedIndex':
        ['PaperId:long', 'IndexedAbstract:string'],
        'PaperAuthorAffiliations': [
            'PaperId:long', 'AuthorId:long', 'AffiliationId:long?',
            'AuthorSequenceNumber:uint', 'OriginalAuthor:string',
            'OriginalAffiliation:string'
        ],
        'PaperCitationContexts':
        ['PaperId:long', 'PaperReferenceId:long', 'CitationContext:string'],
        'PaperExtendedAttributes':
        ['PaperId:long', 'AttributeType:int', 'AttributeValue:string'],
        'PaperFieldsOfStudy':
        ['PaperId:long', 'FieldOfStudyId:long', 'Score:float'],
        'PaperRecommendations':
        ['PaperId:long', 'RecommendedPaperId:long', 'Score:float'],
        'PaperReferences': ['PaperId:long', 'PaperReferenceId:long'],
        'PaperResources': [
            'PaperId:long', 'ResourceType:int', 'ResourceUrl:string',
            'SourceUrl:string', 'RelationshipType:int'
        ],
        'PaperUrls': [
            'PaperId:long', 'SourceType:int?', 'SourceUrl:string',
            'LanguageCode:string'
        ],
        'Papers': [
            'PaperId:long', 'Rank:uint', 'Doi:string', 'DocType:string',
            'PaperTitle:string', 'OriginalTitle:string', 'BookTitle:string',
            'Year:int?', 'Date:DateTime?', 'OnlineDate:DateTime?',
            'Publisher:string', 'JournalId:long?', 'ConferenceSeriesId:long?',
            'ConferenceInstanceId:long?', 'Volume:string', 'Issue:string',
            'FirstPage:string', 'LastPage:string', 'ReferenceCount:long',
            'CitationCount:long', 'EstimatedCitation:long',
            'OriginalVenue:string', 'FamilyId:long?', 'CreatedDate:DateTime'
        ],
        'RelatedFieldOfStudy': [
            'FieldOfStudyId1:long', 'Type1:string', 'FieldOfStudyId2:long',
            'Type2:string', 'Rank:float'
        ],
    }
Example #19
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
Example #20
 (
     cudf.Series([1, 2, None, 3], dtype="uint8"),
     pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
 ),
 (
     cudf.Series([23, None, None, 32], dtype="uint16"),
     pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
 ),
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series([234, 2323, 23432, None, None, 224],
               dtype=pd.UInt64Dtype()),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series([11, None, 22, 33, None, 2, None, 3],
               dtype=pd.Int32Dtype()),
 ),
 (
Example #21
def generate_problems_df(runs_df,
                         probe_runs_df=None,
                         problem_paths=None,
                         problem_base_path=None,
                         custom_runs_df=None):
    if problem_paths is None:
        problem_paths = runs_df.problem_path
    elif problem_base_path is not None:
        problem_paths = [
            os.path.relpath(problem_path, problem_base_path)
            for problem_path in problem_paths
        ]
    problem_paths = pd.Index(problem_paths).drop_duplicates()
    problems_df = pd.DataFrame(index=problem_paths)
    problems_df.index.name = 'problem_path'
    # Merge probe run results into `problems_df`
    if probe_runs_df is not None:
        problems_df = problems_df.join(probe_runs_df[[
            'problem_path', 'predicates_count', 'functions_count',
            'clauses_count'
        ]].drop_duplicates('problem_path').set_index('problem_path'),
                                       rsuffix='probe')
    # Random solve run stats
    problem_groups = runs_df.groupby(['problem_path'])
    problems_df = problems_df.join(problem_groups.size().astype(
        pd.UInt64Dtype()).to_frame('n_total'))
    problems_df = problems_df.join(
        runs_df[runs_df.status == 'completed'].groupby([
            'problem_path'
        ]).size().astype(pd.UInt64Dtype()).to_frame('n_completed'))
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).size().astype(pd.UInt64Dtype()).to_frame('n_exit_0'))
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 1].groupby(
        ['problem_path']).size().astype(pd.UInt64Dtype()).to_frame('n_exit_1'))
    if 'termination_reason' in runs_df:
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Refutation'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_refutation'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Satisfiable'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_satisfiable'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Time limit'].groupby([
                'problem_path'
            ]).size().astype(pd.UInt64Dtype()).to_frame('n_time_limit'))
    problems_df.fillna(
        {
            'n_total': 0,
            'n_completed': 0,
            'n_exit_0': 0,
            'n_exit_1': 0,
            'n_refutation': 0,
            'n_satisfiable': 0,
            'n_time_limit': 0
        },
        inplace=True)

    def variation(a):
        if (a == 0).all():
            # We need to handle this special case explicitly because `scipy.stats.variation` raises an exception on it.
            return 0
        res = scipy.stats.variation(a.astype(float), nan_policy='omit')
        if isinstance(res, np.ma.core.MaskedConstant):
            # The input array contains all nans.
            return np.nan
        return res

    agg_functions = [np.mean, np.std, variation, np.min, np.max]
    # Aggregate time measurements across successful runs
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).agg({
            field_name: agg_functions
            for field_name in [
                'time_elapsed_process', 'time_elapsed_vampire',
                'saturation_iterations'
            ]
        }))
    # Count unique numbers of saturation iterations across successful runs
    problems_df = problems_df.join(runs_df[runs_df.exit_code == 0].groupby(
        ['problem_path']).agg({
            'saturation_iterations': ['nunique']
        }).astype(pd.UInt64Dtype()))
    # Aggregate memory across all runs
    problems_df = problems_df.join(
        runs_df.groupby(['problem_path']).agg({'memory_used': agg_functions}))
    if custom_runs_df is not None:
        for name, value in custom_runs_df.groupby(['name']):
            value = value.set_index('problem_path')
            value = value[['exit_code', 'saturation_iterations']]
            # https://stackoverflow.com/a/40225796/4054250
            value.columns = pd.MultiIndex.from_product([[name], value.columns])
            problems_df = problems_df.join(value, rsuffix=name)
    problems_df.sort_index(inplace=True)
    return problems_df
Example #22
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
Example #23
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}
nullable = {
    np.dtype('int8'): pd.Int8Dtype(),
    np.dtype('int16'): pd.Int16Dtype(),
    np.dtype('int32'): pd.Int32Dtype(),
    np.dtype('int64'): pd.Int64Dtype(),
    np.dtype('uint8'): pd.UInt8Dtype(),
    np.dtype('uint16'): pd.UInt16Dtype(),
    np.dtype('uint32'): pd.UInt32Dtype(),
    np.dtype('uint64'): pd.UInt64Dtype(),
    np.dtype('bool'): pd.BooleanDtype()
}
pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
    "UInt16": pd.UInt16Dtype(),
    "UInt32": pd.UInt32Dtype(),
    "UInt64": pd.UInt64Dtype(),
    "boolean": pd.BooleanDtype()
}

Example #24
def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
    pyarrow_data = pa.array(data, **pyarrow_kwargs)
    cudf_from_pyarrow = as_column(pyarrow_data)
    expected = as_column(data, **cudf_kwargs)
    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]
Example #25
 (
     cudf.Series([1, 2, None, 3], dtype="uint8"),
     pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
 ),
 (
     cudf.Series([23, None, None, 32], dtype="uint16"),
     pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
 ),
 (
     cudf.Series([None, 123, None, 1], dtype="uint32"),
     pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
 ),
 (
     cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
     pd.Series(
         [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()
     ),
 ),
 (
     cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
     pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
 ),
 (
     cudf.Series([111, None, 222, None, 13], dtype="int16"),
     pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
 ),
 (
     cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
     pd.Series(
         [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()
     ),
Example #26
import datetime

import numpy as np
import pandas as pd

from .librdata import Writer
from .custom_errors import PyreadrError


# configuration

int_types = {np.dtype('int32'), np.dtype('int16'), np.dtype('int8'), np.dtype('uint8'), np.dtype('uint16'),
             np.int32, np.int16, np.int8, np.uint8, np.uint16}
int_mixed_types = {pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype()}
float_types = {np.dtype('int64'), np.dtype('uint64'), np.dtype('uint32'), np.dtype('float'),
               np.int64, np.uint64, np.uint32, float, pd.Int64Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()}
datetime_types = {datetime.datetime, np.datetime64}

pyreadr_to_librdata_types = {"INTEGER": "INTEGER", "NUMERIC": "NUMERIC",
                        "LOGICAL": "LOGICAL", "CHARACTER": "CHARACTER",
                        "OBJECT": "CHARACTER", "DATE": "CHARACTER",
                        "DATETIME":"CHARACTER"}
                        
librdata_min_integer = -2147483648


def get_pyreadr_column_types(df):
    """
    From a pandas data frame, get an OrderedDict with column name as key
    and pyreadr column type as value, plus a list of booleans indicating
    whether each column has missing values (np.nan).
Example #27
    type = pd.Int8Dtype()
    bit_width: int = 8


###############################################################################
# unsigned integer
###############################################################################

_register_numpy_numbers(
    builtin_name="uint",
    pandera_name="UInt",
    sizes=[64, 32, 16, 8],
)


@Engine.register_dtype(equivalents=[pd.UInt64Dtype, pd.UInt64Dtype()])
@immutable
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64


@Engine.register_dtype(equivalents=[pd.UInt32Dtype, pd.UInt32Dtype()])
@immutable
class UINT32(UINT64):
    """Semantic representation of a :class:`pandas.UInt32Dtype`."""

    type = pd.UInt32Dtype()
    bit_width: int = 32
Example #28
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64
Example #29
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)

    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
Example #30
    "int8": "Int8",
    "datetime64[D]": "Date",
    "datetime64[ns]": "DateTime",
}

PD2CH = keymap(np.dtype, MAPPING)

PD_INT_TYPES = [
    pd.Int8Dtype(),
    pd.Int16Dtype(),
    pd.Int32Dtype(),
    pd.Int64Dtype(),
    pd.UInt8Dtype(),
    pd.UInt16Dtype(),
    pd.UInt32Dtype(),
    pd.UInt64Dtype(),
]

for typ in PD_INT_TYPES:
    PD2CH[typ] = f"Nullable({typ.name})"

CH2PD = itemmap(reversed, MAPPING)
CH2PD["Null"] = "object"
CH2PD["Nothing"] = "object"

NULLABLE_COLS = [
    "Float64",
    "Float32",
    "String",
]