def test_df2wav(self):
    sid = get_sid(pal=True)
    test_wav = os.path.join(self.tmpdir.name, 'test.wav')
    transformer = sox.Transformer()
    for i in range(1, 5):
        test_raw_freq = int(i * 2048)
        test_real_freq = sid.real_sid_freq(test_raw_freq)
        df = pd.DataFrame(
            [{'hashid': 1, 'count': 1, 'clock': 0, 'freq1': test_raw_freq,
              'sus1': 15, 'gate1': 1, 'tri1': 1, 'vol': 15},
             {'hashid': 1, 'count': 1, 'clock': 1e6 * 10, 'gate1': 0}],
            dtype=pd.UInt64Dtype()).set_index('clock')
        df = df.fillna(method='ffill').astype(pd.UInt64Dtype())
        write_wav(test_wav, sid, state2samples(df, sid))
        power_df = pd.DataFrame(
            transformer.power_spectrum(test_wav), columns=['freq', 'val'])
        val_max = power_df['val'].max()
        freq_max = power_df[power_df['val'] == val_max].iloc[0]['freq']
        freq_diff = abs(freq_max - test_real_freq)
        self.assertLessEqual(freq_diff, 3)
def test_reductions_2d_axis0(self, data, method, request):
    if not hasattr(data, method):
        pytest.skip("test is not applicable for this type/dtype")

    arr2d = data.reshape(1, -1)

    kwargs = {}
    if method == "std":
        # pass ddof=0 so we get all-zero std instead of all-NA std
        kwargs["ddof"] = 0

    try:
        result = getattr(arr2d, method)(axis=0, **kwargs)
    except Exception as err:
        try:
            getattr(data, method)()
        except Exception as err2:
            assert type(err) == type(err2)
            return
        else:
            raise AssertionError("Both reductions should raise or neither")

    if method in ["mean", "median", "sum", "prod"]:
        # std and var are not dtype-preserving
        expected = data
        if method in ["sum", "prod"] and data.dtype.kind in "iub":
            # FIXME: kludge
            if data.dtype.kind in ["i", "b"]:
                if is_platform_windows() or not IS64:
                    # FIXME: kludge for 32bit builds
                    if result.dtype.itemsize == 4:
                        dtype = pd.Int32Dtype()
                    else:
                        dtype = pd.Int64Dtype()
                else:
                    dtype = pd.Int64Dtype()
            elif data.dtype.kind == "u":
                if is_platform_windows() or not IS64:
                    # FIXME: kludge for 32bit builds
                    if result.dtype.itemsize == 4:
                        dtype = pd.UInt32Dtype()
                    else:
                        dtype = pd.UInt64Dtype()
                else:
                    dtype = pd.UInt64Dtype()

            expected = data.astype(dtype)
            if data.dtype.kind == "b" and method in ["sum", "prod"]:
                # We get IntegerArray instead of BooleanArray
                pass
            else:
                assert type(expected) == type(data), type(expected)
            assert dtype == expected.dtype

        self.assert_extension_array_equal(result, expected)
    elif method == "std":
        self.assert_extension_array_equal(result, data - data)
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
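A usage sketch (not from the source): pyarrow's `Table.to_pandas` accepts a `types_mapper` callable with exactly this signature, so a converter like the one above can drive nullable-dtype conversion directly. Column names and data here are illustrative.

import pyarrow as pa

table = pa.table({"a": pa.array([1, None, 3], type=pa.int32()),
                  "b": pa.array([True, False, None])})
# to_pandas() consults types_mapper per Arrow type; a None result
# falls back to the default (numpy) conversion.
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)
print(df.dtypes)  # a -> Int32, b -> boolean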
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
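A hypothetical usage sketch of the mapping above; `IntegerType` is the project's own enum, and the column name is made up here.

mapping = integer_type_mapping(use_extension_types=True)
df = pd.DataFrame({"n": pd.array([1, None, 3], dtype=pd.Int64Dtype())})
# INT24 has no native pandas width, so it lands in the next size up.
df["n"] = df["n"].astype(mapping[IntegerType.INT24])
print(df["n"].dtype)  # Int32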
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
def __init__(self, pandas_obj):
    # validate and assign object
    self._validate(pandas_obj)
    self._obj = pandas_obj

    # define incorporated modules - columns consisting of others will not have the dtype changed
    self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

    # define a possible list of null values
    self._NULL_VALS = [
        None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
        '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
        'UNKNOWN', 'UNK'
    ]

    # assign dtypes and limits
    # boolean
    BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
    BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
    # dict.update() returns None, so build the mapping first and update in place
    self._BOOL_MAP_DICT = {i: True for i in BOOL_STRINGS_TRUE}
    self._BOOL_MAP_DICT.update({i: False for i in BOOL_STRINGS_FALSE})
    self._DTYPE_BOOL_BASE = np.bool_  # np.bool is a removed deprecated alias
    self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
    # unsigned integers - base and nullable
    self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
    self._DTYPES_UINT_NULLABLE = [
        pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()
    ]
    self._LIMIT_LOW_UINT = [np.iinfo(i).min for i in self._DTYPES_UINT_BASE]
    self._LIMIT_HIGH_UINT = [np.iinfo(i).max for i in self._DTYPES_UINT_BASE]
    # signed integers - base and nullable
    self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
    self._DTYPES_INT_NULLABLE = [
        pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype()
    ]
    self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
    self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
    # floats - nullable by default
    self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
    # datetime - nullable by default
    self._DTYPE_DATETIME = np.datetime64
    # string
    self._DTYPE_STRING = pd.StringDtype()
    # categorical - nullable by default
    self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
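A minimal sketch, assuming the parallel limit/dtype lists above are meant to drive downcasting: pick the narrowest nullable unsigned dtype whose range covers a column. The function name and its `acc` parameter (the accessor instance) are made up for illustration.

def smallest_uint_dtype(acc, col):
    # col is a pandas Series of non-negative integers
    lo, hi = col.min(), col.max()
    for low, high, dtype in zip(acc._LIMIT_LOW_UINT, acc._LIMIT_HIGH_UINT,
                                acc._DTYPES_UINT_NULLABLE):
        if lo >= low and hi <= high:
            return dtype  # lists are ordered narrowest first
    raise ValueError("column does not fit any unsigned integer dtype")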
def test_notest_ssf(self):
    df = pd.DataFrame(
        [{'hashid': 1, 'count': 1, 'clock': 0, 'freq1': 1024, 'pwduty1': 0,
          'atk1': 0, 'dec1': 0, 'sus1': 15, 'rel1': 0, 'gate1': 1,
          'sync1': 0, 'ring1': 0, 'test1': 0, 'tri1': 1, 'saw1': 0,
          'pulse1': 0, 'noise1': 0, 'flt1': 0, 'fltres': 0, 'fltcoff': 0,
          'fltlo': 0, 'fltband': 0, 'flthi': 0, 'vol': 15},
         {'hashid': 1, 'count': 1, 'clock': 1e5, 'gate1': 0}],
        dtype=pd.UInt64Dtype())
    s = self._df2ssf(df, percussion=True)
    self.assertEqual(s.waveforms, {'tri'})
    self.assertEqual(s.midi_pitches, (35,))
    self.assertEqual(s.total_duration, 98525)
    self.assertEqual(s.midi_notes, ((0, 35, 98525, 127, 60.134765625),))
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
):
    kwargs["use_pandas_metadata"] = True

    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        if LooseVersion(self.api.__version__) >= "0.16":
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        else:
            raise ValueError(
                "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                f"({self.api.__version__} is installed)"
            )
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
def set_sid_dtype(df):
    # Default every column to the widest nullable type, then narrow below.
    # (Assigning `df.dtype = ...` on a DataFrame is a no-op attribute set,
    # so cast explicitly instead.)
    df = df.astype(pd.UInt64Dtype())
    for col in df.columns:
        if col.startswith('freq') or col.startswith('pwduty') or col == 'fltcoff':
            col_type = pd.UInt16Dtype()
        elif col[-1].isdigit() or col.startswith('flt'):
            col_type = pd.UInt8Dtype()
        else:
            continue
        df[col] = df[col].astype(col_type)
    return df
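A small usage sketch (illustrative data; column names follow the register-field convention above):

raw = pd.DataFrame({'freq1': [1024], 'gate1': [1], 'vol': [15]})
typed = set_sid_dtype(raw)
# freq1 -> UInt16, gate1 -> UInt8 (ends in a digit), vol -> UInt64
print(typed.dtypes)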
def test_numeric_dtypes(self):
    dtypes = [
        bool,
        np.byte, np.ubyte,
        np.short, np.ushort,
        np.single,
        np.int32, np.intc,
        np.half, np.float16,
        np.double, np.float64,
        pd.StringDtype(),
        pd.Int64Dtype(), pd.UInt64Dtype(),
        pd.Int32Dtype(), pd.UInt32Dtype(),
        pd.Int16Dtype(), pd.UInt16Dtype(),
        pd.Int8Dtype(), pd.UInt8Dtype(),
    ]
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".xml", "xml"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:
        with tmpfile(suffix) as path:
            for dtype in dtypes:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
) -> DataFrame:
    kwargs["use_pandas_metadata"] = True

    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        import pandas as pd

        mapping = {
            self.api.int8(): pd.Int8Dtype(),
            self.api.int16(): pd.Int16Dtype(),
            self.api.int32(): pd.Int32Dtype(),
            self.api.int64(): pd.Int64Dtype(),
            self.api.uint8(): pd.UInt8Dtype(),
            self.api.uint16(): pd.UInt16Dtype(),
            self.api.uint32(): pd.UInt32Dtype(),
            self.api.uint64(): pd.UInt64Dtype(),
            self.api.bool_(): pd.BooleanDtype(),
            self.api.string(): pd.StringDtype(),
            self.api.float32(): pd.Float32Dtype(),
            self.api.float64(): pd.Float64Dtype(),
        }
        to_pandas_kwargs["types_mapper"] = mapping.get
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
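An end-to-end sketch of what the mapping above achieves, using the public API of the pandas version this code belongs to (the file path is illustrative): with `use_nullable_dtypes=True`, integer columns with NULLs round-trip as nullable dtypes instead of being promoted to float64.

df = pd.DataFrame({"a": pd.array([1, None, 3], dtype=pd.UInt64Dtype())})
df.to_parquet("nullable.parquet")
back = pd.read_parquet("nullable.parquet", use_nullable_dtypes=True)
print(back["a"].dtype)  # UInt64, with pd.NA preserved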
def treenode_df(response: List[List[Any]]) -> pd.DataFrame:
    return lol_to_df(
        response,
        ["id", "parent", "user", "x", "y", "z", "radius", "confidence"],
        [
            np.uint64,
            pd.UInt64Dtype(),  # nullable: the root treenode has no parent
            np.uint64,
            np.float64,
            np.float64,
            np.float64,
            np.float64,
            np.uint8,
        ],
    )
def test_numeric_nullable_dtypes(self):
    dtypes = [
        pd.StringDtype(),
        pd.BooleanDtype(),
        pd.Float64Dtype(),
        pd.Float32Dtype(),
        pd.Int64Dtype(), pd.UInt64Dtype(),
        pd.Int32Dtype(), pd.UInt32Dtype(),
        pd.Int16Dtype(), pd.UInt16Dtype(),
        pd.Int8Dtype(), pd.UInt8Dtype(),
    ]
    # TODO: Re-add (".xml", "xml")
    # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:  # TODO: include xml
        for dtype in dtypes:
            with tmpfile(suffix) as path:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2_pd_na())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(
            required=True, unique=None, min_=min_, max_=max_)
        yield (with_nullable(True, check_dtype), "integer", constraints,
               expected_pandas_type)
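A minimal sketch (an assumption, not the library's implementation) of the selection rule these expectations encode: choose the narrowest nullable dtype whose range covers [min_, max_], falling back to a float for ranges wider than 64 bits.

def nullable_int_dtype(min_, max_):
    if min_ is None or max_ is None:
        return pd.Int64Dtype()  # unknown range: safe wide default
    signed = [pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype()]
    unsigned = [pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()]
    for dtype in (unsigned if min_ >= 0 else signed):
        info = np.iinfo(dtype.numpy_dtype)
        if info.min <= min_ and max_ <= info.max:
            return dtype
    return np.float128  # wider than any 64-bit integer

assert nullable_int_dtype(0, 240) == pd.UInt8Dtype()
assert nullable_int_dtype(-100, 100) == pd.Int8Dtype()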
revmap = {
    parquet_thrift.Type.INT32: np.int32,
    parquet_thrift.Type.INT64: np.int64,
    parquet_thrift.Type.FLOAT: np.float32,
    parquet_thrift.Type.DOUBLE: np.float64,
}

pdoptional_to_numpy_typemap = {
    pd.Int8Dtype(): np.int8,
    pd.Int16Dtype(): np.int16,
    pd.Int32Dtype(): np.int32,
    pd.Int64Dtype(): np.int64,
    pd.UInt8Dtype(): np.uint8,
    pd.UInt16Dtype(): np.uint16,
    pd.UInt32Dtype(): np.uint32,
    pd.UInt64Dtype(): np.uint64,
    pd.BooleanDtype(): np.bool_,  # np.bool is a removed deprecated alias
}


def find_type(data, fixed_text=None, object_encoding=None, times='int64'):
    """Get appropriate typecodes for column dtype

    Data conversion does not happen here; see convert(). The user is expected
    to transform their data into the appropriate dtype before saving to
    parquet; we will not make any assumptions for them.

    Known types that cannot be represented (must first be converted to
    another type or to raw binary): float128, complex
from dask_sql.java import SqlTypeName

# Default mapping between python types and SQL types
_PYTHON_TO_SQL = {
    np.float64: SqlTypeName.DOUBLE,
    np.float32: SqlTypeName.FLOAT,
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool_: SqlTypeName.BOOLEAN,  # np.bool8 is a deprecated alias of np.bool_
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}

# Default mapping between SQL types and python types
# for values
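A hedged lookup sketch (the helper name is made up here, though dask_sql exposes a similar resolver): extension-dtype instances hash consistently, so they work as dict keys alongside numpy scalar types.

def lookup_sql_type(dtype):
    try:
        return _PYTHON_TO_SQL[dtype]
    except KeyError:
        raise NotImplementedError(f"No SQL type registered for {dtype}")

print(lookup_sql_type(pd.UInt64Dtype()))  # SqlTypeName.BIGINT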
class MicrosoftAcademicGraph:
    # constructor
    def __init__(self, root):
        self.root = root

    # return stream path
    def get_full_path(self, stream_name):
        return self.root + stream_name + ".txt"

    # return stream header
    def get_header(self, stream_name):
        return self.streams[stream_name]

    # return stream types and columns with date
    def get_type(self, stream_name):
        date_columns = []
        schema = {}
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            nullable = fieldtype.endswith("?")
            if nullable:
                fieldtype = fieldtype[:-1]
            if fieldtype == "DateTime":
                date_columns.append(fieldname)
            schema[fieldname] = self.datatypedict[fieldtype]
        return schema, date_columns

    # return stream columns names
    def get_name(self, stream_name):
        names = []
        for field in self.streams[stream_name]:
            fieldname, fieldtype = field.split(":")
            names.append(fieldname)
        return names

    # return stream Pandas dataFrame
    def get_data_frame(self, stream_name):
        column_name = self.get_name(stream_name)
        column_type, date_columns = self.get_type(stream_name)
        return pd.read_csv(
            filepath_or_buffer=self.get_full_path(stream_name),
            parse_dates=date_columns,
            low_memory=False,
            names=column_name,
            dtype=column_type,
            date_parser=self.date_parse_func,
            sep="\t",
        )

    # date parse function
    date_parse_func = lambda self, c: pd.to_datetime(
        c, format="%m/%d/%Y %H:%M:%S %p", errors="coerce"
    )  # 6/24/2016 12:00:00 AM

    # convert input datatype to Pandas datatype
    datatypedict = {
        "int": pd.Int32Dtype(),
        "uint": pd.UInt32Dtype(),
        "long": pd.Int64Dtype(),
        "ulong": pd.UInt64Dtype(),
        "float": np.float32,
        "string": np.string_,
        "DateTime": np.string_,
    }

    # define stream dictionary
    streams = {
        "Affiliations": [
            "AffiliationId:long", "Rank:uint", "NormalizedName:string",
            "DisplayName:string", "GridId:string", "OfficialPage:string",
            "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long",
            "CitationCount:long", "Latitude:float?", "Longitude:float?",
            "CreatedDate:DateTime",
        ],
        "Authors": [
            "AuthorId:long", "Rank:uint", "NormalizedName:string",
            "DisplayName:string", "LastKnownAffiliationId:long?",
            "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "ConferenceInstances": [
            "ConferenceInstanceId:long", "NormalizedName:string",
            "DisplayName:string", "ConferenceSeriesId:long", "Location:string",
            "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?",
            "AbstractRegistrationDate:DateTime?",
            "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?",
            "FinalVersionDueDate:DateTime?", "PaperCount:long",
            "PaperFamilyCount:long", "CitationCount:long", "Latitude:float?",
            "Longitude:float?", "CreatedDate:DateTime",
        ],
        "ConferenceSeries": [
            "ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string",
            "DisplayName:string", "PaperCount:long", "PaperFamilyCount:long",
            "CitationCount:long", "CreatedDate:DateTime",
        ],
        "EntityRelatedEntities": [
            "EntityId:long", "EntityType:string", "RelatedEntityId:long",
            "RelatedEntityType:string", "RelatedType:int", "Score:float",
        ],
        "FieldOfStudyChildren": ["FieldOfStudyId:long", "ChildFieldOfStudyId:long"],
        "FieldOfStudyExtendedAttributes": [
            "FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string",
        ],
        "FieldsOfStudy": [
            "FieldOfStudyId:long", "Rank:uint", "NormalizedName:string",
            "DisplayName:string", "MainType:string", "Level:int",
            "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long",
            "CreatedDate:DateTime",
        ],
        "Journals": [
            "JournalId:long", "Rank:uint", "NormalizedName:string",
            "DisplayName:string", "Issn:string", "Publisher:string",
            "Webpage:string", "PaperCount:long", "PaperFamilyCount:long",
            "CitationCount:long", "CreatedDate:DateTime",
        ],
        "PaperAbstractsInvertedIndex": ["PaperId:long", "IndexedAbstract:string"],
        "PaperAuthorAffiliations": [
            "PaperId:long", "AuthorId:long", "AffiliationId:long?",
            "AuthorSequenceNumber:uint", "OriginalAuthor:string",
            "OriginalAffiliation:string",
        ],
        "PaperCitationContexts": [
            "PaperId:long", "PaperReferenceId:long", "CitationContext:string",
        ],
        "PaperExtendedAttributes": [
            "PaperId:long", "AttributeType:int", "AttributeValue:string",
        ],
        "PaperFieldsOfStudy": ["PaperId:long", "FieldOfStudyId:long", "Score:float"],
        "PaperRecommendations": [
            "PaperId:long", "RecommendedPaperId:long", "Score:float",
        ],
        "PaperReferences": ["PaperId:long", "PaperReferenceId:long"],
        "PaperResources": [
            "PaperId:long", "ResourceType:int", "ResourceUrl:string",
            "SourceUrl:string", "RelationshipType:int",
        ],
        "PaperUrls": [
            "PaperId:long", "SourceType:int?", "SourceUrl:string",
            "LanguageCode:string",
        ],
        "Papers": [
            "PaperId:long", "Rank:uint", "Doi:string", "DocType:string",
            "PaperTitle:string", "OriginalTitle:string", "BookTitle:string",
            "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?",
            "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?",
            "ConferenceInstanceId:long?", "Volume:string", "Issue:string",
            "FirstPage:string", "LastPage:string", "ReferenceCount:long",
            "CitationCount:long", "EstimatedCitation:long",
            "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime",
        ],
        "RelatedFieldOfStudy": [
            "FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long",
            "Type2:string", "Rank:float",
        ],
    }
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    (
        cudf.Series([1, 2, None, 3], dtype="uint8"),
        pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()),
    ),
    (
        cudf.Series([23, None, None, 32], dtype="uint16"),
        pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
    ),
    (
        cudf.Series([None, 123, None, 1], dtype="uint32"),
        pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
    ),
    (
        cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
        pd.Series([234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()),
    ),
    (
        cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
        pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
    ),
    (
        cudf.Series([111, None, 222, None, 13], dtype="int16"),
        pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
    ),
    (
        cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
        pd.Series([11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype()),
    ),
    (
def generate_problems_df(runs_df,
                         probe_runs_df=None,
                         problem_paths=None,
                         problem_base_path=None,
                         custom_runs_df=None):
    if problem_paths is None:
        problem_paths = runs_df.problem_path
    elif problem_base_path is not None:
        problem_paths = [
            os.path.relpath(problem_path, problem_base_path)
            for problem_path in problem_paths
        ]
    problem_paths = pd.Index(problem_paths).drop_duplicates()
    problems_df = pd.DataFrame(index=problem_paths)
    problems_df.index.name = 'problem_path'
    # Merge probe run results into `problems_df`
    if probe_runs_df is not None:
        problems_df = problems_df.join(
            probe_runs_df[[
                'problem_path', 'predicates_count', 'functions_count',
                'clauses_count'
            ]].drop_duplicates('problem_path').set_index('problem_path'),
            rsuffix='probe')
    # Random solve run stats
    problem_groups = runs_df.groupby(['problem_path'])
    problems_df = problems_df.join(
        problem_groups.size().astype(pd.UInt64Dtype()).to_frame('n_total'))
    problems_df = problems_df.join(
        runs_df[runs_df.status == 'completed'].groupby(['problem_path'])
        .size().astype(pd.UInt64Dtype()).to_frame('n_completed'))
    problems_df = problems_df.join(
        runs_df[runs_df.exit_code == 0].groupby(['problem_path'])
        .size().astype(pd.UInt64Dtype()).to_frame('n_exit_0'))
    problems_df = problems_df.join(
        runs_df[runs_df.exit_code == 1].groupby(['problem_path'])
        .size().astype(pd.UInt64Dtype()).to_frame('n_exit_1'))
    if 'termination_reason' in runs_df:
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Refutation']
            .groupby(['problem_path'])
            .size().astype(pd.UInt64Dtype()).to_frame('n_refutation'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Satisfiable']
            .groupby(['problem_path'])
            .size().astype(pd.UInt64Dtype()).to_frame('n_satisfiable'))
        problems_df = problems_df.join(
            runs_df[runs_df.termination_reason == 'Time limit']
            .groupby(['problem_path'])
            .size().astype(pd.UInt64Dtype()).to_frame('n_time_limit'))
    problems_df.fillna(
        {
            'n_total': 0,
            'n_completed': 0,
            'n_exit_0': 0,
            'n_exit_1': 0,
            'n_refutation': 0,
            'n_satisfiable': 0,
            'n_time_limit': 0
        },
        inplace=True)

    def variation(a):
        if (a == 0).all():
            # We need to handle this special case explicitly because
            # `scipy.stats.variation` raises an exception on it.
            return 0
        # `np.float` is a removed deprecated alias of the builtin float.
        res = scipy.stats.variation(a.astype(float), nan_policy='omit')
        if isinstance(res, np.ma.core.MaskedConstant):
            # The input array contains all nans.
            return np.nan
        return res

    agg_functions = [np.mean, np.std, variation, np.min, np.max]
    # Aggregate time measurements across successful runs
    problems_df = problems_df.join(
        runs_df[runs_df.exit_code == 0].groupby(['problem_path']).agg({
            field_name: agg_functions
            for field_name in [
                'time_elapsed_process', 'time_elapsed_vampire',
                'saturation_iterations'
            ]
        }))
    # Count unique numbers of saturation iterations across successful runs
    problems_df = problems_df.join(
        runs_df[runs_df.exit_code == 0].groupby(['problem_path']).agg({
            'saturation_iterations': ['nunique']
        }).astype(pd.UInt64Dtype()))
    # Aggregate memory across all runs
    problems_df = problems_df.join(
        runs_df.groupby(['problem_path']).agg({'memory_used': agg_functions}))
    if custom_runs_df is not None:
        for name, value in custom_runs_df.groupby(['name']):
            value = value.set_index('problem_path')
            value = value[['exit_code', 'saturation_iterations']]
            # https://stackoverflow.com/a/40225796/4054250
            value.columns = pd.MultiIndex.from_product([[name], value.columns])
            problems_df = problems_df.join(value, rsuffix=name)
    problems_df.sort_index(inplace=True)
    return problems_df
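The recurring count pattern above, in isolation (toy data, illustrative column names): a groupby size yields int64, and casting to nullable UInt64 keeps the count column integer-typed even after later joins introduce missing rows.

runs = pd.DataFrame({'problem_path': ['a', 'a', 'b'], 'exit_code': [0, 1, 0]})
n_exit_0 = (runs[runs.exit_code == 0]
            .groupby('problem_path').size()
            .astype(pd.UInt64Dtype()).to_frame('n_exit_0'))
print(n_exit_0['n_exit_0'].dtype)  # UInt64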
    np.int8: pa.int8(),
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}

nullable = {
    np.dtype('int8'): pd.Int8Dtype(),
    np.dtype('int16'): pd.Int16Dtype(),
    np.dtype('int32'): pd.Int32Dtype(),
    np.dtype('int64'): pd.Int64Dtype(),
    np.dtype('uint8'): pd.UInt8Dtype(),
    np.dtype('uint16'): pd.UInt16Dtype(),
    np.dtype('uint32'): pd.UInt32Dtype(),
    np.dtype('uint64'): pd.UInt64Dtype(),
    np.dtype('bool'): pd.BooleanDtype()
}

pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
    "UInt16": pd.UInt16Dtype(),
    "UInt32": pd.UInt32Dtype(),
    "UInt64": pd.UInt64Dtype(),
    "boolean": pd.BooleanDtype()
}
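A small usage sketch (illustrative data, not from the source): the `nullable` mapping above lets a plain numpy column be upgraded to its masked extension counterpart.

df = pd.DataFrame({'a': np.array([1, 2], dtype='int32')})
df['a'] = df['a'].astype(nullable[np.dtype('int32')])
print(df['a'].dtype)  # Int32, which can now hold pd.NA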
def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
    pyarrow_data = pa.array(data, **pyarrow_kwargs)
    cudf_from_pyarrow = as_column(pyarrow_data)
    expected = as_column(data, **cudf_kwargs)
    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]
import datetime

import numpy as np
import pandas as pd

from .librdata import Writer
from .custom_errors import PyreadrError

# configuration

int_types = {np.dtype('int32'), np.dtype('int16'), np.dtype('int8'),
             np.dtype('uint8'), np.dtype('uint16'),
             np.int32, np.int16, np.int8, np.uint8, np.uint16}
int_mixed_types = {pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(),
                   pd.UInt8Dtype(), pd.UInt16Dtype()}
# np.float was a removed deprecated alias of the builtin float
float_types = {np.dtype('int64'), np.dtype('uint64'), np.dtype('uint32'),
               np.dtype('float'), np.int64, np.uint64, np.uint32, float,
               pd.Int64Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()}
datetime_types = {datetime.datetime, np.datetime64}

pyreadr_to_librdata_types = {"INTEGER": "INTEGER", "NUMERIC": "NUMERIC",
                             "LOGICAL": "LOGICAL", "CHARACTER": "CHARACTER",
                             "OBJECT": "CHARACTER", "DATE": "CHARACTER",
                             "DATETIME": "CHARACTER"}

librdata_min_integer = -2147483648


def get_pyreadr_column_types(df):
    """
    From a pandas data frame, get an OrderedDict with column name as key and
    pyreadr column type as value, and also a list with boolean values
    indicating if the column has missing values (np.nan).
    type = pd.Int8Dtype()
    bit_width: int = 8


###############################################################################
# unsigned integer
###############################################################################

_register_numpy_numbers(
    builtin_name="uint",
    pandera_name="UInt",
    sizes=[64, 32, 16, 8],
)


@Engine.register_dtype(equivalents=[pd.UInt64Dtype, pd.UInt64Dtype()])
@immutable
class UINT64(DataType, dtypes.UInt):
    """Semantic representation of a :class:`pandas.UInt64Dtype`."""

    type = pd.UInt64Dtype()
    bit_width: int = 64


@Engine.register_dtype(equivalents=[pd.UInt32Dtype, pd.UInt32Dtype()])
@immutable
class UINT32(UINT64):
    """Semantic representation of a :class:`pandas.UInt32Dtype`."""

    type = pd.UInt32Dtype()
    bit_width: int = 32
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)
    dtypes_meta = []

    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
"int8": "Int8", "datetime64[D]": "Date", "datetime64[ns]": "DateTime", } PD2CH = keymap(np.dtype, MAPPING) PD_INT_TYPES = [ pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype(), ] for typ in PD_INT_TYPES: PD2CH[typ] = f"Nullable({typ.name})" CH2PD = itemmap(reversed, MAPPING) CH2PD["Null"] = "object" CH2PD["Nothing"] = "object" NULLABLE_COLS = [ "Float64", "Float32", "String", ]