def test_union_dtypes(left, right, expected):
    """Check the dtype produced by the union of two empty indexes.

    Parameters
    ----------
    left, right : dtype-like
        Dtype specifications for the two empty indexes; each is normalised
        with ``pandas_dtype`` before the index is built.
    expected : dtype-like
        The dtype the union is expected to have.
    """
    left = pandas_dtype(left)
    right = pandas_dtype(right)
    a = pd.Index([], dtype=left)
    b = pd.Index([], dtype=right)

    # Use the explicit set operation: newer pandas versions treat ``|`` on an
    # Index as an element-wise logical operation rather than a set union.
    result = a.union(b).dtype
    assert result == expected
def test_union_dtypes(left, right, expected, names):
    """Check dtype and name of set operations on two empty, named indexes.

    Parameters
    ----------
    left, right : dtype-like
        Dtype specifications for the two empty indexes.
    expected : dtype-like
        Dtype the union is expected to have.
    names : sequence
        ``names[0]`` and ``names[1]`` name the two inputs; ``names[2]`` is
        the name expected on the results.
    """
    left_dtype = pandas_dtype(left)
    right_dtype = pandas_dtype(right)
    first = Index([], dtype=left_dtype, name=names[0])
    second = Index([], dtype=right_dtype, name=names[1])
    expected_name = names[2]

    union = first.union(second)
    assert union.dtype == expected
    assert union.name == expected_name

    # Testing name retention
    # TODO: pin down desired dtype; do we want it to be commutative?
    intersection = first.intersection(second)
    assert intersection.name == expected_name
def astype(self, dtype, copy=True):
    """Cast this array to ``dtype``.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        When the target is ``ArrowStringDtype``, return a copy instead of
        ``self``. In the pandas fallback it is forwarded to
        ``Series.astype``.

    Returns
    -------
    ``self`` or a copy for ``ArrowStringDtype``; otherwise a newly built
    array holding the converted values.
    """
    target = pandas_dtype(dtype)

    if isinstance(target, ArrowStringDtype):
        return self.copy() if copy else self

    if pa is None or self._force_use_pandas:
        # pyarrow is unavailable (``pa`` is None) or the pandas path is
        # forced: convert through a plain pandas Series.
        if isinstance(target, ArrowDtype):
            target = target.type
        converted = pd.Series(self.to_numpy()).astype(target, copy=copy)
        return type(self)(converted)

    # Convert a slice of at most one record first to learn the result
    # array type, its NA value and its underlying NumPy dtype.
    probe = self._arrow_array.slice(0, 1).to_pandas().astype(target).array
    filler = np.full(self.shape, probe.dtype.na_value,
                     dtype=np.asarray(probe).dtype)
    out = type(probe)(filler)

    # Convert chunk by chunk, writing each piece into its slot of ``out``.
    offset = 0
    for chunk in self._arrow_array.chunks:
        stop = offset + len(chunk)
        out[offset:stop] = chunk.to_pandas().astype(target).array
        offset = stop
    return out
def astype(self, dtype, copy=True):
    """Cast this list array to ``dtype``.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        When the target equals the current dtype, return a copy instead
        of ``self``. Forwarded to the parent ``astype`` otherwise.

    Raises
    ------
    TypeError
        When the conversion fails (arrow cast errors and ``ValueError``
        are re-raised as ``TypeError``).
    """
    # The message is built from ``dtype`` exactly as passed in, before it
    # is normalised below.
    msg = f'cannot astype from {self.dtype} to {dtype}'
    target = pandas_dtype(dtype)

    if not isinstance(target, ArrowListDtype):
        try:
            return super().astype(target, copy=copy)
        except ValueError:
            raise TypeError(msg)

    if self.dtype == target:
        return self.copy() if copy else self

    if self._use_arrow:
        try:
            return ArrowListArray(self._arrow_array.cast(target.arrow_type))
        except (NotImplementedError, pa.ArrowInvalid):
            raise TypeError(msg)

    def cast_items(items):
        # Convert one list element-wise to the target value type.
        return pd.Series(items).astype(target.value_type.type).tolist()

    try:
        converted = pd.Series(self._ndarray).map(cast_items).to_numpy()
        # NOTE(review): the arrow path above returns an ArrowListArray but
        # this path wraps the lists in ArrowStringArray - confirm intended.
        return ArrowStringArray(converted)
    except ValueError:
        raise TypeError(msg)
def convert_type_to_pandas_dtype(
        type_: Union[type, Type],
        default_type: np.dtype = np.float64) -> np.dtype:
    """
    Map a native python type or typing annotation to a dtype.

    Parameters
    ----------
    type_ : Union[type, Type]
        the type to convert
    default_type : np.dtype, optional
        dtype returned when ``type_`` is ``Unknown``, by default np.float64

    Returns
    -------
    np.dtype
        ``default_type`` for ``Unknown``, ``pd.StringDtype()`` for ``str``
        subclasses, otherwise whatever ``pandas_dtype`` resolves, falling
        back to ``np.object_`` when it raises ``TypeError``
    """
    if type_ is Unknown:
        return default_type

    if isinstance(type_, type) and issubclass(type_, str):
        return pd.StringDtype()

    try:
        return pandas_dtype(type_)
    except TypeError:
        # assume it's an object type
        return np.object_
def is_categorical_dtype(obj):
    """Report whether ``obj`` is, or carries, a pandas CategoricalDtype.

    ``obj`` may be a pandas, numpy, or cuDF Column, Series, Index, array
    or dtype-like value. ``None`` is never categorical.
    """
    from cudf.dataframe import Series, Index
    from cudf.dataframe.column import Column
    from cudf.dataframe.index import CategoricalIndex
    from cudf.dataframe.categorical import CategoricalColumn

    if obj is None:
        return False

    # Direct markers: the categorical scalar type itself, the "category"
    # string alias, or an object whose ``.type`` is the categorical type.
    if (
        obj is CategoricalDtypeType
        or (isinstance(obj, str) and obj == "category")
        or (hasattr(obj, "type") and obj.type is CategoricalDtypeType)
    ):
        return True

    categorical_types = (
        CategoricalDtype,
        CategoricalIndex,
        CategoricalColumn,
        pd.Categorical,
        pd.CategoricalIndex,
    )
    if isinstance(obj, categorical_types):
        return True

    # Containers are judged by their dtype.
    dtype_carriers = (Index, Series, Column, pd.Index, pd.Series, np.ndarray)
    if isinstance(obj, dtype_carriers):
        return is_categorical_dtype(obj.dtype)

    return pandas_dtype(obj).type is CategoricalDtypeType
def astype(self, dtype, copy=True):
    """Cast this array to ``dtype``.

    For a ``DateDtype`` target the array itself is returned, or a copy
    when ``copy`` is true. Any other target is produced by ``to_numpy``,
    with ``dt.date.min`` passed as the ``na_value``.
    """
    target = pandas_dtype(dtype)

    if not isinstance(target, DateDtype):
        return self.to_numpy(dtype=target, copy=copy, na_value=dt.date.min)

    if copy:
        return self.copy()
    return self
def __init__(self, dtype):
    """Store the element (value) dtype of this list dtype.

    ``dtype`` may be another instance of this class (its value type is
    reused), a pyarrow ``DataType`` (converted with ``to_pandas_dtype``)
    or anything ``pandas_dtype`` accepts. String dtypes other than
    ``ArrowStringDtype`` are replaced by ``ArrowStringDtype()``.
    """
    value_type = dtype.value_type if isinstance(dtype, type(self)) else dtype

    if pa and isinstance(value_type, pa.DataType):
        value_type = value_type.to_pandas_dtype()

    value_type = pandas_dtype(value_type)

    is_plain_string = (
        is_string_dtype(value_type)
        and not isinstance(value_type, ArrowStringDtype)
    )
    if is_plain_string:
        # convert string dtype to arrow string dtype
        value_type = ArrowStringDtype()

    self._value_type = value_type
def test_astype(self):
    """Exercise ``astype`` on float indexes.

    Covers the object round-trip, float -> int, float -> float, and the
    targets that must raise (datetime-like dtypes and NA -> integer).
    """
    result = self.float.astype(object)
    assert result.equals(self.float)
    assert self.float.equals(result)
    self.check_is_index(result)

    i = self.mixed.copy()
    i.name = "foo"
    result = i.astype(object)
    assert result.equals(i)
    assert i.equals(result)
    self.check_is_index(result)

    # GH 12881
    # a float astype int
    for dtype in ["int16", "int32", "int64"]:
        i = Float64Index([0, 1, 2])
        result = i.astype(dtype)
        expected = Int64Index([0, 1, 2])
        tm.assert_index_equal(result, expected)

        i = Float64Index([0, 1.1, 2])
        result = i.astype(dtype)
        expected = Int64Index([0, 1, 2])
        tm.assert_index_equal(result, expected)

    for dtype in ["float32", "float64"]:
        i = Float64Index([0, 1, 2])
        result = i.astype(dtype)
        expected = i
        tm.assert_index_equal(result, expected)

        i = Float64Index([0, 1.1, 2])
        result = i.astype(dtype)
        expected = Index(i.values.astype(dtype))
        tm.assert_index_equal(result, expected)

    # invalid; ``i`` is the last index built by the loop above
    for dtype in ["M8[ns]", "m8[ns]"]:
        msg = (
            "Cannot convert Float64Index to dtype {}; integer values"
            " are required for conversion"
        ).format(pandas_dtype(dtype))
        with pytest.raises(TypeError, match=re.escape(msg)):
            i.astype(dtype)

    # GH 13149
    for dtype in ["int16", "int32", "int64"]:
        # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias was
        # removed in NumPy 2.0.
        i = Float64Index([0, 1.1, np.nan])
        msg = "Cannot convert NA to integer"
        with pytest.raises(ValueError, match=msg):
            i.astype(dtype)
def astype(self, dtype, copy=True):
    """Cast this ragged array to ``dtype``.

    A ``RaggedDtype`` target returns ``self`` (or a copy when ``copy`` is
    true). Another extension dtype is built from ``np.asarray(self)`` via
    that dtype's array type. Anything else becomes a NumPy array built
    from this array's elements.
    """
    target = pandas_dtype(dtype)

    if isinstance(target, RaggedDtype):
        return self.copy() if copy else self

    if is_extension_array_dtype(target):
        array_type = target.construct_array_type()
        return array_type._from_sequence(np.asarray(self))

    elements = [row for row in self]
    return np.array(elements, dtype=target, copy=copy)
def unconvert(values, dtype, compress=None):
    """Turn a serialized payload back into array data of ``dtype``.

    Parameters
    ----------
    values : object
        The serialized payload. An ``ExtType`` with ``code == 0`` is
        unwrapped to its ``.data``; otherwise element ``1`` of ``values``
        is used as the raw buffer (see the note below).
    dtype : dtype-like
        Target dtype. Categorical dtypes return ``values`` unchanged and
        object dtypes return an object ndarray; anything else is reduced
        to ``pandas_dtype(dtype).base``.
    compress : str, optional
        ``"zlib"`` or ``"blosc"`` when the buffer is compressed.

    Returns
    -------
    The payload itself (categorical) or a NumPy array.

    Raises
    ------
    ValueError
        If ``compress`` is truthy but is neither ``"zlib"`` nor ``"blosc"``.
    """
    as_is_ext = isinstance(values, ExtType) and values.code == 0
    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        # NOTE(review): takes element 1 of the payload; the layout of
        # ``values`` is not visible here - confirm against the encoder.
        # A former ``.encode("latin1")`` step is disabled.
        values = values[1]

    if compress:
        if compress == u"zlib":
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u"blosc":
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)), dtype=dtype
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    "copying data after decompressing; this may mean that"
                    " decompress is caching its result",
                    PerformanceWarning,
                )
                # fall through to the final `np.frombuffer` below

    # Interpret the raw buffer as an array of ``dtype``.
    # NOTE(review): np.frombuffer wraps the buffer rather than copying it,
    # so the older "copy the string" wording looks stale - confirm whether
    # a writable copy is expected by callers.
    return np.frombuffer(values, dtype=dtype)
def test_astype(self):
    """Exercise ``astype`` on float indexes.

    Covers the object round-trip, float -> int, float -> float, and the
    targets that must raise (datetime-like dtypes and NA -> integer).
    """
    result = self.float.astype(object)
    assert result.equals(self.float)
    assert self.float.equals(result)
    self.check_is_index(result)

    i = self.mixed.copy()
    i.name = 'foo'
    result = i.astype(object)
    assert result.equals(i)
    assert i.equals(result)
    self.check_is_index(result)

    # GH 12881
    # a float astype int
    for dtype in ['int16', 'int32', 'int64']:
        i = Float64Index([0, 1, 2])
        result = i.astype(dtype)
        expected = Int64Index([0, 1, 2])
        tm.assert_index_equal(result, expected)

        i = Float64Index([0, 1.1, 2])
        result = i.astype(dtype)
        expected = Int64Index([0, 1, 2])
        tm.assert_index_equal(result, expected)

    for dtype in ['float32', 'float64']:
        i = Float64Index([0, 1, 2])
        result = i.astype(dtype)
        expected = i
        tm.assert_index_equal(result, expected)

        i = Float64Index([0, 1.1, 2])
        result = i.astype(dtype)
        expected = Index(i.values.astype(dtype))
        tm.assert_index_equal(result, expected)

    # invalid; ``i`` is the last index built by the loop above
    for dtype in ['M8[ns]', 'm8[ns]']:
        msg = ("Cannot convert Float64Index to dtype {}; integer values"
               " are required for conversion").format(pandas_dtype(dtype))
        with pytest.raises(TypeError, match=re.escape(msg)):
            i.astype(dtype)

    # GH 13149
    for dtype in ['int16', 'int32', 'int64']:
        # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias was
        # removed in NumPy 2.0.
        i = Float64Index([0, 1.1, np.nan])
        msg = "Cannot convert NA to integer"
        with pytest.raises(ValueError, match=msg):
            i.astype(dtype)
def is_categorical_dtype(obj):
    """Report whether ``obj`` is, or carries, a categorical dtype.

    ``obj`` may be a pandas, numpy, or cuDF Column, Series, Index, array
    or dtype-like value. ``None``, NumPy dtypes and NumPy arrays are
    never categorical.
    """
    if obj is None:
        return False

    # cuDF's own categorical dtype, as an instance or as the class.
    if isinstance(obj, cudf.CategoricalDtype) or obj is cudf.CategoricalDtype:
        return True

    if isinstance(obj, np.dtype):
        return False

    # pandas categorical markers: dtype instance, dtype class, scalar
    # type, or the "category" string alias.
    if (
        isinstance(obj, CategoricalDtype)
        or obj is CategoricalDtype
        or obj is CategoricalDtypeType
        or (isinstance(obj, str) and obj == "category")
    ):
        return True

    categorical_types = (
        CategoricalDtype,
        cudf.core.index.CategoricalIndex,
        cudf.core.column.CategoricalColumn,
        pd.Categorical,
        pd.CategoricalIndex,
    )
    if isinstance(obj, categorical_types):
        return True

    if isinstance(obj, np.ndarray):
        return False

    # Containers are judged by their dtype.
    dtype_carriers = (
        cudf.Index,
        cudf.Series,
        cudf.core.column.ColumnBase,
        pd.Index,
        pd.Series,
    )
    if isinstance(obj, dtype_carriers):
        return is_categorical_dtype(obj.dtype)

    if hasattr(obj, "type") and obj.type is CategoricalDtypeType:
        return True

    return pandas_dtype(obj).type is CategoricalDtypeType
def dtype_to_spectrum(dtype):
    """Convert a pandas dtype to the equivalent Redshift Spectrum column type.

    Parameters
    ----------
    dtype : dtype
        The dtype to look up. It is used directly as a dictionary key.

    Returns
    -------
    str
        The Spectrum schema type, or ``'TEXT'`` when ``dtype`` has no
        entry in the mapping.
    """
    # ``np.object`` and ``np.bool`` were deprecated aliases of the builtins
    # and no longer exist in current NumPy; ``np.object_`` / ``np.bool_``
    # resolve to the same dtypes on every NumPy version.
    spectrum_types = {
        pandas_dtype(np.float64): 'FLOAT8',
        pandas_dtype(np.object_): 'VARCHAR(8192)',
        pandas_dtype(np.int64): 'INT8',
        pandas_dtype(np.bool_): 'BOOL',
        pandas_dtype(np.datetime64): 'TIMESTAMP',
        pandas_dtype('<M8[s]'): 'TIMESTAMP',
    }
    try:
        return spectrum_types[dtype]
    except KeyError:
        return 'TEXT'
def astype(self, dtype, copy=True):
    """Cast this list array to ``dtype``.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        When the target equals the current dtype, return a copy instead
        of ``self``. Forwarded to the parent ``astype`` otherwise.

    Raises
    ------
    TypeError
        When the arrow cast or the parent conversion fails.
    """
    # The message is built from ``dtype`` exactly as passed in, before it
    # is normalised below.
    msg = f'cannot astype from {self.dtype} to {dtype}'
    target = pandas_dtype(dtype)

    if not isinstance(target, ArrowListDtype):
        try:
            return super().astype(target, copy=copy)
        except ValueError:
            raise TypeError(msg)

    if self.dtype == target:
        return self.copy() if copy else self

    try:
        return ArrowListArray(self._arrow_array.cast(target.arrow_type))
    except (NotImplementedError, pa.ArrowInvalid):
        raise TypeError(msg)
def pandas_on_spark_type(
        tpe: Union[str, type, Dtype]) -> Tuple[Dtype, types.DataType]:
    """
    Resolve the input to a pandas-only or numpy dtype object together
    with the Spark DataType that corresponds to it.

    Parameters
    ----------
    tpe : object to be converted

    Returns
    -------
    tuple of np.dtype or a pandas dtype, and Spark DataType

    Raises
    ------
    TypeError if not a dtype

    Examples
    --------
    >>> pandas_on_spark_type(int)
    (dtype('int64'), LongType())
    >>> pandas_on_spark_type(str)
    (dtype('<U'), StringType())
    >>> pandas_on_spark_type(datetime.date)
    (dtype('O'), DateType())
    >>> pandas_on_spark_type(datetime.datetime)
    (dtype('<M8[ns]'), TimestampType())
    >>> pandas_on_spark_type(datetime.timedelta)
    (dtype('<m8[ns]'), DayTimeIntervalType(0, 3))
    >>> pandas_on_spark_type(List[bool])
    (dtype('O'), ArrayType(BooleanType(), True))
    """
    try:
        # Preferred route: let pandas resolve the dtype, then map it to Spark.
        resolved = pandas_dtype(tpe)
        return resolved, as_spark_type(resolved)
    except TypeError:
        # Either step above rejected the input: derive the Spark type from
        # the raw input and map it back to a pandas dtype.
        spark_type = as_spark_type(tpe)
        return spark_type_to_pandas_dtype(spark_type), spark_type
def astype(self, dtype, copy=True):
    """Cast this array to ``dtype``.

    Parameters
    ----------
    dtype : dtype-like
        Target dtype; normalised with ``pandas_dtype``.
    copy : bool, default True
        When the target is ``ArrowStringDtype``, return a copy instead of
        ``self``.

    Returns
    -------
    ``self`` or a copy for ``ArrowStringDtype``; otherwise a newly built
    array filled chunk by chunk with the converted values.
    """
    target = pandas_dtype(dtype)

    if isinstance(target, ArrowStringDtype):
        return self.copy() if copy else self

    # Convert a slice of at most one record first to learn the result
    # array type, its NA value and its underlying NumPy dtype.
    probe = self._arrow_array.slice(0, 1).to_pandas().astype(target).array
    filler = np.full(self.shape, probe.dtype.na_value,
                     dtype=np.asarray(probe).dtype)
    out = type(probe)(filler)

    # Convert chunk by chunk, writing each piece into its slot of ``out``.
    offset = 0
    for chunk in self._arrow_array.chunks:
        stop = offset + len(chunk)
        out[offset:stop] = chunk.to_pandas().astype(target).array
        offset = stop
    return out
def time_pandas_dtype_invalid(self, dtype):
    """Time ``pandas_dtype`` on the ``self.data_dict`` entry keyed by ``dtype``.

    A ``TypeError`` raised while looking up or resolving the entry is
    swallowed, so the rejection path itself is what gets measured.
    """
    try:
        candidate = self.data_dict[dtype]
        pandas_dtype(candidate)
    except TypeError:
        return None
def time_pandas_dtype(self, dtype):
    """Time a single ``pandas_dtype`` resolution of ``dtype``.

    The resolved dtype is discarded; only the call itself matters.
    """
    pandas_dtype(dtype)
    return None
def time_pandas_dtype(self, dtype):
    """Resolve ``dtype`` through ``pandas_dtype`` once and drop the result."""
    pandas_dtype(dtype)
    return None
def time_pandas_dtype_invalid(self, dtype):
    """Run ``pandas_dtype`` on ``self.data_dict[dtype]``, ignoring ``TypeError``.

    Both the lookup and the resolution sit inside the guarded region, so
    a ``TypeError`` from either is suppressed.
    """
    try:
        entry = self.data_dict[dtype]
        pandas_dtype(entry)
    except TypeError:
        return None
def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype(
    any_signed_int_ea_dtype, any_signed_int_numpy_dtype
):
    """A resolved signed-int extension dtype must not equal a NumPy one."""
    # GH#43038
    resolved = pandas_dtype(any_signed_int_ea_dtype)
    # Deliberately ``not ... ==`` so that ``__eq__`` is the method exercised.
    assert not resolved == any_signed_int_numpy_dtype
def test_type_comparison_with_real_numpy_dtype(any_real_numpy_dtype):
    """A resolved real NumPy dtype must compare equal to its own spec."""
    # GH#43038
    resolved = pandas_dtype(any_real_numpy_dtype)
    assert resolved == any_real_numpy_dtype
def test_type_comparison_with_numeric_ea_dtype(any_numeric_ea_dtype):
    """A resolved numeric extension dtype must compare equal to its own spec."""
    # GH#43038
    resolved = pandas_dtype(any_numeric_ea_dtype)
    assert resolved == any_numeric_ea_dtype
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Parameters
    ----------
    obj : mapping
        A decoded payload. Its ``"typ"`` entry selects how the remaining
        entries are turned back into an object.

    Returns
    -------
    The reconstructed object, or ``obj`` itself when it has no ``"typ"``
    entry or the entry names a kind this decoder does not handle.

    Raises
    ------
    ValueError
        For a ``period_index`` without ``freq`` on non-legacy pandas, and
        for blocks whose class is ``DatetimeTZBlock``.
    """
    typ = obj.get(u"typ")
    if typ is None:
        return obj
    elif typ == u"timestamp":
        # Some payloads store the frequency under "offset" instead of "freq".
        freq = obj[u"freq"] if "freq" in obj else obj[u"offset"]
        return Timestamp(obj[u"value"], tz=obj[u"tz"], freq=freq)
    elif typ == u"nat":
        return NaT
    elif typ == u"period":
        return Period(ordinal=obj[u"ordinal"], freq=obj[u"freq"])
    elif typ == u"index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        # The class to rebuild is looked up by name in this module's globals.
        return globals()[obj[u"klass"]](data, dtype=dtype, name=obj[u"name"])
    elif typ == u"range_index":
        return globals()[obj[u"klass"]](
            obj[u"start"], obj[u"stop"], obj[u"step"], name=obj[u"name"]
        )
    elif typ == u"multi_index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        data = [tuple(x) for x in data]
        return globals()[obj[u"klass"]].from_tuples(data, names=obj[u"names"])
    elif typ == u"period_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"], freq=obj[u"freq"])
        if _is_pandas_legacy_version:
            # legacy
            return globals()[obj[u"klass"]](data, **d)
        else:
            freq = d['freq']
            if freq is None:
                raise ValueError(
                    'freq is not specified and cannot be inferred')
            values = [Period(ordinal=x, freq=freq) for x in data]
            # NOTE(review): the stored name is not passed on here, unlike
            # the legacy branch - confirm whether that is intended.
            return PeriodIndex(values)
    elif typ == u"datetime_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"], freq=obj[u"freq"])
        result = globals()[obj[u"klass"]](data, **d)
        tz = obj[u"tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result
    elif typ == u"category":
        from_codes = globals()[obj[u"klass"]].from_codes
        return from_codes(
            codes=obj[u"codes"],
            categories=obj[u"categories"],
            ordered=obj[u"ordered"],
        )
    elif typ == u"series":
        dtype = dtype_for(obj[u"dtype"])
        pd_dtype = pandas_dtype(dtype)

        index = obj[u"index"]
        result = globals()[obj[u"klass"]](
            unconvert(obj[u"data"], dtype, obj[u"compress"]),
            index=index,
            dtype=pd_dtype,
            name=obj[u"name"],
        )
        return result
    elif typ == u"block_manager":
        axes = obj[u"axes"]

        def create_block(b):
            values = _safe_reshape(
                unconvert(b[u"values"], dtype_for(b[u"dtype"]), b[u"compress"]),
                b[u"shape"],
            )

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u"locs" in b:
                placement = b[u"locs"]
            else:
                placement = axes[0].get_indexer(b[u"items"])

            klass = getattr(internals, b[u"klass"])
            if klass == DatetimeTZBlock:
                raise ValueError(
                    "Lost the ability to parse datetime with timezone. Sorry")

            return make_block(
                values=values.copy(),
                klass=klass,
                placement=placement,
                dtype=b[u"dtype"],
            )

        blocks = [create_block(b) for b in obj[u"blocks"]]
        return globals()[obj[u"klass"]](BlockManager(blocks, axes))
    elif typ == u"datetime":
        return parse(obj[u"data"])
    elif typ == u"datetime64":
        return np.datetime64(parse(obj[u"data"]))
    elif typ == u"date":
        return parse(obj[u"data"]).date()
    elif typ == u"timedelta":
        return timedelta(*obj[u"data"])
    elif typ == u"timedelta64":
        return np.timedelta64(int(obj[u"data"]))
    # There are no decoders for 'sparse_series', 'sparse_dataframe' or
    # 'sparse_panel' payloads; they are returned unchanged at the end.
    elif typ == u"block_index":
        return globals()[obj[u"klass"]](
            obj[u"length"], obj[u"blocs"], obj[u"blengths"]
        )
    elif typ == u"int_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"indices"])
    elif typ == u"ndarray":
        # ``np.sctypeDict`` is the supported name of the scalar-type table;
        # the ``np.typeDict`` alias was removed from NumPy.
        return unconvert(
            obj[u"data"], np.sctypeDict[obj[u"dtype"]], obj.get(u"compress")
        ).reshape(obj[u"shape"])
    elif typ == u"np_scalar":
        if obj.get(u"sub_typ") == u"np_complex":
            return c2f(obj[u"real"], obj[u"imag"], obj[u"dtype"])
        else:
            dtype = dtype_for(obj[u"dtype"])
            try:
                return dtype(obj[u"data"])
            except Exception:
                # ``dtype`` could not be called with the data directly;
                # build the scalar through its ``.type`` instead. Catching
                # ``Exception`` keeps KeyboardInterrupt/SystemExit unmasked.
                return dtype.type(obj[u"data"])
    elif typ == u"np_complex":
        return complex(obj[u"real"] + u"+" + obj[u"imag"] + u"j")

    # Unrecognised "typ": hand the payload back untouched.
    return obj