Example #1
0
def test_union_dtypes(left, right, expected):
    left = pandas_dtype(left)
    right = pandas_dtype(right)
    a = pd.Index([], dtype=left)
    b = pd.Index([], dtype=right)
    result = (a | b).dtype
    assert result == expected
Example #2
0
def test_union_dtypes(left, right, expected, names):
    left = pandas_dtype(left)
    right = pandas_dtype(right)
    a = Index([], dtype=left, name=names[0])
    b = Index([], dtype=right, name=names[1])
    result = a.union(b)
    assert result.dtype == expected
    assert result.name == names[2]

    # Testing name retention
    # TODO: pin down desired dtype; do we want it to be commutative?
    result = a.intersection(b)
    assert result.name == names[2]
Example #3
0
    def astype(self, dtype, copy=True):
        """Cast this array to ``dtype``.

        * Target is an ``ArrowStringDtype``: return ``self``, or a copy of it
          when ``copy`` is true.
        * ``pa`` is ``None`` or ``self._force_use_pandas`` is set: convert via
          a pandas ``Series`` and wrap the result in ``type(self)``.
        * Otherwise: convert every arrow chunk separately and write the
          pieces into one preallocated result array.
        """
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, ArrowStringDtype):
            return self.copy() if copy else self

        if pa is None or self._force_use_pandas:
            # pyarrow not installed (or the pandas path is forced)
            target = dtype.type if isinstance(dtype, ArrowDtype) else dtype
            converted = pd.Series(self.to_numpy()).astype(target, copy=copy)
            return type(self)(converted)

        # convert a one-record slice first to learn the result array type
        probe = self._arrow_array.slice(0, 1).to_pandas().astype(dtype).array
        placeholder = np.full(self.shape, probe.dtype.na_value,
                              dtype=np.asarray(probe).dtype)
        out = type(probe)(placeholder)

        # cast chunk by chunk, filling consecutive windows of the result
        offset = 0
        for chunk in self._arrow_array.chunks:
            stop = offset + len(chunk)
            out[offset: stop] = chunk.to_pandas().astype(dtype).array
            offset = stop
        return out
Example #4
0
    def astype(self, dtype, copy=True):
        """Cast this list array to ``dtype``.

        Non-list targets are delegated to the parent ``astype``.  For an
        ``ArrowListDtype`` target the array is returned as-is (or copied)
        when the dtype already matches; otherwise it is cast through arrow
        when ``self._use_arrow`` is set, or element by element through
        pandas.  Failed conversions are reported as ``TypeError``.
        """
        # built before normalisation so the message shows the caller's input
        msg = f'cannot astype from {self.dtype} to {dtype}'
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, ArrowListDtype):
            try:
                return super().astype(dtype, copy=copy)
            except ValueError:
                raise TypeError(msg)

        if self.dtype == dtype:
            return self.copy() if copy else self

        if self._use_arrow:
            try:
                casted = self._arrow_array.cast(dtype.arrow_type)
                return ArrowListArray(casted)
            except (NotImplementedError, pa.ArrowInvalid):
                raise TypeError(msg)

        def cast_item(item):
            # convert one list element to the target value type
            return pd.Series(item).astype(dtype.value_type.type).tolist()

        try:
            converted = pd.Series(self._ndarray).map(cast_item).to_numpy()
            # NOTE(review): wraps the result in ArrowStringArray although the
            # target is a list dtype -- confirm this is intended.
            return ArrowStringArray(converted)
        except ValueError:
            raise TypeError(msg)
Example #5
0
def convert_type_to_pandas_dtype(
        type_: Union[type, Type],
        default_type: np.dtype = np.float64) -> np.dtype:
    """
    Map a python type or typing annotation to a dtype usable by pandas.

    Parameters
    ----------
    type_ : Union[type, Type]
        the type to convert
    default_type : np.dtype, optional
        value returned when ``type_`` is ``Unknown``, by default np.float64

    Returns
    -------
    np.dtype
        ``default_type`` for ``Unknown``, ``pd.StringDtype()`` for ``str``
        subclasses, whatever ``pandas_dtype`` resolves otherwise, and
        ``np.object_`` when ``pandas_dtype`` rejects the type.
    """
    if type_ is Unknown:
        return default_type

    if isinstance(type_, type) and issubclass(type_, str):
        return pd.StringDtype()

    try:
        return pandas_dtype(type_)
    except TypeError:
        # not something pandas understands: treat it as an object type
        return np.object_
Example #6
0
def is_categorical_dtype(obj):
    """Tell whether ``obj`` is, or carries, a categorical dtype.

    Accepts pandas, numpy and cuDF objects (columns, series, indexes) as well
    as dtype specifications such as the string ``"category"``.
    """
    from cudf.dataframe import Series, Index
    from cudf.dataframe.column import Column
    from cudf.dataframe.index import CategoricalIndex
    from cudf.dataframe.categorical import CategoricalColumn

    if obj is None:
        return False

    categorical_classes = (
        CategoricalDtype,
        CategoricalIndex,
        CategoricalColumn,
        pd.Categorical,
        pd.CategoricalIndex,
    )
    dtype_carriers = (Index, Series, Column, pd.Index, pd.Series, np.ndarray)

    if (
        obj is CategoricalDtypeType
        or (isinstance(obj, str) and obj == "category")
        or (hasattr(obj, "type") and obj.type is CategoricalDtypeType)
        or isinstance(obj, categorical_classes)
    ):
        return True

    if isinstance(obj, dtype_carriers):
        # containers are judged by the dtype they hold
        return is_categorical_dtype(obj.dtype)

    return pandas_dtype(obj).type is CategoricalDtypeType
Example #7
0
    def astype(self, dtype, copy=True):
        """Cast to ``dtype``.

        A ``DateDtype`` target returns this array (copied when ``copy`` is
        true); any other target goes through ``to_numpy`` with
        ``dt.date.min`` used as the missing-value filler.
        """
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, DateDtype):
            return self.to_numpy(dtype=dtype, copy=copy, na_value=dt.date.min)

        return self.copy() if copy else self
Example #8
0
    def __init__(self, dtype):
        """Store the element dtype of the list.

        ``dtype`` may be another instance of this class (its value type is
        reused), a pyarrow ``DataType`` (converted with
        ``to_pandas_dtype``) or anything ``pandas_dtype`` accepts.  String
        dtypes are replaced by ``ArrowStringDtype``.
        """
        if isinstance(dtype, type(self)):
            dtype = dtype.value_type
        if pa and isinstance(dtype, pa.DataType):
            dtype = dtype.to_pandas_dtype()

        value_type = pandas_dtype(dtype)
        needs_arrow_string = (
            is_string_dtype(value_type)
            and not isinstance(value_type, ArrowStringDtype)
        )
        if needs_arrow_string:
            # convert string dtype to arrow string dtype
            value_type = ArrowStringDtype()

        self._value_type = value_type
Example #9
0
    def test_astype(self):
        """Exercise ``astype`` on float indexes.

        Covers casting to object, to integer dtypes, to float dtypes, the
        ``TypeError`` raised for datetime-like targets and the ``ValueError``
        raised when NA values would have to become integers.
        """

        # casting to object keeps the index equal to the original
        result = self.float.astype(object)
        assert result.equals(self.float)
        assert self.float.equals(result)
        self.check_is_index(result)

        i = self.mixed.copy()
        i.name = "foo"
        result = i.astype(object)
        assert result.equals(i)
        assert i.equals(result)
        self.check_is_index(result)

        # GH 12881
        # a float astype int
        for dtype in ["int16", "int32", "int64"]:
            i = Float64Index([0, 1, 2])
            result = i.astype(dtype)
            expected = Int64Index([0, 1, 2])
            tm.assert_index_equal(result, expected)

            i = Float64Index([0, 1.1, 2])
            result = i.astype(dtype)
            expected = Int64Index([0, 1, 2])
            tm.assert_index_equal(result, expected)

        for dtype in ["float32", "float64"]:
            i = Float64Index([0, 1, 2])
            result = i.astype(dtype)
            expected = i
            tm.assert_index_equal(result, expected)

            i = Float64Index([0, 1.1, 2])
            result = i.astype(dtype)
            expected = Index(i.values.astype(dtype))
            tm.assert_index_equal(result, expected)

        # invalid
        # (``i`` is the last index built by the loop above)
        for dtype in ["M8[ns]", "m8[ns]"]:
            msg = (
                "Cannot convert Float64Index to dtype {}; integer values"
                " are required for conversion"
            ).format(pandas_dtype(dtype))
            with pytest.raises(TypeError, match=re.escape(msg)):
                i.astype(dtype)

        # GH 13149
        for dtype in ["int16", "int32", "int64"]:
            # ``np.nan`` rather than the ``np.NAN`` alias, which NumPy 2.0
            # removed
            i = Float64Index([0, 1.1, np.nan])
            msg = "Cannot convert NA to integer"
            with pytest.raises(ValueError, match=msg):
                i.astype(dtype)
Example #10
0
    def astype(self, dtype, copy=True):
        """Cast this array to ``dtype``.

        Parameters
        ----------
        dtype : dtype-like
            Target dtype; normalised with ``pandas_dtype``.
        copy : bool, default True
            Only consulted when the target is a ``RaggedDtype``: the array
            itself is returned when false, a copy otherwise.

        Returns
        -------
        This array (or a copy) for a ``RaggedDtype`` target, an extension
        array built with ``_from_sequence`` for other extension dtypes, and a
        new ``numpy.ndarray`` for everything else.
        """
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, RaggedDtype):
            return self.copy() if copy else self

        if is_extension_array_dtype(dtype):
            return dtype.construct_array_type()._from_sequence(
                np.asarray(self))

        # The elements are first collected into a list, so NumPy always has
        # to allocate a new array and ``copy`` has nothing to control here.
        # Forwarding ``copy=False`` would make NumPy >= 2.0 raise ValueError
        # because the copy cannot be avoided.
        return np.array(list(self), dtype=dtype)
Example #11
0
def unconvert(values, dtype, compress=None):
    """Rebuild array data of ``dtype`` from its serialized ``values``.

    Categorical data is returned untouched and object data is wrapped in an
    object ndarray.  Everything else is read with ``np.frombuffer``, after
    decompression when ``compress`` is ``'zlib'`` or ``'blosc'``.

    Raises
    ------
    ValueError
        If ``compress`` is truthy but neither ``'zlib'`` nor ``'blosc'``.
    """
    raw_ext = isinstance(values, ExtType) and values.code == 0
    if raw_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values
    if is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not raw_ext:
        values = values[1]

    if compress:
        if compress == u"zlib":
            _check_zlib()
            inflate = zlib.decompress
        elif compress == u"blosc":
            _check_blosc()
            inflate = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            buffer = _move_into_mutable_buffer(inflate(values))
            return np.frombuffer(buffer, dtype=dtype)
        except _BadMove as err:
            # The decompressed bytes travel on the exception instead of
            # living in a local, to limit the chance of handing out a
            # ``bytes`` object whose memory also backs a mutable buffer.
            values = err.args[0]
            if len(values) > 1:
                # Empty and one-character strings are memoized by many capi
                # string constructors; copying at most one byte is not worth
                # a warning.
                warnings.warn(
                    "copying data after decompressing; this may mean that"
                    " decompress is caching its result",
                    PerformanceWarning,
                )
            # fall through to the copying read below

    # Read the (possibly decompressed) bytes as a numpy array.
    return np.frombuffer(values, dtype=dtype)
Example #12
0
    def test_astype(self):
        """Exercise ``astype`` on float indexes.

        Covers casting to object, to integer dtypes, to float dtypes, the
        ``TypeError`` raised for datetime-like targets and the ``ValueError``
        raised when NA values would have to become integers.
        """

        # casting to object keeps the index equal to the original
        result = self.float.astype(object)
        assert result.equals(self.float)
        assert self.float.equals(result)
        self.check_is_index(result)

        i = self.mixed.copy()
        i.name = 'foo'
        result = i.astype(object)
        assert result.equals(i)
        assert i.equals(result)
        self.check_is_index(result)

        # GH 12881
        # a float astype int
        for dtype in ['int16', 'int32', 'int64']:
            i = Float64Index([0, 1, 2])
            result = i.astype(dtype)
            expected = Int64Index([0, 1, 2])
            tm.assert_index_equal(result, expected)

            i = Float64Index([0, 1.1, 2])
            result = i.astype(dtype)
            expected = Int64Index([0, 1, 2])
            tm.assert_index_equal(result, expected)

        for dtype in ['float32', 'float64']:
            i = Float64Index([0, 1, 2])
            result = i.astype(dtype)
            expected = i
            tm.assert_index_equal(result, expected)

            i = Float64Index([0, 1.1, 2])
            result = i.astype(dtype)
            expected = Index(i.values.astype(dtype))
            tm.assert_index_equal(result, expected)

        # invalid
        # (``i`` is the last index built by the loop above)
        for dtype in ['M8[ns]', 'm8[ns]']:
            msg = ("Cannot convert Float64Index to dtype {}; integer values"
                   " are required for conversion").format(pandas_dtype(dtype))
            with pytest.raises(TypeError, match=re.escape(msg)):
                i.astype(dtype)

        # GH 13149
        for dtype in ['int16', 'int32', 'int64']:
            # ``np.nan`` rather than the ``np.NAN`` alias, which NumPy 2.0
            # removed
            i = Float64Index([0, 1.1, np.nan])
            msg = "Cannot convert NA to integer"
            with pytest.raises(ValueError, match=msg):
                i.astype(dtype)
Example #13
0
File: dtypes.py Project: vuule/cudf
def is_categorical_dtype(obj):
    """Tell whether ``obj`` is, or carries, a categorical dtype.

    Accepts pandas, numpy and cuDF objects (columns, series, indexes) as well
    as dtype specifications such as the string ``"category"``.
    """
    if obj is None:
        return False
    if isinstance(obj, cudf.CategoricalDtype) or obj is cudf.CategoricalDtype:
        return True
    if isinstance(obj, np.dtype):
        return False

    if (
        isinstance(obj, CategoricalDtype)
        or obj is CategoricalDtype
        or obj is CategoricalDtypeType
        or (isinstance(obj, str) and obj == "category")
    ):
        return True

    categorical_classes = (
        CategoricalDtype,
        cudf.core.index.CategoricalIndex,
        cudf.core.column.CategoricalColumn,
        pd.Categorical,
        pd.CategoricalIndex,
    )
    if isinstance(obj, categorical_classes):
        return True
    if isinstance(obj, np.ndarray):
        return False

    dtype_carriers = (
        cudf.Index,
        cudf.Series,
        cudf.core.column.ColumnBase,
        pd.Index,
        pd.Series,
    )
    if isinstance(obj, dtype_carriers):
        # containers are judged by the dtype they hold
        return is_categorical_dtype(obj.dtype)

    if hasattr(obj, "type") and obj.type is CategoricalDtypeType:
        return True
    return pandas_dtype(obj).type is CategoricalDtypeType
Example #14
0
def dtype_to_spectrum(dtype):
    """Convert a pandas dtype to the equivalent Redshift Spectrum column type.

    Parameters
    ----------
    dtype : dtype
        The dtype to look up; it is used directly as a dictionary key.

    Returns
    -------
    str
        The Spectrum schema column value, or ``'TEXT'`` when the dtype has
        no explicit mapping.
    """
    # The builtins ``object`` and ``bool`` stand in for the ``np.object`` and
    # ``np.bool`` aliases, which NumPy 1.24 removed; with the aliases the
    # lookup table could not even be built on current NumPy.
    spectrum_types = {
        pandas_dtype(np.float64): 'FLOAT8',
        pandas_dtype(object): 'VARCHAR(8192)',
        pandas_dtype(np.int64): 'INT8',
        pandas_dtype(bool): 'BOOL',
        pandas_dtype(np.datetime64): 'TIMESTAMP',
        pandas_dtype('<M8[s]'): 'TIMESTAMP',
    }
    try:
        return spectrum_types[dtype]
    except KeyError:
        return 'TEXT'
Example #15
0
    def astype(self, dtype, copy=True):
        """Cast this list array to ``dtype``.

        Non-list targets are delegated to the parent ``astype``.  For an
        ``ArrowListDtype`` target the array is returned as-is (or copied)
        when the dtype already matches, and cast through arrow otherwise.
        Failed conversions are reported as ``TypeError``.
        """
        # built before normalisation so the message shows the caller's input
        msg = f'cannot astype from {self.dtype} to {dtype}'
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, ArrowListDtype):
            try:
                return super().astype(dtype, copy=copy)
            except ValueError:
                raise TypeError(msg)

        if self.dtype == dtype:
            return self.copy() if copy else self

        try:
            casted = self._arrow_array.cast(dtype.arrow_type)
            return ArrowListArray(casted)
        except (NotImplementedError, pa.ArrowInvalid):
            raise TypeError(msg)
Example #16
0
def pandas_on_spark_type(
        tpe: Union[str, type, Dtype]) -> Tuple[Dtype, types.DataType]:
    """
    Resolve ``tpe`` to a pandas/numpy dtype together with the matching
    Spark DataType.

    The input is first interpreted as a pandas dtype; if that raises
    ``TypeError`` the Spark type is derived from ``tpe`` directly and the
    dtype is derived from the Spark type.

    Parameters
    ----------
    tpe : object to be converted

    Returns
    -------
    tuple of np.dtype or a pandas dtype, and Spark DataType

    Raises
    ------
    TypeError if not a dtype

    Examples
    --------
    >>> pandas_on_spark_type(int)
    (dtype('int64'), LongType())
    >>> pandas_on_spark_type(str)
    (dtype('<U'), StringType())
    >>> pandas_on_spark_type(datetime.date)
    (dtype('O'), DateType())
    >>> pandas_on_spark_type(datetime.datetime)
    (dtype('<M8[ns]'), TimestampType())
    >>> pandas_on_spark_type(datetime.timedelta)
    (dtype('<m8[ns]'), DayTimeIntervalType(0, 3))
    >>> pandas_on_spark_type(List[bool])
    (dtype('O'), ArrayType(BooleanType(), True))
    """
    try:
        resolved = pandas_dtype(tpe)
        return resolved, as_spark_type(resolved)
    except TypeError:
        spark_type = as_spark_type(tpe)
        return spark_type_to_pandas_dtype(spark_type), spark_type
Example #17
0
    def astype(self, dtype, copy=True):
        """Cast this array to ``dtype``.

        An ``ArrowStringDtype`` target returns ``self`` (or a copy when
        ``copy`` is true).  Any other target is produced by converting every
        arrow chunk separately and writing the pieces into one preallocated
        result array.
        """
        dtype = pandas_dtype(dtype)
        if isinstance(dtype, ArrowStringDtype):
            return self.copy() if copy else self

        # convert a one-record slice first to learn the result array type
        probe = self._arrow_array.slice(0, 1).to_pandas().astype(dtype).array
        placeholder = np.full(self.shape, probe.dtype.na_value,
                              dtype=np.asarray(probe).dtype)
        out = type(probe)(placeholder)

        # cast chunk by chunk, filling consecutive windows of the result
        offset = 0
        for chunk in self._arrow_array.chunks:
            stop = offset + len(chunk)
            out[offset: stop] = chunk.to_pandas().astype(dtype).array
            offset = stop
        return out
Example #18
0
 def time_pandas_dtype_invalid(self, dtype):
     """Run ``pandas_dtype`` on ``self.data_dict[dtype]``, ignoring TypeError."""
     try:
         candidate = self.data_dict[dtype]
         pandas_dtype(candidate)
     except TypeError:
         # a rejected input is an acceptable outcome here
         pass
Example #19
0
 def time_pandas_dtype(self, dtype):
     """Call ``pandas_dtype`` on ``dtype``; the result is discarded."""
     # presumably a timing benchmark (``time_`` prefix) -- only the call matters
     pandas_dtype(dtype)
Example #20
0
 def time_pandas_dtype(self, dtype):
     """Call ``pandas_dtype`` on ``dtype``; the result is discarded."""
     # presumably a timing benchmark (``time_`` prefix) -- only the call matters
     pandas_dtype(dtype)
Example #21
0
 def time_pandas_dtype_invalid(self, dtype):
     """Run ``pandas_dtype`` on ``self.data_dict[dtype]``, ignoring TypeError."""
     try:
         candidate = self.data_dict[dtype]
         pandas_dtype(candidate)
     except TypeError:
         # a rejected input is an acceptable outcome here
         pass
Example #22
0
def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype(
        any_signed_int_ea_dtype, any_signed_int_numpy_dtype):
    # GH#43038
    assert not pandas_dtype(
        any_signed_int_ea_dtype) == any_signed_int_numpy_dtype
Example #23
0
def test_type_comparison_with_real_numpy_dtype(any_real_numpy_dtype):
    # GH#43038
    assert pandas_dtype(any_real_numpy_dtype) == any_real_numpy_dtype
Example #24
0
def test_type_comparison_with_numeric_ea_dtype(any_numeric_ea_dtype):
    # GH#43038
    assert pandas_dtype(any_numeric_ea_dtype) == any_numeric_ea_dtype
Example #25
0
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Dispatches on ``obj["typ"]`` and rebuilds the matching object
    (timestamps, periods, indexes, series, block managers, datetimes,
    ndarrays, numpy scalars, ...).  Classes named by ``obj["klass"]`` are
    looked up in this module's globals.

    Parameters
    ----------
    obj : mapping
        A decoded msgpack map.

    Returns
    -------
    object
        The reconstructed object.  ``obj`` itself is returned unchanged when
        it has no ``typ`` entry or when the ``typ`` is not handled here.

    Raises
    ------
    ValueError
        For a ``period_index`` without ``freq`` on non-legacy pandas, and for
        blocks whose class is ``DatetimeTZBlock``.
    """

    typ = obj.get(u"typ")
    if typ is None:
        return obj
    elif typ == u"timestamp":
        # the frequency is stored under "freq", or under "offset" otherwise
        freq = obj[u"freq"] if "freq" in obj else obj[u"offset"]
        return Timestamp(obj[u"value"], tz=obj[u"tz"], freq=freq)
    elif typ == u"nat":
        return NaT
    elif typ == u"period":
        return Period(ordinal=obj[u"ordinal"], freq=obj[u"freq"])
    elif typ == u"index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        return globals()[obj[u"klass"]](data, dtype=dtype, name=obj[u"name"])
    elif typ == u"range_index":
        return globals()[obj[u"klass"]](obj[u"start"],
                                        obj[u"stop"],
                                        obj[u"step"],
                                        name=obj[u"name"])
    elif typ == u"multi_index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        data = [tuple(x) for x in data]
        return globals()[obj[u"klass"]].from_tuples(data, names=obj[u"names"])
    elif typ == u"period_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"], freq=obj[u"freq"])
        if _is_pandas_legacy_version:
            # legacy
            return globals()[obj[u"klass"]](data, **d)
        else:

            freq = d['freq']
            if freq is None:
                raise ValueError(
                    'freq is not specified and cannot be inferred')
            # rebuild the index from one Period per stored ordinal
            values = [Period(ordinal=x, freq=freq) for x in data]
            return PeriodIndex(values)
            #return globals()[obj[u"klass"]]._from_ordinals(data, **d)
    elif typ == u"datetime_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"],
                 freq=obj[u"freq"])  #, verify_integrity=False)
        result = globals()[obj[u"klass"]](data, **d)
        tz = obj[u"tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result

    elif typ == u"category":
        from_codes = globals()[obj[u"klass"]].from_codes
        return from_codes(codes=obj[u"codes"],
                          categories=obj[u"categories"],
                          ordered=obj[u"ordered"])

    elif typ == u"series":
        dtype = dtype_for(obj[u"dtype"])
        pd_dtype = pandas_dtype(dtype)

        index = obj[u"index"]
        result = globals()[obj[u"klass"]](
            unconvert(obj[u"data"], dtype, obj[u"compress"]),
            index=index,
            dtype=pd_dtype,
            name=obj[u"name"],
        )
        return result

    elif typ == u"block_manager":
        axes = obj[u"axes"]

        def create_block(b):
            # rebuild one block from its serialized description
            values = _safe_reshape(
                unconvert(b[u"values"], dtype_for(b[u"dtype"]),
                          b[u"compress"]),
                b[u"shape"],
            )

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u"locs" in b:
                placement = b[u"locs"]
            else:
                placement = axes[0].get_indexer(b[u"items"])
            klass = getattr(internals, b[u"klass"])
            if klass == DatetimeTZBlock:
                raise ValueError(
                    "Lost the ability to parse datetime with timezone. Sorry")

            return make_block(
                values=values.copy(),
                klass=klass,
                placement=placement,
                dtype=b[u"dtype"],
            )

        blocks = [create_block(b) for b in obj[u"blocks"]]
        return globals()[obj[u"klass"]](BlockManager(blocks, axes))
    elif typ == u"datetime":
        return parse(obj[u"data"])
    elif typ == u"datetime64":
        return np.datetime64(parse(obj[u"data"]))
    elif typ == u"date":
        return parse(obj[u"data"]).date()
    elif typ == u"timedelta":
        return timedelta(*obj[u"data"])
    elif typ == u"timedelta64":
        return np.timedelta64(int(obj[u"data"]))
    # elif typ == 'sparse_series':
    #    dtype = dtype_for(obj['dtype'])
    #    return globals()[obj['klass']](
    #        unconvert(obj['sp_values'], dtype, obj['compress']),
    #        sparse_index=obj['sp_index'], index=obj['index'],
    #        fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name'])
    # elif typ == 'sparse_dataframe':
    #    return globals()[obj['klass']](
    #        obj['data'], columns=obj['columns'],
    #        default_fill_value=obj['default_fill_value'],
    #        default_kind=obj['default_kind']
    #    )
    # elif typ == 'sparse_panel':
    #    return globals()[obj['klass']](
    #        obj['data'], items=obj['items'],
    #        default_fill_value=obj['default_fill_value'],
    #        default_kind=obj['default_kind'])
    elif typ == u"block_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"blocs"],
                                        obj[u"blengths"])
    elif typ == u"int_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"indices"])
    elif typ == u"ndarray":
        # ``np.sctypeDict`` replaces the ``np.typeDict`` alias, which was
        # removed in NumPy 1.24.
        return unconvert(obj[u"data"], np.sctypeDict[obj[u"dtype"]],
                         obj.get(u"compress")).reshape(obj[u"shape"])
    elif typ == u"np_scalar":
        if obj.get(u"sub_typ") == u"np_complex":
            return c2f(obj[u"real"], obj[u"imag"], obj[u"dtype"])
        else:
            dtype = dtype_for(obj[u"dtype"])
            try:
                return dtype(obj[u"data"])
            except Exception:
                # Calling ``dtype`` directly failed; build the scalar through
                # its ``.type`` instead.  ``Exception`` (not a bare except)
                # so KeyboardInterrupt/SystemExit are no longer swallowed.
                return dtype.type(obj[u"data"])
    elif typ == u"np_complex":
        return complex(obj[u"real"] + u"+" + obj[u"imag"] + u"j")
    else:
        # unrecognised ``typ``: hand the mapping back untouched
        return obj