Beispiel #1
0
class DataType(Annotable, Comparable):
    """Base class for all data types.

    [`DataType`][ibis.expr.datatypes.DataType] instances are
    immutable.
    """

    nullable = optional(instance_of(bool), default=True)

    def __call__(self, nullable: bool = True) -> DataType:
        if nullable is not True and nullable is not False:
            raise TypeError(
                "__call__ only accepts the 'nullable' argument. "
                "Please construct a new instance of the type to change the "
                "values of the attributes."
            )
        kwargs = dict(zip(self.argnames, self.args))
        kwargs["nullable"] = nullable
        return self.__class__(**kwargs)

    @property
    def _pretty_piece(self) -> str:
        return ""

    @property
    def name(self) -> str:
        """Return the name of the data type."""
        return self.__class__.__name__

    def __str__(self) -> str:
        prefix = "!" * (not self.nullable)
        return f"{prefix}{self.name.lower()}{self._pretty_piece}"

    def __equals__(
        self,
        other: typing.Any,
    ) -> bool:
        return self.args == other.args

    def equals(self, other):
        if not isinstance(other, DataType):
            raise TypeError(
                "invalid equality comparison between DataType and "
                f"{type(other)}"
            )
        return super().__cached_equals__(other)

    def castable(self, target, **kwargs):
        """Return whether this data type is castable to `target`."""
        return castable(self, target, **kwargs)

    def cast(self, target, **kwargs):
        """Cast this data type to `target`."""
        return cast(self, target, **kwargs)
Beispiel #2
0
class Timestamp(DataType):
    """Timestamp values."""

    timezone = optional(instance_of(str))
    """The timezone of values of this type."""

    scalar = ir.TimestampScalar
    column = ir.TimestampColumn

    @property
    def _pretty_piece(self) -> str:
        if (timezone := self.timezone) is not None:
            return f"({timezone!r})"
        return ""
Beispiel #3
0
class Category(DataType):
    cardinality = optional(instance_of(int))

    scalar = ir.CategoryScalar
    column = ir.CategoryColumn

    def __repr__(self):
        if self.cardinality is not None:
            cardinality = repr(self.cardinality)
        else:
            cardinality = "unknown"
        return f"{self.name}(cardinality={cardinality})"

    def to_integer_type(self):
        if self.cardinality is None:
            return int64
        else:
            return infer(self.cardinality)
Beispiel #4
0
class GeoSpatial(DataType):
    """Geospatial values."""

    geotype = optional(isin({"geography", "geometry"}))
    """The specific geospatial type"""

    srid = optional(instance_of(int))
    """The spatial reference identifier."""

    column = ir.GeoSpatialColumn
    scalar = ir.GeoSpatialScalar

    @property
    def _pretty_piece(self) -> str:
        piece = ""
        if self.geotype is not None:
            piece += f":{self.geotype}"
        if self.srid is not None:
            piece += f";{self.srid}"
        return piece
Beispiel #5
0
class Schema(Annotable, Comparable):
    """An object for holding table schema information, i.e., column names and
    types.

    Parameters
    ----------
    names : Sequence[str]
        A sequence of ``str`` indicating the name of each column.
    types : Sequence[DataType]
        A sequence of :class:`ibis.expr.datatypes.DataType` objects
        representing type of each column.
    """

    __slots__ = ('_name_locs', )

    names: Sequence[str] = tuple_of(instance_of((str, UnnamedMarker)))
    types: Sequence[dt.DataType] = tuple_of(datatype)

    @immutable_property
    def _name_locs(self):
        # validate unique field names
        name_locs = {v: i for i, v in enumerate(self.names)}
        if len(name_locs) < len(self.names):
            duplicate_names = list(self.names)
            for v in name_locs.keys():
                duplicate_names.remove(v)
            raise IntegrityError(
                f'Duplicate column name(s): {duplicate_names}')
        return name_locs

    def __repr__(self):
        space = 2 + max(map(len, self.names), default=0)
        return "ibis.Schema {{{}\n}}".format(
            indent(
                ''.join(f'\n{name.ljust(space)}{str(type)}'
                        for name, type in zip(self.names, self.types)),
                2,
            ))

    def __len__(self):
        return len(self.names)

    def __iter__(self):
        return iter(self.names)

    def __contains__(self, name):
        return name in self._name_locs

    def __getitem__(self, name):
        return self.types[self._name_locs[name]]

    def __equals__(self, other):
        return (self._hash == other._hash and self.names == other.names
                and self.types == other.types)

    def equals(self, other):
        if not isinstance(other, Schema):
            raise TypeError("invalid equality comparison between Schema and "
                            f"{type(other)}")
        return self.__cached_equals__(other)

    def delete(self, names_to_delete):
        for name in names_to_delete:
            if name not in self:
                raise KeyError(name)

        new_names, new_types = [], []
        for name, type_ in zip(self.names, self.types):
            if name in names_to_delete:
                continue
            new_names.append(name)
            new_types.append(type_)

        return Schema(new_names, new_types)

    @classmethod
    def from_tuples(cls, values):
        if not isinstance(values, (list, tuple)):
            values = list(values)

        names, types = zip(*values) if values else ([], [])
        return Schema(names, types)

    @classmethod
    def from_dict(cls, dictionary):
        names, types = zip(*dictionary.items()) if dictionary else ([], [])
        return Schema(names, types)

    def __gt__(self, other):
        return set(self.items()) > set(other.items())

    def __ge__(self, other):
        return set(self.items()) >= set(other.items())

    def append(self, schema):
        return Schema(self.names + schema.names, self.types + schema.types)

    def items(self):
        return zip(self.names, self.types)

    def name_at_position(self, i):
        """ """
        upper = len(self.names) - 1
        if not 0 <= i <= upper:
            raise ValueError(
                'Column index must be between 0 and {:d}, inclusive'.format(
                    upper))
        return self.names[i]

    def apply_to(self, df):
        """Applies the Ibis schema to a pandas DataFrame

        Parameters
        ----------
        df : pandas.DataFrame

        Returns
        -------
        df : pandas.DataFrame

        Notes
        -----
        Mutates `df`
        """
        schema_names = self.names
        data_columns = df.columns

        assert len(schema_names) == len(
            data_columns
        ), "schema column count does not match input data column count"

        for column, dtype in zip(data_columns, self.types):
            pandas_dtype = dtype.to_pandas()

            col = df[column]
            col_dtype = col.dtype

            try:
                not_equal = pandas_dtype != col_dtype
            except TypeError:
                # ugh, we can't compare dtypes coming from pandas,
                # assume not equal
                not_equal = True

            if not_equal or isinstance(dtype, (dt.String, dt.Struct)):
                new_col = convert(col_dtype, dtype, col)
            else:
                new_col = col
            df[column] = new_col

        # return data with the schema's columns which may be different than the
        # input columns
        df.columns = schema_names
        return df
Beispiel #6
0
def client(arg, **kwargs):
    from ibis.backends.base import BaseBackend

    return instance_of(BaseBackend, arg)
Beispiel #7
0
def column(inner, arg, **kwargs):
    return instance_of(ir.Column, inner(arg, **kwargs))
Beispiel #8
0
def scalar(inner, arg, **kwargs):
    return instance_of(ir.Scalar, inner(arg, **kwargs))
Beispiel #9
0
class Struct(DataType):
    """Structured values."""

    names = tuple_of(instance_of(str))
    types = tuple_of(datatype)

    scalar = ir.StructScalar
    column = ir.StructColumn

    @classmethod
    def from_tuples(
        cls,
        pairs: Iterable[tuple[str, str | DataType]],
        nullable: bool = True,
    ) -> Struct:
        """Construct a `Struct` type from pairs.

        Parameters
        ----------
        pairs
            An iterable of pairs of field name and type

        Returns
        -------
        Struct
            Struct data type instance
        """
        names, types = zip(*pairs)
        return cls(names, types, nullable=nullable)

    @classmethod
    def from_dict(
        cls,
        pairs: Mapping[str, str | DataType],
        nullable: bool = True,
    ) -> Struct:
        """Construct a `Struct` type from a [`dict`][dict].

        Parameters
        ----------
        pairs
            A [`dict`][dict] of `field: type`

        Returns
        -------
        Struct
            Struct data type instance
        """
        names, types = pairs.keys(), pairs.values()
        return cls(names, types, nullable=nullable)

    @property
    def pairs(self) -> Mapping[str, DataType]:
        """Return a mapping from names to data type instances.

        Returns
        -------
        Mapping[str, DataType]
            Mapping of field name to data type
        """
        return dict(zip(self.names, self.types))

    def __getitem__(self, key: str) -> DataType:
        return self.pairs[key]

    def __repr__(self) -> str:
        return '{}({}, nullable={})'.format(
            self.name, list(self.pairs.items()), self.nullable
        )

    @property
    def _pretty_piece(self) -> str:
        pairs = ", ".join(map("{}: {}".format, self.names, self.types))
        return f"<{pairs}>"
Beispiel #10
0
class Interval(DataType):
    """Interval values."""

    unit = optional(
        map_to(
            {
                'days': 'D',
                'hours': 'h',
                'minutes': 'm',
                'seconds': 's',
                'milliseconds': 'ms',
                'microseconds': 'us',
                'nanoseconds': 'ns',
                'Y': 'Y',
                'Q': 'Q',
                'M': 'M',
                'W': 'W',
                'D': 'D',
                'h': 'h',
                'm': 'm',
                's': 's',
                'ms': 'ms',
                'us': 'us',
                'ns': 'ns',
            }
        ),
        default="s",
    )
    """The time unit of the interval."""

    value_type = optional(
        compose_of([datatype, instance_of(Integer)]), default=Int32()
    )
    """The underlying type of the stored values."""

    scalar = ir.IntervalScalar
    column = ir.IntervalColumn

    # based on numpy's units
    _units = {
        'Y': 'year',
        'Q': 'quarter',
        'M': 'month',
        'W': 'week',
        'D': 'day',
        'h': 'hour',
        'm': 'minute',
        's': 'second',
        'ms': 'millisecond',
        'us': 'microsecond',
        'ns': 'nanosecond',
    }

    # TODO(kszucs): assert that the nullability if the value_type is equal
    # to the interval's nullability

    @property
    def bounds(self):
        return self.value_type.bounds

    @property
    def resolution(self):
        """The interval unit's name."""
        return self._units[self.unit]

    @property
    def _pretty_piece(self) -> str:
        return f"<{self.value_type}>(unit={self.unit!r})"
Beispiel #11
0
class Decimal(DataType):
    """Fixed-precision decimal values."""

    precision = optional(instance_of(int))
    """The number of decimal places values of this type can hold."""

    scale = optional(instance_of(int))
    """The number of values after the decimal point."""

    scalar = ir.DecimalScalar
    column = ir.DecimalColumn

    def __init__(
        self,
        precision: int | None = None,
        scale: int | None = None,
        **kwargs: Any,
    ) -> None:
        if precision is not None:
            if not isinstance(precision, numbers.Integral):
                raise TypeError(
                    "Decimal type precision must be an integer; "
                    f"got {type(precision)}"
                )
            if precision < 0:
                raise ValueError('Decimal type precision cannot be negative')
            if not precision:
                raise ValueError('Decimal type precision cannot be zero')
        if scale is not None:
            if not isinstance(scale, numbers.Integral):
                raise TypeError('Decimal type scale must be an integer')
            if scale < 0:
                raise ValueError('Decimal type scale cannot be negative')
            if precision is not None and precision < scale:
                raise ValueError(
                    'Decimal type precision must be greater than or equal to '
                    'scale. Got precision={:d} and scale={:d}'.format(
                        precision, scale
                    )
                )
        super().__init__(precision=precision, scale=scale, **kwargs)

    @property
    def largest(self):
        """Return the largest type of decimal."""
        return self.__class__(
            precision=max(self.precision, 38)
            if self.precision is not None
            else None,
            scale=max(self.scale, 2) if self.scale is not None else None,
        )

    @property
    def _pretty_piece(self) -> str:
        args = []

        if (precision := self.precision) is not None:
            args.append(f"prec={precision:d}")

        if (scale := self.scale) is not None:
            args.append(f"scale={scale:d}")