Example #1
0
def infer_pandas_schema(df, schema=None):
    """Infer an ibis schema from the columns of a pandas DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``dtypes`` are inspected. Every column name must be a
        ``str``.
    schema : mapping, optional
        Explicit per-column types keyed by column name. A column present
        here skips inference and uses ``dt.dtype(schema[column_name])``.

    Returns
    -------
    The result of ``sch.schema`` applied to the ``(name, ibis dtype)`` pairs,
    in column order.

    Raises
    ------
    TypeError
        If a column name is not a string, or if an object-dtype column is
        inferred as ``'mixed'`` or ``'decimal'``.
    """
    schema = schema if schema is not None else {}

    pairs = []
    # ``Series.iteritems`` was removed in pandas 2.0; ``Series.items`` yields
    # the same (column name, dtype) pairs on both old and new versions.
    for column_name, pandas_dtype in df.dtypes.items():
        if not isinstance(column_name, str):
            raise TypeError(
                'Column names must be strings to use the pandas backend'
            )

        if column_name in schema:
            # an explicit user-supplied type always wins over inference
            ibis_dtype = dt.dtype(schema[column_name])
        elif pandas_dtype == np.object_:
            # object columns carry no element type; inspect the values
            inferred_dtype = infer_pandas_dtype(df[column_name], skipna=True)
            if inferred_dtype in {'mixed', 'decimal'}:
                # TODO: in principle we can handle decimal (added in pandas
                # 0.23)
                raise TypeError(
                    'Unable to infer type of column {0!r}. Try instantiating '
                    'your table from the client with client.table('
                    "'my_table', schema={{{0!r}: <explicit type>}})".format(
                        column_name
                    )
                )
            ibis_dtype = _inferable_pandas_dtypes[inferred_dtype]
        else:
            ibis_dtype = dt.dtype(pandas_dtype)

        pairs.append((column_name, ibis_dtype))

    return sch.schema(pairs)
Example #2
0
def schema_from_table(table, schema=None):
    """Build an ibis schema from the columns of a SQLAlchemy ``Table``.

    Parameters
    ----------
    table : sa.Table
    schema : mapping, optional
        Per-column overrides keyed by column name. A column found here takes
        its type from ``dt.dtype(schema[name])`` instead of from the column's
        SQLAlchemy type.

    Returns
    -------
    schema : ibis.expr.datatypes.Schema
        An ibis schema corresponding to the types of the columns in `table`.
    """
    overrides = {} if schema is None else schema

    def resolve(name, column):
        # explicit overrides take precedence over the SQLAlchemy column type
        if name in overrides:
            return dt.dtype(overrides[name])
        dialect = getattr(table.bind, 'dialect', SQLAlchemyDialect())
        return dt.dtype(dialect, column.type, nullable=column.nullable)

    return sch.schema(
        [
            (name, resolve(name, column))
            for name, column in table.columns.items()
        ]
    )
Example #3
0
def test_literal_promotions(table, op, name, case, ex_type):
    """Check the result type of ``op`` with the literal on either side."""
    column = table[name]

    # column-first, then literal-first
    for left, right in ((column, case), (case, column)):
        outcome = op(left, right)
        assert outcome.type() == dt.dtype(ex_type)
Example #4
0
def test_interval_unvalid_unit(unit):
    """An unsupported ``unit`` must raise ValueError from parser and class.

    The constructor call passes ``unit`` first and the value type second,
    matching the other ``dt.Interval(unit, value_type)`` calls in this test
    suite. With the arguments swapped, the ValueError would be triggered by
    ``dt.int32`` being used as the unit rather than by ``unit`` itself.
    """
    definition = "interval('{}')".format(unit)

    with pytest.raises(ValueError):
        dt.dtype(definition)

    with pytest.raises(ValueError):
        dt.Interval(unit, dt.int32)
Example #5
0
def test_interval(unit):
    """Parsed interval specs must equal the directly constructed dtype.

    Each comparison is asserted; a bare ``==`` expression would be evaluated
    and discarded, so the test could never fail.
    """
    definition = "interval('{}')".format(unit)
    assert dt.Interval(unit, dt.int32) == dt.dtype(definition)

    definition = "interval<uint16>('{}')".format(unit)
    assert dt.Interval(unit, dt.uint16) == dt.dtype(definition)

    definition = "interval<int64>('{}')".format(unit)
    assert dt.Interval(unit, dt.int64) == dt.dtype(definition)
Example #6
0
def test_string_to_number(table, type):
    """Casting column ``g`` and the literal ``'5'`` yields dtype ``type``."""
    column_cast = table.g.cast(type)
    literal_cast = ibis.literal('5').cast(type).name('bar')

    checks = (
        (column_cast, ir.ColumnExpr),
        (literal_cast, ir.ScalarExpr),
    )
    for expr, expr_class in checks:
        assert isinstance(expr, expr_class)
        assert expr.type() == dt.dtype(type)

    assert literal_cast.get_name() == 'bar'
Example #7
0
def param(type):
    """Build an unbound scalar parameter whose value is supplied at execution.

    Parameters
    ----------
    type : dt.DataType
        The type of the unbound parameter, e.g., double, int64, date, etc.

    Returns
    -------
    ScalarExpr

    Examples
    --------
    >>> import ibis
    >>> import ibis.expr.datatypes as dt
    >>> start = ibis.param(dt.date)
    >>> end = ibis.param(dt.date)
    >>> schema = [('timestamp_col', 'timestamp'), ('value', 'double')]
    >>> t = ibis.table(schema)
    >>> predicates = [t.timestamp_col >= start, t.timestamp_col <= end]
    >>> expr = t.filter(predicates).value.sum()
    """
    import ibis.expr.datatypes as dt
    import ibis.expr.operations as ops

    parameter_dtype = dt.dtype(type)
    parameter_op = ops.ScalarParameter(parameter_dtype)
    return parameter_op.to_expr()
Example #8
0
def sa_array(dialect, satype, nullable=True):
    """Convert a SQLAlchemy array type into an ibis ``dt.Array``.

    Only ``satype.dimensions`` of ``None`` or ``1`` is accepted; anything
    else raises ``NotImplementedError``.
    """
    ndim = satype.dimensions
    is_flat = ndim is None or ndim == 1
    if not is_flat:
        raise NotImplementedError('Nested array types not yet supported')

    element_dtype = dt.dtype(dialect, satype.item_type)
    return dt.Array(element_dtype, nullable=nullable)
Example #9
0
def test_literal_with_implicit_type(value, expected_type):
    """``ibis.literal(value)`` is a scalar Literal of ``expected_type``."""
    literal_expr = ibis.literal(value)

    # expression-level checks
    assert isinstance(literal_expr, ir.ScalarExpr)
    assert literal_expr.type() == dt.dtype(expected_type)

    # operation-level checks: the node must hold the very same object
    assert isinstance(literal_expr.op(), ops.Literal)
    assert literal_expr.op().value is value
Example #10
0
def test_zero_subtract_literal_promotions(
    table, op, left_fn, right_fn, ex_type
):
    """Check the result type of ``op`` on operands built from ``table``."""
    # in case of zero subtract the order of operands matters
    lhs = left_fn(table)
    rhs = right_fn(table)
    outcome = op(lhs, rhs)

    assert outcome.type() == dt.dtype(ex_type)
Example #11
0
def trans_struct(t, context):
    """Render ``t`` as a ``STRUCT<name type, ...>`` string.

    Each field type is passed through ``dt.dtype`` and then translated with
    ``ibis_type_to_bigquery_type`` using ``context``.
    """
    rendered_fields = []
    for field_name, field_type in zip(t.names, t.types):
        bq_type = ibis_type_to_bigquery_type(dt.dtype(field_type), context)
        rendered_fields.append('{} {}'.format(field_name, bq_type))
    return 'STRUCT<{}>'.format(', '.join(rendered_fields))
Example #12
0
def infer_parquet_schema(schema):
    """Build an ibis schema from ``schema.to_arrow_schema()``.

    Fields whose name matches ``__index_level_<n>__`` are left out of the
    result.
    """
    kept = []

    for field in schema.to_arrow_schema():
        # the dtype is converted before the name filter, for every field
        converted = dt.dtype(field.type, nullable=field.nullable)
        column = field.name
        if re.match(r'^__index_level_\d+__$', column):
            continue
        kept.append((column, converted))

    return sch.schema(kept)
Example #13
0
def shape_like(arg, dtype=None):
    """Return the column or scalar expression type matching ``arg``'s shape.

    For a tuple, list or ``ir.ListExpr`` the datatype defaults to
    ``highest_precedence_dtype(arg)`` and the result is columnar when
    ``util.any_of(arg, ir.AnyColumn)`` is truthy. Otherwise the datatype
    defaults to ``arg.type()`` and the result is columnar when ``arg`` is an
    ``ir.AnyColumn``. A truthy ``dtype`` overrides the default datatype.
    """
    is_sequence = isinstance(arg, (tuple, list, ir.ListExpr))
    if is_sequence:
        raw_dtype = dtype or highest_precedence_dtype(arg)
        is_column = util.any_of(arg, ir.AnyColumn)
    else:
        raw_dtype = dtype or arg.type()
        is_column = isinstance(arg, ir.AnyColumn)

    resolved = dt.dtype(raw_dtype)
    return resolved.column_type() if is_column else resolved.scalar_type()
Example #14
0
File: udf.py Project: cloudera/ibis
def parse_type(t):
    """Map a type name string to its ibis type name.

    Parameters
    ----------
    t : str
        Type name; it is lower-cased before lookup.

    Returns
    -------
    The ``_impala_to_ibis_type`` entry when ``t`` is a key of that mapping,
    ``'string'`` when ``t`` contains ``'char'`` (which also covers
    ``'varchar'``), or the lower-cased ``t`` itself for decimal specs that
    ``dt.dtype`` accepts.

    Raises
    ------
    ValueError
        If ``t`` contains ``'decimal'`` but ``dt.dtype(t)`` returns a falsy
        result. Previously the exception object was *returned* rather than
        raised, so callers received a ``ValueError`` instance as the "type".
    Exception
        If ``t`` matches none of the cases above.
    """
    t = t.lower()
    if t in _impala_to_ibis_type:
        return _impala_to_ibis_type[t]

    # 'varchar' contains 'char', so one substring test covers both
    if 'char' in t:
        return 'string'

    if 'decimal' in t:
        result = dt.dtype(t)
        if result:
            return t
        raise ValueError(t)

    raise Exception(t)
Example #15
0
def value(dtype, arg):
    """Check that ``arg`` is a value expression compatible with ``dtype``.

    Parameters
    ----------
    dtype : DataType subclass or DataType instance
    arg : python literal or an ibis expression
      If a python literal is given the validator tries to coerce it to an ibis
      literal.

    Returns
    -------
    arg : AnyValue
      An ibis value expression with the specified datatype
    """
    # anything that is not already an expression is wrapped as a literal
    if not isinstance(arg, ir.Expr):
        arg = ir.literal(arg)

    if not isinstance(arg, ir.AnyValue):
        raise com.IbisTypeError(
            'Given argument with type {} is not a value '
            'expression'.format(type(arg))
        )

    # literal payload (if any) takes part in the implicit cast check
    literal_value = getattr(arg.op(), 'value', None)

    # a dtype *class* was given, like dt.Interval or dt.Decimal
    if isinstance(dtype, type) and isinstance(arg.type(), dtype):
        return arg

    # a dtype instance or string was given and arg's dtype is implicitly
    # castable to it, like dt.int8 is castable to dt.int64
    if dt.castable(arg.type(), dt.dtype(dtype), value=literal_value):
        return arg

    raise com.IbisTypeError(
        'Given argument with datatype {} is not '
        'subtype of {} nor implicitly castable to '
        'it'.format(arg.type(), dtype)
    )
Example #16
0
def test_struct():
    """A nested ``array<struct<...>>`` spec parses to the matching dtype."""
    orders = """array<struct<
                    oid: int64,
                    status: string,
                    totalprice: decimal(12, 2),
                    order_date: string,
                    items: array<struct<
                        iid: int64,
                        name: string,
                        price: decimal(12, 2),
                        discount_perc: decimal(12, 2),
                        shipdate: string
                    >>
                >>"""

    # inner struct describing one element of the ``items`` array
    item_struct = dt.Struct.from_tuples(
        [
            ('iid', dt.int64),
            ('name', dt.string),
            ('price', dt.Decimal(12, 2)),
            ('discount_perc', dt.Decimal(12, 2)),
            ('shipdate', dt.string),
        ]
    )
    # outer struct describing one order
    order_struct = dt.Struct.from_tuples(
        [
            ('oid', dt.int64),
            ('status', dt.string),
            ('totalprice', dt.Decimal(12, 2)),
            ('order_date', dt.string),
            ('items', dt.Array(item_struct)),
        ]
    )
    expected = dt.Array(order_struct)

    assert dt.dtype(orders) == expected
Example #17
0
def test_char_varchar_invalid(spec):
    """``dt.dtype(spec)`` must raise ``IbisTypeError`` for each ``spec``."""
    expected_error = IbisTypeError
    with pytest.raises(expected_error):
        dt.dtype(spec)
Example #18
0
def literal(value, type=None):
    """Wrap a Python value in a scalar literal expression.

    Parameters
    ----------
    value : some Python basic type
        A Python value
    type : ibis type or string, optional
        An instance of :class:`ibis.expr.datatypes.DataType` or a string
        naming the ibis type of `value`. Only needed when ibis's type
        inference isn't sufficient for discovering the type of `value`.

    Returns
    -------
    literal_value : Literal
        An expression representing a literal value

    Examples
    --------
    >>> import ibis
    >>> x = ibis.literal(42)
    >>> x.type()
    int8
    >>> y = ibis.literal(42, type='double')
    >>> y.type()
    float64
    >>> ibis.literal('foobar', type='int64')  # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
    TypeError: Value 'foobar' cannot be safely coerced to int64
    """
    import ibis.expr.datatypes as dt
    import ibis.expr.operations as ops

    # an expression that already wraps a Literal is returned untouched
    if hasattr(value, 'op') and isinstance(value.op(), ops.Literal):
        return value

    has_inferred = True
    try:
        inferred_dtype = dt.infer(value)
    except com.InputTypeError:
        has_inferred = False

    has_explicit = type is not None
    if has_explicit:
        explicit_dtype = dt.dtype(type)

    if not has_explicit and not has_inferred:
        raise TypeError(
            'The datatype of value {!r} cannot be inferred, try '
            'passing it explicitly with the `type` keyword.'.format(value)
        )

    if has_explicit and has_inferred:
        try:
            # ensure type correctness: check that the inferred dtype is
            # implicitly castable to the explicitly given dtype and value
            dtype = inferred_dtype.cast(explicit_dtype, value=value)
        except com.IbisTypeError:
            raise TypeError(
                'Value {!r} cannot be safely coerced to {}'.format(value, type)
            )
    elif has_explicit:
        dtype = explicit_dtype
    else:
        dtype = inferred_dtype

    if dtype is dt.null:
        return null().cast(dtype)
    return ops.Literal(value, dtype=dtype).to_expr()
Example #19
0
def test_decimal_failure(case):
    """``dt.dtype(case)`` must raise ``IbisTypeError`` for each ``case``."""
    expected_error = IbisTypeError
    with pytest.raises(expected_error):
        dt.dtype(case)
Example #20
0
    def _grouped(input_type, output_type, base_class, output_type_method):
        """Define a user-defined function that is applied per group.

        Parameters
        ----------
        input_type : List[ibis.expr.datatypes.DataType]
            A list of the types found in :mod:`~ibis.expr.datatypes`. The
            length of this list must match the number of arguments to the
            function. Variadic arguments are not yet supported.
        output_type : ibis.expr.datatypes.DataType
            The return type of the function.
        base_class : Type[T]
            The base class of the generated Node
        output_type_method : Callable
            A callable that determines the method to call to get the expression
            type of the UDF

        Returns
        -------
        Callable
            A decorator. Applied to a function, it creates a node class named
            after that function, registers execution rules for it with
            ``execute_node``, and returns a wrapper that builds the node from
            its positional arguments and returns ``to_expr()`` of it.

        See Also
        --------
        ibis.pandas.udf.reduction
        ibis.pandas.udf.analytic
        """
        # Normalize every declared type through ``dt.dtype`` up front.
        input_type = list(map(dt.dtype, input_type))
        output_type = dt.dtype(output_type)

        def wrapper(func):
            funcsig = valid_function_signature(input_type, func)

            # Dynamically create the node class; it is named after ``func``.
            UDAFNode = type(
                func.__name__,
                (base_class, ),
                {
                    'signature': sig.TypeSignature.from_dtypes(input_type),
                    'output_type': output_type_method(output_type),
                },
            )

            # An execution rule for a simple aggregate node
            @execute_node.register(UDAFNode,
                                   *udf_signature(input_type,
                                                  pin=None,
                                                  klass=pd.Series))
            def execute_udaf_node(op, *args, **kwargs):
                args, kwargs = arguments_from_signature(
                    funcsig, *args, **kwargs)
                return func(*args, **kwargs)

            # An execution rule for a grouped aggregation node. This
            # includes aggregates applied over a window.
            nargs = len(input_type)
            # One signature per argument position, built with ``pin`` set to
            # that position.
            group_by_signatures = [
                udf_signature(input_type, pin=pin, klass=SeriesGroupBy)
                for pin in range(nargs)
            ]

            # Register the same implementation once per signature above.
            @toolz.compose(*(execute_node.register(UDAFNode, *types)
                             for types in group_by_signatures))
            def execute_udaf_node_groupby(op, *args, **kwargs):
                # construct a generator that yields the next group of data
                # for every argument excluding the first (pandas performs
                # the iteration for the first argument) for each argument
                # that is a SeriesGroupBy.
                #
                # If the argument is not a SeriesGroupBy then keep
                # repeating it until all groups are exhausted.
                aggcontext = kwargs.pop('aggcontext', None)
                assert aggcontext is not None, 'aggcontext is None'

                if isinstance(aggcontext, Window):
                    # Call the func differently for Window because of
                    # the custom rolling logic.
                    result = aggcontext.agg(args[0], func, *args, **kwargs)
                else:
                    iters = create_gens_from_args_groupby(args[1:])
                    # NOTE: this assignment makes ``funcsig`` a local of
                    # ``execute_udaf_node_groupby`` (shadowing the one
                    # computed in ``wrapper``); ``aggregator`` below closes
                    # over this local.
                    funcsig = signature(func)

                    # TODO: Unify calling convention here to be more like
                    # window
                    def aggregator(first, *rest, **kwargs):
                        # map(next, *rest) gets the inputs for the next group
                        # TODO: might be inefficient to do this on every call
                        args, kwargs = arguments_from_signature(
                            funcsig, first, *map(next, rest), **kwargs)
                        return func(*args, **kwargs)

                    result = aggcontext.agg(args[0], aggregator, *iters,
                                            **kwargs)

                return result

            # What the decorated name resolves to: a builder that turns its
            # positional arguments into a node expression.
            @functools.wraps(func)
            def wrapped(*args):
                return UDAFNode(*args).to_expr()

            return wrapped

        return wrapper
Example #21
0
def test_pandas_dtype(pandas_dtype, ibis_dtype):
    """``dt.dtype(pandas_dtype)`` must compare equal to ``ibis_dtype``."""
    converted = dt.dtype(pandas_dtype)
    assert converted == ibis_dtype
Example #22
0
def test_token_error():
    """The spec ``'array<string>>'`` must raise ``IbisTypeError``."""
    unbalanced_spec = 'array<string>>'
    with pytest.raises(IbisTypeError):
        dt.dtype(unbalanced_spec)
Example #23
0
def test_map_does_not_allow_non_primitive_keys():
    """A map spec keyed by ``array<string>`` must raise ``IbisTypeError``."""
    array_keyed_spec = 'map<array<string>, double>'
    with pytest.raises(IbisTypeError):
        dt.dtype(array_keyed_spec)
Example #24
0
def test_array():
    """The upper-case spec ``'ARRAY<DOUBLE>'`` parses to an array of double."""
    parsed = dt.dtype('ARRAY<DOUBLE>')
    expected = dt.Array(dt.double)
    assert parsed == expected
Example #25
0
def test_timestamp_with_timezone_parser_double_quote():
    """A timestamp spec with a timezone argument keeps that timezone."""
    # NOTE(review): despite the test name, the timezone inside the spec is
    # wrapped in single quotes -- confirm whether a double-quoted argument
    # was intended.
    spec = "timestamp('US/Eastern')"
    parsed = dt.dtype(spec)
    assert isinstance(parsed, dt.Timestamp)
    assert parsed.timezone == 'US/Eastern'
Example #26
0
def test_char_varchar_invalid(spec):
    """``dt.dtype(spec)`` must raise ``SyntaxError`` for each ``spec``."""
    expected_error = SyntaxError
    with pytest.raises(expected_error):
        dt.dtype(spec)
Example #27
0
def test_primitive(spec, expected):
    """``dt.dtype(spec)`` must compare equal to ``expected``."""
    parsed = dt.dtype(spec)
    assert parsed == expected
Example #28
0
def test_char_varchar(spec):
    """``dt.dtype(spec)`` must equal ``dt.string`` for each ``spec``."""
    parsed = dt.dtype(spec)
    assert parsed == dt.string
Example #29
0
        (operator.pow, 'b', 1.5, 'double'),
        (operator.pow, 'c', 1.5, 'double'),
        (operator.pow, 'd', 1.5, 'double'),
        (operator.pow, 'e', 2, 'float'),
        (operator.pow, 'f', 2, 'double'),
        (operator.pow, 'a', -2, 'double'),
        (operator.pow, 'b', -2, 'double'),
        (operator.pow, 'c', -2, 'double'),
        (operator.pow, 'd', -2, 'double'),
    ],
    ids=lambda arg: str(getattr(arg, '__name__', arg)))
def test_literal_promotions(table, op, name, case, ex_type):
    """Check the result type of ``op`` with the literal on either side."""
    column = table[name]

    # column-first, then literal-first
    for left, right in ((column, case), (case, column)):
        outcome = op(left, right)
        assert outcome.type() == dt.dtype(ex_type)


@pytest.mark.parametrize(('op', 'left_fn', 'right_fn', 'ex_type'), [
    (operator.sub, lambda t: t['a'], lambda t: 0, 'int8'),
    (operator.sub, lambda t: 0, lambda t: t['a'], 'int16'),
    (operator.sub, lambda t: t['b'], lambda t: 0, 'int16'),
    (operator.sub, lambda t: 0, lambda t: t['b'], 'int32'),
    (operator.sub, lambda t: t['c'], lambda t: 0, 'int32'),
    (operator.sub, lambda t: 0, lambda t: t['c'], 'int64'),
],
                         ids=lambda arg: str(getattr(arg, '__name__', arg)))
def test_zero_subtract_literal_promotions(table, op, left_fn, right_fn,
Example #30
0
def spark_dataframe_schema(df):
    """Infer the schema of a Spark SQL `DataFrame` object."""
    # df.schema is a pt.StructType
    struct_dtype = dt.dtype(df.schema)
    field_names = struct_dtype.names
    field_types = struct_dtype.types

    return sch.schema(field_names, field_types)
Example #31
0
def test_timestamp_with_timezone_parser_double_quote():
    """A timestamp spec with a timezone argument keeps that timezone."""
    # NOTE(review): despite the test name, the timezone inside the spec is
    # wrapped in single quotes -- confirm whether a double-quoted argument
    # was intended.
    spec = "timestamp('US/Eastern')"
    parsed = dt.dtype(spec)
    assert isinstance(parsed, dt.Timestamp)
    assert parsed.timezone == 'US/Eastern'
Example #32
0
def test_timestamp_with_timezone_parser_invalid_timezone():
    """The spec ``timestamp('US/Ea')`` parses and round-trips via ``str``."""
    parsed = dt.dtype("timestamp('US/Ea')")
    rendered = str(parsed)
    assert rendered == "timestamp('US/Ea')"
Example #33
0
def test_empty_complex_type():
    """The spec ``'map<>'`` must raise ``IbisTypeError``."""
    empty_map_spec = 'map<>'
    with pytest.raises(IbisTypeError):
        dt.dtype(empty_map_spec)
Example #34
0
def test_nested_array():
    """``'array<array<string>>'`` parses to an array of string arrays."""
    parsed = dt.dtype('array<array<string>>')
    expected = dt.Array(dt.Array(dt.string))
    assert parsed == expected
Example #35
0
def test_time_valid():
    """``dt.dtype('time')`` must satisfy ``.equals(dt.time)``."""
    parsed = dt.dtype('time')
    assert parsed.equals(dt.time)
Example #36
0
def test_interval_invalid_type():
    """A float value type raises TypeError from constructor and parser."""
    # direct construction
    with pytest.raises(TypeError):
        dt.Interval('m', dt.float32)

    # string spec
    float_interval_spec = "interval<float>('s')"
    with pytest.raises(TypeError):
        dt.dtype(float_interval_spec)
Example #37
0
def test_empty_complex_type():
    """The spec ``'map<>'`` must raise ``IbisTypeError``."""
    empty_map_spec = 'map<>'
    with pytest.raises(IbisTypeError):
        dt.dtype(empty_map_spec)
Example #38
0
def test_time_valid():
    """``dt.dtype('time')`` must satisfy ``.equals(dt.time)``."""
    parsed = dt.dtype('time')
    assert parsed.equals(dt.time)
Example #39
0
def literal(value: Any, type: dt.DataType | str | None = None) -> ScalarExpr:
    """Wrap a Python value in a scalar literal expression.

    !!! tip "Use specific functions for arrays, structs and maps"

        Ibis supports literal construction of arrays, structs and maps using
        the following functions:

        1. [`ibis.array`][ibis.array]
        1. [`ibis.struct`][ibis.struct]
        1. [`ibis.map`][ibis.map]

        Constructing these types using `literal` will be deprecated in a future
        release.

    Parameters
    ----------
    value
        A Python value
    type
        An instance of [`DataType`][ibis.expr.datatypes.DataType] or a string
        naming the ibis type of `value`. Useful when ibis's type inference
        isn't sufficient for discovering the type of `value`.

    Returns
    -------
    ScalarExpr
        An expression representing a literal value

    Examples
    --------
    Construct an integer literal

    >>> import ibis
    >>> x = ibis.literal(42)
    >>> x.type()
    Int8(nullable=True)

    Construct a `float64` literal from an `int`

    >>> y = ibis.literal(42, type='double')
    >>> y.type()
    Float64(nullable=True)

    Ibis checks for invalid types

    >>> ibis.literal('foobar', type='int64')  # doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
    TypeError: Value 'foobar' cannot be safely coerced to int64
    """
    import ibis.expr.datatypes as dt
    import ibis.expr.operations as ops

    # an expression that already wraps a Literal is returned untouched
    if hasattr(value, 'op') and isinstance(value.op(), ops.Literal):
        return value

    has_inferred = True
    try:
        inferred_dtype = dt.infer(value)
    except com.InputTypeError:
        has_inferred = False

    has_explicit = type is not None
    if has_explicit:
        explicit_dtype = dt.dtype(type)

    if not has_explicit and not has_inferred:
        raise TypeError(
            'The datatype of value {!r} cannot be inferred, try '
            'passing it explicitly with the `type` keyword.'.format(value)
        )

    if has_explicit and has_inferred:
        try:
            # ensure type correctness: check that the inferred dtype is
            # implicitly castable to the explicitly given dtype and value
            dtype = inferred_dtype.cast(explicit_dtype, value=value)
        except com.IbisTypeError:
            raise TypeError(
                f'Value {value!r} cannot be safely coerced to {type}'
            )
    elif has_explicit:
        dtype = explicit_dtype
    else:
        dtype = inferred_dtype

    if dtype is dt.null:
        return null().cast(dtype)

    # the value is normalized for its dtype before being wrapped
    normalized = dt._normalize(dtype, value)
    return ops.Literal(normalized, dtype=dtype).to_expr()
Example #40
0
def test_map():
    """``'map<string, double>'`` parses to ``Map(string, double)``."""
    parsed = dt.dtype('map<string, double>')
    expected = dt.Map(dt.string, dt.double)
    assert parsed == expected
Example #41
0
def test_numpy_dtype(numpy_dtype, ibis_dtype):
    """``dt.dtype`` of ``np.dtype(numpy_dtype)`` must equal ``ibis_dtype``."""
    as_numpy = np.dtype(numpy_dtype)
    converted = dt.dtype(as_numpy)
    assert converted == ibis_dtype
Example #42
0
def test_char_varchar_invalid(spec):
    """``dt.dtype(spec)`` must raise ``IbisTypeError`` for each ``spec``."""
    expected_error = IbisTypeError
    with pytest.raises(expected_error):
        dt.dtype(spec)
Example #43
0
 def __init__(self, input_type, output_type):
     """Store the declared types, each converted with ``dt.dtype``.

     ``spark_output_type`` is ``spark_dtype`` applied to the converted
     output type.
     """
     self.input_type = [dt.dtype(arg_type) for arg_type in input_type]
     converted_output = dt.dtype(output_type)
     self.output_type = converted_output
     self.spark_output_type = spark_dtype(self.output_type)
Example #44
0
def test_map_does_not_allow_non_primitive_keys():
    """A map spec keyed by ``array<string>`` must raise ``SyntaxError``."""
    array_keyed_spec = 'map<array<string>, double>'
    with pytest.raises(SyntaxError):
        dt.dtype(array_keyed_spec)
Example #45
0
def test_char_varchar(spec):
    """``dt.dtype(spec)`` must equal ``dt.string`` for each ``spec``."""
    parsed = dt.dtype(spec)
    assert parsed == dt.string
def test_dtype(spec, expected):
    """``dt.dtype(spec)`` must compare equal to ``expected``."""
    parsed = dt.dtype(spec)
    assert parsed == expected
Example #47
0
def test_primitive_from_string(spec, expected):
    """``dt.dtype(spec)`` must compare equal to ``expected``."""
    parsed = dt.dtype(spec)
    assert parsed == expected
Example #48
0
File: client.py Project: shshe/ibis
def infer_numpy_scalar(value):
    """Return ``dt.dtype`` applied to ``value.dtype``."""
    source_dtype = value.dtype
    return dt.dtype(source_dtype)
Example #49
0
def test_timestamp_with_timezone_parser_invalid_timezone():
    """The spec ``timestamp('US/Ea')`` parses and round-trips via ``str``."""
    parsed = dt.dtype("timestamp('US/Ea')")
    rendered = str(parsed)
    assert rendered == "timestamp('US/Ea')"
Example #50
0
File: client.py Project: shshe/ibis
def infer_array(value):
    """Return a ``dt.Array`` whose element type comes from ``value.dtype.name``."""
    # TODO(kszucs): infer series
    element_dtype = dt.dtype(value.dtype.name)
    return dt.Array(element_dtype)
Example #51
0
def test_interval_invalid_type():
    """A float value type raises TypeError from constructor and parser."""
    # direct construction
    with pytest.raises(TypeError):
        dt.Interval('m', dt.float32)

    # string spec
    float_interval_spec = "interval<float>('s')"
    with pytest.raises(TypeError):
        dt.dtype(float_interval_spec)
Example #52
0
def test_empty_complex_type():
    """The spec ``'map<>'`` must raise ``parsy.ParseError``."""
    empty_map_spec = 'map<>'
    with pytest.raises(parsy.ParseError):
        dt.dtype(empty_map_spec)
Example #53
0
def test_string_argument_parsing_failure_mode(case):
    """``dt.dtype(case)`` must raise ``IbisTypeError`` for each ``case``."""
    expected_error = IbisTypeError
    with pytest.raises(expected_error):
        dt.dtype(case)
Example #54
0
def test_token_error():
    """The spec ``'array<string>>'`` must raise ``parsy.ParseError``."""
    unbalanced_spec = 'array<string>>'
    with pytest.raises(parsy.ParseError):
        dt.dtype(unbalanced_spec)
Example #55
0
def test_dtype(spec, expected):
    """``dt.dtype(spec)`` must compare equal to ``expected``."""
    parsed = dt.dtype(spec)
    assert parsed == expected
Example #56
0
def test_primitive_from_string(spec, expected):
    """``dt.dtype(spec)`` must compare equal to ``expected``."""
    parsed = dt.dtype(spec)
    assert parsed == expected
Example #57
0
def test_token_error():
    """The spec ``'array<string>>'`` must raise ``IbisTypeError``."""
    unbalanced_spec = 'array<string>>'
    with pytest.raises(IbisTypeError):
        dt.dtype(unbalanced_spec)
Example #58
0
def test_char_varchar_invalid(spec):
    """``dt.dtype(spec)`` must raise ``parsy.ParseError`` for each ``spec``."""
    expected_error = parsy.ParseError
    with pytest.raises(expected_error):
        dt.dtype(spec)
Example #59
0
def test_nested_map():
    """A map whose values are arrays of maps parses to the nested dtype."""
    innermost = dt.Map(dt.string, dt.int8)
    expected = dt.Map(dt.int64, dt.Array(innermost))
    parsed = dt.dtype('map<int64, array<map<string, int8>>>')
    assert parsed == expected
Example #60
0
def datatype(arg):
    """Convert ``arg`` with ``dt.dtype`` and return the result."""
    converted = dt.dtype(arg)
    return converted