Beispiel #1
0
def _get_common_dtype(lhs_dtype, rhs_dtype):
    """
    Get data type for a binary operation result.

    Parameters
    ----------
    lhs_dtype : dtype
        The type of the first operand.
    rhs_dtype : dtype
        The type of the second operand.

    Returns
    -------
    dtype
        `lhs_dtype` if both types are equal, float if at least one
        operand is float, int if both operands are integers.

    Raises
    ------
    TypeError
        If the types differ, none of them is float and at least
        one of them is not an integer type.
    """
    if lhs_dtype == rhs_dtype:
        return lhs_dtype
    if is_float_dtype(lhs_dtype) or is_float_dtype(rhs_dtype):
        return get_dtype(float)
    # An explicit check is used instead of `assert`: assertions are removed
    # when Python runs with -O, which would silently yield an int result.
    if not (is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype)):
        raise TypeError(
            f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}"
        )
    return get_dtype(int)
Beispiel #2
0
    def _get_bin_op_res_type(self, op_name, lhs_dtype, rhs_dtype):
        """
        Determine the data type produced by a binary operation.

        Parameters
        ----------
        op_name : str
            A binary operation name.
        lhs_dtype : dtype
            A left operand's type.
        rhs_dtype : dtype
            A right operand's type.

        Returns
        -------
        dtype
            The common operand type for operations listed in
            ``preserve_dtype_math_ops``, float for operations listed in
            ``promote_to_float_math_ops`` and bool for comparisons.

        Raises
        ------
        NotImplementedError
            If `op_name` belongs to none of the categories above.
        """
        if op_name in self.preserve_dtype_math_ops:
            return _get_common_dtype(lhs_dtype, rhs_dtype)
        if op_name in self.promote_to_float_math_ops:
            return get_dtype(float)
        if not is_cmp_op(op_name):
            raise NotImplementedError(
                f"unsupported binary operation {op_name}")
        return get_dtype(bool)
Beispiel #3
0
def _get_common_dtype(lhs_dtype, rhs_dtype):
    """
    Compute the result data type of a binary operation.

    Parameters
    ----------
    lhs_dtype : dtype
        The type of the first operand.
    rhs_dtype : dtype
        The type of the second operand.

    Returns
    -------
    dtype
        `lhs_dtype` when both types are equal; otherwise float if both
        operands are float/integer and at least one is float, or int
        if both operands are integers.

    Raises
    ------
    TypeError
        If the types differ and at least one of them is neither
        a float nor an integer type.
    """
    if lhs_dtype == rhs_dtype:
        return lhs_dtype

    lhs_is_float = is_float_dtype(lhs_dtype)
    rhs_is_float = is_float_dtype(rhs_dtype)
    lhs_is_numeric = lhs_is_float or is_integer_dtype(lhs_dtype)
    rhs_is_numeric = rhs_is_float or is_integer_dtype(rhs_dtype)

    if not (lhs_is_numeric and rhs_is_numeric):
        raise TypeError(f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}")

    # Any float operand promotes the result to float
    return get_dtype(float if (lhs_is_float or rhs_is_float) else int)
Beispiel #4
0
def _agg_dtype(agg, dtype):
    """
    Compute the data type of an aggregate result.

    Parameters
    ----------
    agg : str
        Aggregate name.
    dtype : dtype
        Operand data type.

    Returns
    -------
    dtype
        `dtype` itself, int or float depending on which aggregate
        group `agg` belongs to.

    Raises
    ------
    NotImplementedError
        If `agg` is in none of the known aggregate groups.
    """
    if agg in _aggs_preserving_numeric_type:
        return dtype
    if agg in _aggs_with_int_result:
        return get_dtype(int)
    if agg in _aggs_with_float_result:
        return get_dtype(float)
    raise NotImplementedError(f"unsupported aggreagte {agg}")
Beispiel #5
0
 def __init__(self, val):
     """
     Store a literal value together with its data type.

     Parameters
     ----------
     val : int, float, bool, str, numpy integer or None
         The literal value. ``None`` gets the float data type.
     """
     supported_types = (int, float, bool, str, np.int8, np.int16, np.int32, np.int64)
     is_supported = val is None or isinstance(val, supported_types)
     assert is_supported, f"unsupported literal value {val} of type {type(val)}"
     self.val = val
     # A None literal has no Python type to derive from, float is used
     self._dtype = get_dtype(float if val is None else type(val))
Beispiel #6
0
 def _get_bin_op_res_type(self, op_name, lhs_dtype, rhs_dtype):
     """
     Return the result data type for a binary operation.

     Parameters
     ----------
     op_name : str
         A binary operation name.
     lhs_dtype : dtype
         A left operand's type.
     rhs_dtype : dtype
         A right operand's type.

     Returns
     -------
     dtype
         The result data type.

     Raises
     ------
     NotImplementedError
         If the operation is neither a known math operation
         nor a comparison.
     """
     if op_name in self.preserve_dtype_math_ops:
         return _get_common_dtype(lhs_dtype, rhs_dtype)

     if op_name in self.promote_to_float_math_ops:
         py_type = float
     elif is_cmp_op(op_name):
         py_type = bool
     else:
         raise NotImplementedError(f"unsupported binary operation {op_name}")
     return get_dtype(py_type)
Beispiel #7
0
def build_row_idx_filter_expr(row_idx, row_col):
    """
    Build an expression selecting rows by their rowid.

    Parameters
    ----------
    row_idx : int or list of int
        The row numeric indices to select.
    row_col : InputRefExpr
        The rowid column reference expression.

    Returns
    -------
    BaseExpr
        A single equality check for a scalar `row_idx`, otherwise
        an "OR" expression over per-index equality checks.
    """
    if is_list_like(row_idx):
        conditions = [row_col.eq(idx) for idx in row_idx]
        return OpExpr("OR", conditions, get_dtype(bool))
    return row_col.eq(row_idx)
Beispiel #8
0
        def gen_reduce_expr(self):
            """
            Build the final expression of a compound aggregate.

            The expression combines the count, sum and sum of squares
            partial results referenced from the argument's frame.

            Returns
            -------
            BaseExpr
                A final compound aggregate expression.
            """
            frame = self._arg.modin_frame
            make_ref = self._builder._ref

            count_expr = make_ref(frame, self._count_name)
            count_expr._dtype = get_dtype(int)
            sum_expr = make_ref(frame, self._sum_name)
            sum_expr._dtype = self._sum_dtype
            qsum_expr = make_ref(frame, self._quad_sum_name)
            qsum_expr._dtype = self._sum_dtype

            # Divisors are replaced with a NULL literal when they would be zero
            null_expr = LiteralExpr(None)
            is_empty = count_expr.eq(LiteralExpr(0))
            count_or_null = build_if_then_else(is_empty, null_expr, count_expr,
                                               count_expr._dtype)
            is_single = count_expr.eq(LiteralExpr(1))
            count_minus_one = count_expr.sub(LiteralExpr(1))
            count_m_1_or_null = build_if_then_else(is_single, null_expr,
                                                   count_minus_one,
                                                   count_expr._dtype)

            # sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1))
            squared_sum = sum_expr.mul(sum_expr)
            deviation_sum = qsum_expr.sub(squared_sum.truediv(count_or_null))
            variance = deviation_sum.truediv(count_m_1_or_null)
            return variance.pow(LiteralExpr(0.5))
Beispiel #9
0
        def gen_reduce_expr(self):
            """
            Build the final expression of a compound aggregate.

            The expression combines the count, sum, sum of squares and
            sum of cubes partial results referenced from the argument's
            frame, following the formula in the comment below.

            Returns
            -------
            BaseExpr
                The aggregate expression, NULL when the count is 2 or less.
            """
            frame = self._arg.modin_frame
            make_ref = self._builder._ref

            count_expr = make_ref(frame, self._count_name)
            count_expr._dtype = get_dtype(int)
            sum_expr = make_ref(frame, self._sum_name)
            sum_expr._dtype = self._sum_dtype
            qsum_expr = make_ref(frame, self._quad_sum_name)
            qsum_expr._dtype = self._sum_dtype
            csum_expr = make_ref(frame, self._cube_sum_name)
            csum_expr._dtype = self._sum_dtype

            mean_expr = sum_expr.truediv(count_expr)

            # n * sqrt(n - 1) / (n - 2)
            #  * (sum(x ** 3) - 3 * mean * sum(x * x) + 2 * mean * mean * sum(x))
            #  / (sum(x * x) - mean * sum(x)) ** 1.5
            sqrt_n_minus_1 = count_expr.sub(LiteralExpr(1)).pow(LiteralExpr(0.5))
            n_minus_2 = count_expr.sub(LiteralExpr(2))
            scale = count_expr.mul(sqrt_n_minus_1).truediv(n_minus_2)

            triple_term = mean_expr.mul(qsum_expr).mul(LiteralExpr(3.0))
            double_term = mean_expr.mul(mean_expr).mul(sum_expr).mul(LiteralExpr(2.0))
            numerator = csum_expr.sub(triple_term).add(double_term)

            denominator = qsum_expr.sub(mean_expr.mul(sum_expr)).pow(LiteralExpr(1.5))
            skew_expr = scale.mul(numerator).truediv(denominator)

            # The result is NULL if n <= 2
            too_few_values = count_expr.le(LiteralExpr(2))
            return build_if_then_else(
                too_few_values, LiteralExpr(None), skew_expr, skew_expr._dtype
            )
Beispiel #10
0
        def gen_reduce_expr(self):
            """
            Generate the final expression for a compound aggregate.

            Uses the count, sum and sum of squares columns of the
            argument's frame as inputs.

            Returns
            -------
            BaseExpr
                A final compound aggregate expression.
            """
            src_frame = self._arg.modin_frame
            builder = self._builder

            n = builder._ref(src_frame, self._count_name)
            n._dtype = get_dtype(int)
            total = builder._ref(src_frame, self._sum_name)
            total._dtype = self._sum_dtype
            total_sq = builder._ref(src_frame, self._quad_sum_name)
            total_sq._dtype = self._sum_dtype

            # The same NULL literal stands in for both zero divisors
            null = LiteralExpr(None)
            n_or_null = build_if_then_else(n.eq(LiteralExpr(0)), null, n, n._dtype)
            n_m_1_or_null = build_if_then_else(
                n.eq(LiteralExpr(1)), null, n.sub(LiteralExpr(1)), n._dtype
            )

            # sqrt((sum(x * x) - sum(x) * sum(x) / n) / (n - 1))
            mean_correction = total.mul(total).truediv(n_or_null)
            radicand = total_sq.sub(mean_correction).truediv(n_m_1_or_null)
            return radicand.pow(LiteralExpr(0.5))
Beispiel #11
0
    def bin_op(self, other, op_name):
        """
        Create an expression for a binary operation with `other`.

        Parameters
        ----------
        other : BaseExpr
            The second operand.
        op_name : str
            A binary operation name.

        Returns
        -------
        BaseExpr
            The resulting binary operation expression.

        Raises
        ------
        NotImplementedError
            If `op_name` is not among the supported binary operations.
        """
        if op_name not in self.binary_operations:
            raise NotImplementedError(f"unsupported binary operation {op_name}")

        if is_cmp_op(op_name):
            return self._cmp_op(other, op_name)

        # True division may require prior cast to float to avoid integer division
        if (
            op_name == "truediv"
            and is_integer_dtype(self._dtype)
            and is_integer_dtype(other._dtype)
        ):
            other = other.cast(get_dtype(float))

        res_type = self._get_bin_op_res_type(op_name, self._dtype, other._dtype)
        operation = self.binary_operations[op_name]
        new_expr = OpExpr(operation, [self, other], res_type)

        # Floor division may require additional FLOOR expr.
        needs_floor = op_name == "floordiv" and not is_integer_dtype(res_type)
        return new_expr.floor() if needs_floor else new_expr
Beispiel #12
0
    def dtype(self):
        """
        Return the data type of the unit's block.

        Returns
        -------
        dtype
            The block's own dtype when no filling is needed, otherwise
            the dtype obtained by promoting it for the block's fill value.

        Raises
        ------
        AssertionError
            If there is no block.
        """
        blk = self.block
        if blk is None:
            raise AssertionError("Block is None, no dtype")

        if self.needs_filling:
            # maybe_promote returns a pair, its first item is the dtype
            promoted = maybe_promote(blk.dtype, blk.fill_value)
            return get_dtype(promoted[0])
        return blk.dtype
Beispiel #13
0
    def _process_join(self, op):
        """
        Translate a join operation into Calcite nodes.

        Pushes a join node, then a projection node over both frames and,
        when sorting is requested, a sort node over the key columns.

        Parameters
        ----------
        op : object
            The join operation. Its ``input``, ``on``, ``how``, ``suffixes``
            and ``sort`` attributes are used.
        """
        left, right = op.input[0], op.input[1]

        assert (
            op.on is not None
        ), "Merge with unspecified 'on' parameter is not supported in the engine"

        for col in op.on:
            in_both = col in left._table_cols and col in right._table_cols
            assert in_both, f"Column '{col}'' is missing in one of merge operands"

        # Join, only equal-join supported
        cmps = [self._ref(left, key).eq(self._ref(right, key)) for key in op.on]
        condition = OpExpr("AND", cmps, get_dtype(bool)) if len(cmps) > 1 else cmps[0]
        self._push(
            CalciteJoinNode(
                left_id=self._input_node(0).id,
                right_id=self._input_node(1).id,
                how=op.how,
                condition=condition,
            )
        )

        # Projection for both frames.
        # All left columns go first (including 'on' columns), then all right
        # columns except 'on' ones; conflicting names get the matching suffix.
        conflicting_cols = (set(left.columns) & set(right.columns)) - set(op.on)
        fields = []
        exprs = []
        # Positions of the 'on' columns in the projection
        on_idx = [-1] * len(op.on)
        for pos, name in enumerate(left.columns):
            if name in op.on:
                on_idx[op.on.index(name)] = pos
            suffix = op.suffixes[0] if name in conflicting_cols else ""
            fields.append(name + suffix)
            exprs.append(self._ref(left, name))

        for name in right.columns:
            if name in op.on:
                continue
            suffix = op.suffixes[1] if name in conflicting_cols else ""
            fields.append(name + suffix)
            exprs.append(self._ref(right, name))

        self._push(CalciteProjectionNode(fields, exprs))

        # TODO: current input translation system doesn't work here
        # because there is no frame to reference for index computation.
        # We should build calcite tree to keep references to input
        # nodes and keep scheme in calcite nodes. For now just use
        # known index on_idx.
        if op.sort is True:
            # Sort by key column
            collation = [CalciteCollation(CalciteInputIdxExpr(idx)) for idx in on_idx]
            self._push(CalciteSortNode(collation))
Beispiel #14
0
    def floor(self):
        """
        Create a "FLOOR" expression over this expression.

        Returns
        -------
        BaseExpr
            The resulting floor expression of the int data type.
        """
        int_dtype = get_dtype(int)
        return OpExpr("FLOOR", [self], int_dtype)
Beispiel #15
0
    def is_not_null(self):
        """
        Create an "IS NOT NULL" check over this expression.

        Returns
        -------
        BaseExpr
            The NOT NULL check expression of the bool data type.
        """
        return OpExpr("IS NOT NULL", [self], get_dtype(bool))
Beispiel #16
0
def _get_common_dtype(lhs_dtype, rhs_dtype):
    """
    Get data type for a binary operation result.

    Parameters
    ----------
    lhs_dtype : dtype
        The type of the first operand.
    rhs_dtype : dtype
        The type of the second operand.

    Returns
    -------
    dtype
        The result data type: `lhs_dtype` if both types are equal,
        float if any operand is float, int if both are integers.

    Raises
    ------
    TypeError
        If the types differ, none of them is float and at least
        one of them is not an integer type.
    """
    if lhs_dtype == rhs_dtype:
        return lhs_dtype
    if is_float_dtype(lhs_dtype) or is_float_dtype(rhs_dtype):
        return get_dtype(float)
    # Raise explicitly rather than `assert`: assertions are dropped under
    # `python -O`, which would let unsupported types pass as int.
    if not (is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype)):
        raise TypeError(
            f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}"
        )
    return get_dtype(int)
Beispiel #17
0
def _agg_dtype(agg, dtype):
    """
    Determine the data type of an aggregate result.

    Parameters
    ----------
    agg : str
        Aggregate name.
    dtype : dtype
        Operand data type.

    Returns
    -------
    dtype
        The aggregate data type: the operand type, int or float
        depending on the group the aggregate belongs to.

    Raises
    ------
    NotImplementedError
        If the aggregate is in none of the known groups.
    """
    if agg in _aggs_preserving_numeric_type:
        return dtype

    fixed_result_types = (
        (_aggs_with_int_result, int),
        (_aggs_with_float_result, float),
    )
    for agg_names, py_type in fixed_result_types:
        if agg in agg_names:
            return get_dtype(py_type)

    raise NotImplementedError(f"unsupported aggreagte {agg}")
Beispiel #18
0
    def bin_op(self, other, op_name):
        """
        Build a binary operation expression.

        Parameters
        ----------
        other : BaseExpr
            The second operand.
        op_name : str
            A binary operation name.

        Returns
        -------
        BaseExpr
            The resulting binary operation expression.

        Raises
        ------
        NotImplementedError
            If `op_name` is not a key of ``binary_operations``.
        """
        supported = self.binary_operations
        if op_name not in supported:
            raise NotImplementedError(f"unsupported binary operation {op_name}")

        # Comparisons have their own builder
        if is_cmp_op(op_name):
            return self._cmp_op(other, op_name)

        rhs = other
        # True division may require prior cast to float to avoid integer division
        if op_name == "truediv":
            both_ints = is_integer_dtype(self._dtype) and is_integer_dtype(rhs._dtype)
            if both_ints:
                rhs = rhs.cast(get_dtype(float))

        res_type = self._get_bin_op_res_type(op_name, self._dtype, rhs._dtype)
        new_expr = OpExpr(supported[op_name], [self, rhs], res_type)

        # Floor division may require additional FLOOR expr.
        if op_name != "floordiv" or is_integer_dtype(res_type):
            return new_expr
        return new_expr.floor()
Beispiel #19
0
    def le(self, other):
        """
        Create a "<=" comparison of this expression with `other`.

        Parameters
        ----------
        other : BaseExpr or scalar
            An operand to compare with. A non-expression value
            is wrapped into a literal expression.

        Returns
        -------
        BaseExpr
            The resulting comparison expression.
        """
        rhs = other if isinstance(other, BaseExpr) else LiteralExpr(other)
        return OpExpr("<=", [self, rhs], get_dtype(bool))
Beispiel #20
0
def _get_counts_nanvar(
    value_counts: Tuple[int, ...],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    ddof: int,
    dtype: Dtype = float,
) -> Tuple[Union[int, float, np.ndarray], Union[int, float, np.ndarray]]:
    """
    Get the count of non-null values along an axis, accounting
    for degrees of freedom.

    Parameters
    ----------
    value_counts : tuple of int
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    ddof : int
        degrees of freedom
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
        count of non-null values, NaN where the count does not exceed `ddof`
    d : scalar or array
        ``count - ddof``, NaN where the count does not exceed `ddof`
    """
    dtype = get_dtype(dtype)
    count = _get_counts(value_counts, mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if is_scalar(count):
        if count <= ddof:
            count = np.nan
            d = np.nan
    else:
        # Only touch the arrays when some position actually needs masking
        mask2: np.ndarray = count <= ddof
        if mask2.any():
            np.putmask(d, mask2, np.nan)
            np.putmask(count, mask2, np.nan)
    return count, d
Beispiel #21
0
def _get_counts(
    values_shape: Tuple[int, ...],
    mask: Optional[np.ndarray],
    axis: Optional[int],
    dtype: Dtype = float,
) -> Union[int, float, np.ndarray]:
    """
    Count the non-null values, overall or along an axis.

    Parameters
    ----------
    values_shape : tuple of int
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along, None to count over all values
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : scalar or array
    """
    count_dtype = get_dtype(dtype)

    if axis is None:
        # Count over the whole array
        total = np.prod(values_shape) if mask is None else mask.size - mask.sum()
        return count_dtype.type(total)

    if mask is None:
        count = values_shape[axis]
    else:
        count = mask.shape[axis] - mask.sum(axis)

    if is_scalar(count):
        return count_dtype.type(count)
    try:
        return count.astype(count_dtype)
    except AttributeError:
        # Not an ndarray-like object, build an array from it instead
        return np.array(count, dtype=count_dtype)
Beispiel #22
0
 def _cmp_op(self, other, op_name):
     """
     Build a comparison expression.

     Parameters
     ----------
     other : BaseExpr
         A value to compare with.
     op_name : str
         The comparison operation name.

     Returns
     -------
     BaseExpr
         The resulting comparison expression.

     Raises
     ------
     TypeError
         If the operands' comparison classes differ and the operation
         is neither 'eq' nor 'ne'.
     """
     lhs_class = self._get_dtype_cmp_class(self._dtype)
     rhs_class = self._get_dtype_cmp_class(other._dtype)

     # Pandas allows 'eq' and 'ne' comparison for values
     # of incompatible types which doesn't work in OmniSci.
     if lhs_class != rhs_class:
         if op_name not in ("eq", "ne"):
             raise TypeError(
                 f"Invalid comparison between {self._dtype} and {other._dtype}"
             )
         return LiteralExpr(op_name == "ne")

     # In OmniSci comparison with NULL always results in NULL,
     # but in Pandas it is True for 'ne' comparison and False
     # for others.
     bool_dtype = get_dtype(bool)
     cmp = OpExpr(self.binary_operations[op_name], [self, other], bool_dtype)
     null_result = LiteralExpr(op_name == "ne")
     return build_if_then_else(self.is_null(), null_result, cmp, bool_dtype)
Beispiel #23
0
def build_dt_expr(dt_operation, col_expr):
    """
    Create a "PG_EXTRACT" expression for a datetime field.

    Parameters
    ----------
    dt_operation : str
        Datetime field to extract.
    col_expr : BaseExpr
        An expression to extract from.

    Returns
    -------
    BaseExpr
        The extract expression of the int data type.
    """
    # The field name is passed to the operation as a literal operand
    operands = [LiteralExpr(dt_operation), col_expr]
    return OpExpr("PG_EXTRACT", operands, get_dtype(int))
Beispiel #24
0
    def _cmp_op(self, other, op_name):
        """
        Create a comparison expression with pandas-like NULL handling.

        Parameters
        ----------
        other : BaseExpr
            A value to compare with.
        op_name : str
            The comparison operation name.

        Returns
        -------
        BaseExpr
            The resulting comparison expression.

        Raises
        ------
        TypeError
            If the operands' comparison classes differ and the operation
            is neither 'eq' nor 'ne'.
        """
        # In OmniSci comparison with NULL always results in NULL,
        # but in pandas it is True for 'ne' comparison and False
        # for others.
        # Also pandas allows 'eq' and 'ne' comparison for values
        # of incompatible types which doesn't work in OmniSci.
        same_class = not (
            self._get_dtype_cmp_class(self._dtype)
            != self._get_dtype_cmp_class(other._dtype)
        )
        if same_class:
            res_dtype = get_dtype(bool)
            operation = self.binary_operations[op_name]
            cmp = OpExpr(operation, [self, other], res_dtype)
            return build_if_then_else(
                self.is_null(), LiteralExpr(op_name == "ne"), cmp, res_dtype
            )

        if op_name == "eq" or op_name == "ne":
            return LiteralExpr(op_name == "ne")
        raise TypeError(
            f"Invalid comparison between {self._dtype} and {other._dtype}"
        )
Beispiel #25
0
def build_row_idx_filter_expr(row_idx, row_col):
    """
    Build an expression to filter rows by rowid.

    Parameters
    ----------
    row_idx : scalar or list-like
        The row numeric indices to select.
    row_col : InputRefExpr
        The expression referencing the rowid column to filter by.

    Returns
    -------
    BaseExpr
        The filter condition: an equality check for a scalar `row_idx`,
        otherwise an "OR" expression over per-index equality checks.
    """
    if not is_list_like(row_idx):
        return row_col.eq(row_idx)

    # One equality check per requested row, combined with OR
    is_requested_row = row_col.eq
    exprs = list(map(is_requested_row, row_idx))
    return OpExpr("OR", exprs, get_dtype(bool))
Beispiel #26
0
def test_get_dtype_fails(input_param, expected_error_message):
    """Check that `com.get_dtype` raises TypeError with a matching message."""
    # python objects
    # 2020-02-02 npdev changed error message, so either wording is accepted
    pattern = expected_error_message + f"|Cannot interpret '{input_param}' as a data type"
    with pytest.raises(TypeError, match=pattern):
        com.get_dtype(input_param)
Beispiel #27
0
def test_get_dtype(input_param, result):
    """Check that `com.get_dtype` returns `result` for `input_param`."""
    actual = com.get_dtype(input_param)
    assert actual == result
Beispiel #28
0
def build_dt_expr(dt_operation, col_expr):
    """
    Build a datetime extraction expression.

    Parameters
    ----------
    dt_operation : str
        Datetime field to extract.
    col_expr : BaseExpr
        An expression to extract from.

    Returns
    -------
    BaseExpr
        A "PG_EXTRACT" expression of the int data type.
    """
    return OpExpr(
        "PG_EXTRACT", [LiteralExpr(dt_operation), col_expr], get_dtype(int)
    )
Beispiel #29
0
 def le(self, other):
     """
     Build a less or equal comparison with `other`.

     Parameters
     ----------
     other : BaseExpr or scalar
         An operand to compare with. A non-expression value
         is wrapped into a literal expression.

     Returns
     -------
     BaseExpr
         The resulting comparison expression.
     """
     operands = [self, other if isinstance(other, BaseExpr) else LiteralExpr(other)]
     return OpExpr("<=", operands, get_dtype(bool))
Beispiel #30
0
 def is_null(self):
     """
     Build an "IS NULL" check expression.

     Returns
     -------
     BaseExpr
         The NULL check expression of the bool data type.
     """
     return OpExpr("IS NULL", [self], get_dtype(bool))