Ejemplo n.º 1
0
def _project(dataframe, project_q):
    """Apply the projection (select) clause ``project_q`` to ``dataframe``.

    An empty/falsy ``project_q`` returns ``dataframe`` untouched. The query
    ``[["count"]]`` is answered with a one-row frame holding ``len(dataframe)``.
    Otherwise the expressions are classified into aggregations and aliases
    (which may not be combined), the matching step is applied, and the
    requested columns are selected from the result.

    Problems with the query are reported through ``raise_malformed``.
    """
    if not project_q:
        return dataframe

    assert_list("project", project_q)

    # A lone "count" is handled separately, roughly SQL's count(*).
    if project_q == [["count"]]:
        return DataFrame.from_dict({"count": [len(dataframe)]})

    aggregate_fns, alias_expressions = classify_expressions(project_q)
    if aggregate_fns and alias_expressions:
        raise_malformed("Cannot mix aliasing and aggregation functions", project_q)

    if isinstance(dataframe, DataFrameGroupBy):
        dataframe = _aggregate(dataframe, project_q, aggregate_fns)
    elif aggregate_fns:
        # Without a group by, the aggregation result is already the final frame.
        return _aggregate_without_group_by(dataframe, project_q, aggregate_fns)
    elif alias_expressions:
        dataframe = _alias(dataframe, alias_expressions)

    # List expressions carry the resulting column name in position 1,
    # everything else is used as the column name directly.
    columns = []
    for entry in project_q:
        columns.append(entry[1] if type(entry) is list else entry)

    try:
        return dataframe[columns]
    except KeyError:
        missing = set(columns) - set(dataframe.columns.values)
        raise_malformed("Selected columns not in table", list(missing))
Ejemplo n.º 2
0
def apply_operation(df, update_filter, column, op, value):
    """Apply ``column <op>= value`` in place to the rows of ``df`` selected by ``update_filter``.

    :param df: frame that is modified in place.
    :param update_filter: row selector handed to ``df.loc`` (e.g. a boolean mask).
    :param column: label of the column to update.
    :param op: one of ``+ - * / << >> & | ^ % **``.
    :param value: right hand operand of the operation.

    Any other operator is reported through ``raise_malformed``.
    """
    # ``.loc`` replaces the ``.ix`` indexer, which has been removed from pandas.
    # The chain of augmented assignments is repetitive, but it keeps the update
    # in place and preserves the exact operator semantics of the running Python.
    if op == '+':
        df.loc[update_filter, column] += value
    elif op == '-':
        df.loc[update_filter, column] -= value
    elif op == '*':
        df.loc[update_filter, column] *= value
    elif op == '/':
        df.loc[update_filter, column] /= value
    elif op == '<<':
        df.loc[update_filter, column] <<= value
    elif op == '>>':
        df.loc[update_filter, column] >>= value
    elif op == '&':
        df.loc[update_filter, column] &= value
    elif op == '|':
        df.loc[update_filter, column] |= value
    elif op == '^':
        df.loc[update_filter, column] ^= value
    elif op == '%':
        df.loc[update_filter, column] %= value
    elif op == '**':
        df.loc[update_filter, column] **= value
    else:
        raise_malformed('Invalid update operator', (op, value, column))
Ejemplo n.º 3
0
def _like_filter(df, q):
    """Build a row mask for a ``like``/``ilike`` expression ``[op, column, pattern]``.

    The pattern must be a quoted string. A leading ``%`` is stripped, otherwise
    the pattern is anchored with ``^``; a trailing ``%`` is stripped, otherwise
    the pattern is anchored with ``$``. The result is passed to
    ``Series.str.contains``.

    Problems with the query are reported through ``raise_malformed``.
    """
    assert_len(q, 3)
    op, column, raw_expr = q

    if not is_quoted(raw_expr):
        raise_malformed("like expects a quoted string as second argument", q)

    pattern = unquote(raw_expr)

    # Leading wildcard: drop it, otherwise anchor at the start.
    if pattern.startswith('%'):
        pattern = pattern[1:]
    else:
        pattern = '^' + pattern

    # Trailing wildcard: drop it, otherwise anchor at the end.
    if pattern.endswith('%'):
        pattern = pattern[:-1]
    else:
        pattern += '$'

    # 'like' matches case sensitively, 'ilike' ignores case.
    case_sensitive = (op == 'like')

    try:
        return df[column].str.contains(pattern, case=case_sensitive)
    except AttributeError:
        raise_malformed("Invalid column type for (i)like", q)
Ejemplo n.º 4
0
def _aggregate(dataframe_group_by, project_q, aggregate_fns):
    """Run the aggregations in ``aggregate_fns`` on a grouped frame.

    :param dataframe_group_by: group by object whose ``agg`` method is called.
    :param project_q: the original select clause, only used for error reporting.
    :param aggregate_fns: mapping passed straight to ``agg``; the values are
        the aggregation function names.
    :return: the result of ``dataframe_group_by.agg(aggregate_fns)``.

    A missing aggregation or an ``AttributeError`` from ``agg`` is reported
    through ``raise_malformed``.
    """
    if not aggregate_fns:
        raise_malformed("Aggregate function required when group_by is specified", project_q)

    try:
        return dataframe_group_by.agg(aggregate_fns)
    except AttributeError as e:
        message = str(e)
        unknown = [fn_name for fn_name in aggregate_fns.values() if str(fn_name) in message]
        # If none of the requested names shows up in the error text, fall back
        # to the text itself rather than failing with an IndexError here.
        culprit = unknown[0] if unknown else message
        raise_malformed("Unknown aggregation function '{fn}'".format(fn=culprit), project_q)
Ejemplo n.º 5
0
def _group_by(dataframe, group_by_q):
    """Group ``dataframe`` by the columns listed in ``group_by_q``.

    A falsy ``group_by_q`` returns ``dataframe`` unchanged. Otherwise the
    result of ``dataframe.groupby(group_by_q, as_index=False)`` is returned;
    a ``KeyError`` from that call is reported through ``raise_malformed``.
    """
    if group_by_q:
        assert_list("group_by", group_by_q)

        try:
            grouped = dataframe.groupby(group_by_q, as_index=False)
        except KeyError:
            raise_malformed("Group by column not in table", group_by_q)
        else:
            return grouped
    else:
        return dataframe
Ejemplo n.º 6
0
def _leaf_node(df, q):
    """Resolve a non-list node of a filter expression.

    Non-string values are returned as they are. A quoted string is returned
    without its first and last character, encoded as UTF-8. Any other string
    is looked up as a column in ``df``; a ``KeyError`` from that lookup is
    reported through ``raise_malformed``.
    """
    if not isinstance(q, basestring):
        return q

    if is_quoted(q):
        # Strip the surrounding quote characters.
        return q[1:-1].encode('utf-8')

    try:
        return df[q]
    except KeyError:
        raise_malformed("Unknown column", q)

    return q
Ejemplo n.º 7
0
def classify_expressions(project_q):
    """Split the select clause into aggregations and alias assignments.

    :return: a tuple ``(aggregate_functions, alias_expressions)``. The first
        is a dict mapping ``expression[1]`` to ``expression[0]`` for every
        aggregate expression, the second is the list of alias expressions.

    Any other list expression is reported through ``raise_malformed``;
    non-list entries are ignored here.
    """
    fns_by_column = {}
    aliases = []
    for expression in project_q:
        if is_aggregate_function(expression):
            fn_name, column = expression[0], expression[1]
            fns_by_column[column] = fn_name
            continue

        if is_alias_assignment(expression):
            aliases.append(expression)
            continue

        if type(expression) is list:
            raise_malformed("Invalid expression in select", expression)

    return fns_by_column, aliases
Ejemplo n.º 8
0
def _join_filter(df, q):
    """Combine the clauses ``q[1:]`` using the joining operator ``q[0]``.

    Every clause is evaluated with ``_do_pandas_filter`` and the results are
    folded left to right with ``JOINING_OPERATORS[q[0]]``. An expression with
    a single clause evaluates to that clause; an expression without clauses
    is reported through ``raise_malformed``.
    """
    clause_count = len(q) - 1
    if clause_count < 1:
        raise_malformed("Invalid number of arguments", q)
        return None

    # Conjunctions and disjunctions with only one clause are OK
    combined = _do_pandas_filter(df, q[1])
    if clause_count == 1:
        return combined

    for clause in q[2:]:
        combined = JOINING_OPERATORS[q[0]](combined, _do_pandas_filter(df, clause))

    return combined
Ejemplo n.º 9
0
def _bitwise_filter(df, q):
    """Build a row mask for an ``any_bits``/``all_bits`` expression ``[op, column, arg]``.

    The column is AND:ed with the integer ``arg``. For ``any_bits`` a row
    matches when at least one bit of ``arg`` is set in the column value, for
    any other operator handled here a row matches when all bits of ``arg``
    are set.

    Problems with the query are reported through ``raise_malformed``.
    """
    assert_len(q, 3)
    op, column, arg = q
    if not isinstance(arg, (int, long)):
        # Include the offending type; the original message lacked the placeholder.
        raise_malformed('Invalid argument type, must be an integer: {t}'.format(t=type(arg)), q)

    try:
        series = df[column] & arg
        if op == "any_bits":
            # Compare against zero rather than "> 0": the masked value is
            # negative when the sign bit is part of the match.
            return series != 0
        return series == arg
    except TypeError:
        raise_malformed("Invalid column type, must be an integer", q)
Ejemplo n.º 10
0
def _do_pandas_filter(df, q):
    """Evaluate the filter expression ``q`` against ``df``.

    Non-list values are resolved by ``_leaf_node``. A list is dispatched on
    its first element (the operator) to the matching ``_*_filter`` helper.

    Empty expressions, unknown operators and any ``KeyError``/``TypeError``
    raised during evaluation are reported through ``raise_malformed``.
    """
    if not isinstance(q, list):
        return _leaf_node(df, q)

    if not q:
        raise_malformed("Empty expression not allowed", q)

    op = q[0]
    try:
        # The order of the checks is significant and kept as is.
        if op in ('any_bits', 'all_bits'):
            return _bitwise_filter(df, q)

        if op == "!":
            return _not_filter(df, q)

        if op == "isnull":
            return _isnull_filter(df, q)

        if op in COMPARISON_OPERATORS:
            return _comparison_filter(df, q)

        if op in JOINING_OPERATORS:
            return _join_filter(df, q)

        if op == 'in':
            return _in_filter(df, q)

        if op in ('like', 'ilike'):
            return _like_filter(df, q)

        raise_malformed("Unknown operator", q)
    except KeyError:
        raise_malformed("Column is not defined", q)
    except TypeError:
        raise_malformed("Invalid type in argument", q)

    return None
Ejemplo n.º 11
0
    def filter(self, dataframe, filter_q):
        """Filter ``dataframe`` using the where clause ``filter_q``.

        A falsy ``filter_q`` returns ``dataframe`` unchanged. Otherwise the
        clause is turned into a query string by ``self._build_filter`` and
        handed to ``dataframe.query``. ``SyntaxError`` and ``ValueError`` from
        the query are reported through ``raise_malformed``.
        """
        if not filter_q:
            return dataframe

        assert_list('where', filter_q)
        query_string = self._build_filter(filter_q)
        try:
            # The query string may refer to variables in env (as "@env.<name>"),
            # so the local must exist under exactly this name when query runs.
            env = self.env  # noqa
            return dataframe.query(query_string)
        except SyntaxError:
            raise_malformed('Syntax error in where clause', filter_q)
        except ValueError:
            raise_malformed('Invalid type in comparison in where clause', filter_q)

        return dataframe
Ejemplo n.º 12
0
def _order_by(dataframe, order_q):
    """Sort ``dataframe`` according to the order by clause ``order_q``.

    ``order_q`` is a list of column names where a leading ``-`` requests
    descending order for that column. A falsy ``order_q`` returns
    ``dataframe`` unchanged.

    Non-string entries and a ``KeyError`` from ``sort_values`` are reported
    through ``raise_malformed``.
    """
    if not order_q:
        return dataframe

    assert_list("order_by", order_q)
    for entry in order_q:
        if not isinstance(entry, basestring):
            raise_malformed("Invalid order by format", order_q)
            break

    columns = []
    ascending = []
    for entry in order_q:
        descending = entry.startswith("-")
        # Drop the "-" marker to get the actual column name.
        columns.append(entry[1:] if descending else entry)
        ascending.append(not descending)

    try:
        return dataframe.sort_values(by=columns, ascending=ascending)
    except KeyError:
        raise_malformed("Order by column not in table", columns)
Ejemplo n.º 13
0
def _build_eval_expression(expr):
    """Render a prefix expression as a string.

    Values that are not lists are returned as they are. ``[op, a, b]`` is
    rendered as ``(a op b)`` and ``[op, a]`` as ``op(a)``, with the arguments
    rendered recursively. Lists of any other length are reported through
    ``raise_malformed``.
    """
    if type(expr) is not list:
        return expr

    argument_count = len(expr) - 1

    if argument_count == 2:
        # Binary operator, rendered infix.
        op, left, right = expr
        rendered_left = _build_eval_expression(left)
        rendered_right = _build_eval_expression(right)
        return "({arg1} {op} {arg2})".format(arg1=rendered_left, op=op, arg2=rendered_right)

    if argument_count == 1:
        # Single argument, rendered as a function call.
        op, operand = expr
        rendered = _build_eval_expression(operand)
        return "{op}({arg1})".format(op=op, arg1=rendered)

    raise_malformed("Invalid number of arguments", expr)

    return expr
Ejemplo n.º 14
0
def _aggregate_without_group_by(dataframe, project_q, aggregate_fns):
    """Aggregate whole columns of ``dataframe`` when no group by is given.

    :param dataframe: frame to aggregate.
    :param project_q: the original select clause; every entry in it must be
        an aggregation, it is otherwise only used for error reporting.
    :param aggregate_fns: dict mapping column name to the name of the frame
        method to apply to that column.
    :return: a one-row frame with one column per aggregated column.

    Problems with the query are reported through ``raise_malformed``.
    """
    if len(aggregate_fns) != len(project_q):
        raise_malformed("Cannot mix aggregation functions and columns without group_by clause", project_q)

    results = {}
    for column_name, fn_name in aggregate_fns.items():
        # Look the function up on a one-column frame and apply it along the rows.
        temp_dataframe = dataframe[[column_name]]
        fn = getattr(temp_dataframe, fn_name, None)
        # callable() alone also covers a missing attribute (None) and avoids
        # evaluating the truth value of arbitrary frame attributes.
        if not callable(fn):
            raise_malformed("Unknown aggregation function", project_q)

        # The result is indexed by column name, so the single value has to be
        # fetched by position explicitly.
        results[column_name] = [fn(axis=0).iloc[0]]

    # The result must be a data frame
    return DataFrame.from_dict(results)
Ejemplo n.º 15
0
def classify_updates(q):
    """Yield the updates in ``q['update']`` grouped by kind, in declaration order.

    Updates are either simple assignments (two elements) or self referring
    updates (e.g. column += 1). Consecutive simple assignments can be applied
    all at once and are yielded together as ``('simple', [update, ...])``.
    pandas only supports updating one column at a time for the self referring
    kind, so each of those is yielded alone as ``('self-referring', update)``.

    Updates that are neither lists nor tuples are reported through
    ``raise_malformed``.
    """
    pending_simple = []
    for update in q['update']:
        if not isinstance(update, (list, tuple)):
            raise_malformed("Invalid update clause", update)

        if len(update) == 2:
            pending_simple.append(update)
            continue

        # A self referring update ends the current run of simple assignments.
        if pending_simple:
            yield ('simple', pending_simple)
            pending_simple = []

        yield ('self-referring', update)

    if pending_simple:
        yield ('simple', pending_simple)
Ejemplo n.º 16
0
def _alias(dataframe, expressions):
    """Add one column per alias expression to a copy of ``dataframe``.

    Each expression holds the new column name at index 1 and the source
    expression at index 2. The source is rendered by
    ``_build_eval_expression`` and evaluated with ``DataFrame.eval`` using
    ``inplace=False``, so the frame passed in is not modified by the
    evaluation.

    Invalid alias names and ``SyntaxError``/``ValueError`` from the evaluation
    are reported through ``raise_malformed``.
    """
    frame = dataframe
    for expression in expressions:
        destination = expression[1]
        source = expression[2]

        if not isinstance(destination, basestring):
            raise_malformed("Invalid alias, must be a string", expression)

        if re.match(ALIAS_RE, destination) is None:
            raise_malformed("Invalid alias, must match {alias}".format(alias=ALIAS_STRING), expression)

        rendered_source = _build_eval_expression(source)
        try:
            assignment = "{destination} = {expr}".format(destination=destination, expr=rendered_source)
            frame = frame.eval(assignment, inplace=False)
        except (SyntaxError, ValueError):
            raise_malformed("Unknown function in alias", source)

    return frame
Ejemplo n.º 17
0
    def _build_filter(self, q):
        """Render the filter expression ``q`` as a query string.

        Values that are not lists are converted with ``unicode``. Lists are
        rendered according to their operator (first element) and the result
        is wrapped in parentheses. Arguments of ``in`` are stored through
        ``self._insert_in_env`` and referred to as ``@env.<name>``.

        Problems with the expression are reported through ``raise_malformed``.
        """
        if type(q) is not list:
            return unicode(q)

        if not q:
            raise_malformed("Empty expression not allowed", q)

        rendered = None
        op = q[0]
        if op == "!":
            assert_len(q, 2, "! is a single arity operator, invalid number of arguments")
            rendered = "not " + self._build_filter(q[1])
        elif op == "isnull":
            assert_len(q, 2, "isnull is a single arity operator, invalid number of arguments")

            # Slightly hacky: the argument is compared against itself with "!=".
            rendered = "({arg} != {arg})".format(arg=q[1])
        elif op in COMPARISON_OPERATORS:
            assert_len(q, 3)
            left, right = q[1], q[2]
            rendered = self._build_filter(left) + " " + op + " " + self._build_filter(right)
        elif op in JOINING_OPERATORS:
            clause_count = len(q) - 1
            if clause_count < 1:
                raise_malformed("Invalid number of arguments", q)
            elif clause_count == 1:
                # Conjunctions and disjunctions with only one clause are OK
                rendered = self._build_filter(q[1])
            else:
                separator = ' {op} '.format(op=op)
                rendered = separator.join(self._build_filter(clause) for clause in q[1:])
        elif op == 'in':
            col_name, args = prepare_in_clause(q, FILTER_ENGINE_NUMEXPR)
            var_name = self._insert_in_env(args)
            rendered = '{col_name} in @env.{var_name}'.format(col_name=col_name, var_name=var_name)
        else:
            raise_malformed("Unknown operator", q)

        return "({result})".format(result=rendered)
Ejemplo n.º 18
0
def _build_update_filter(df, update_q):
    """Build the row selector for the where part of an update.

    :param df: frame the update is applied to.
    :param update_q: a non-empty list ``[operator, arg, ...]`` where the
        operator is ``isnull``, ``in`` or a key of ``COMPARISON_OPERATORS``.
    :return: the selector produced by the operator, e.g. the result of
        ``isin`` for ``in``.

    Problems with the expression are reported through ``raise_malformed``.
    """
    if type(update_q) is not list:
        raise_malformed("Expressions must be lists", update_q)

    if not update_q:
        raise_malformed("Empty expression not allowed", update_q)

    operator = update_q[0]
    if operator == "isnull":
        assert_len(update_q, 2, 'Invalid length of isnull query')
        try:
            return _prepare_arg(df, update_q[1]).isnull()
        except AttributeError:
            raise_malformed("Unknown column for 'isnull'", update_q)

    if operator == "in":
        if len(update_q) != 3:
            raise_malformed("Invalid length of 'in' query", update_q)

        _, column, values = update_q
        if column not in df:
            raise_malformed("First argument to 'in' must be a column present in frame", update_q)

        if not isinstance(values, (list, tuple)):
            raise_malformed("Second argument to 'in' must be a list", update_q)

        # Item access instead of attribute access: attribute access breaks for
        # column names that clash with frame attributes (e.g. "count") or that
        # are not valid identifiers.
        return df[column].isin([_prepare_arg(df, val) for val in values])

    if operator in COMPARISON_OPERATORS:
        # Validate the length up front so a short expression is reported as a
        # malformed query instead of surfacing as an IndexError.
        if len(update_q) != 3:
            raise_malformed("Invalid number of arguments", update_q)

        arg1 = _prepare_arg(df, update_q[1])
        arg2 = _prepare_arg(df, update_q[2])
        return COMPARISON_OPERATORS[operator](arg1, arg2)

    raise_malformed("Unknown operator '{operator}'".format(operator=operator), update_q)