def _project(dataframe, project_q): if not project_q: return dataframe assert_list("project", project_q) if project_q == [["count"]]: # Special case for count only, ~equal to SQL count(*) return DataFrame.from_dict({"count": [len(dataframe)]}) aggregate_fns, alias_expressions = classify_expressions(project_q) if aggregate_fns and alias_expressions: raise_malformed("Cannot mix aliasing and aggregation functions", project_q) if isinstance(dataframe, DataFrameGroupBy): dataframe = _aggregate(dataframe, project_q, aggregate_fns) elif aggregate_fns: return _aggregate_without_group_by(dataframe, project_q, aggregate_fns) elif alias_expressions: dataframe = _alias(dataframe, alias_expressions) else: # Nothing to do here pass columns = [e if type(e) is not list else e[1] for e in project_q] try: return dataframe[columns] except KeyError: missing_columns = set(columns) - set(dataframe.columns.values) raise_malformed("Selected columns not in table", list(missing_columns))
def apply_operation(df, update_filter, column, op, value):
    """Update ``column`` in place, for the rows selected by ``update_filter``.

    The current cell values are combined with ``value`` using the operator
    named by ``op`` (one of ``+ - * / << >> & | ^ % **``) and written back
    into ``df``.  Any other operator is reported through ``raise_malformed``.

    The label based ``.loc`` indexer is used rather than ``.ix``, which was
    deprecated in pandas 0.20 and removed in pandas 1.0.
    """
    # This is repetitive and ugly but the only way I've found to do in place updates
    if op == '+':
        df.loc[update_filter, column] += value
    elif op == '-':
        df.loc[update_filter, column] -= value
    elif op == '*':
        df.loc[update_filter, column] *= value
    elif op == '/':
        df.loc[update_filter, column] /= value
    elif op == '<<':
        df.loc[update_filter, column] <<= value
    elif op == '>>':
        df.loc[update_filter, column] >>= value
    elif op == '&':
        df.loc[update_filter, column] &= value
    elif op == '|':
        df.loc[update_filter, column] |= value
    elif op == '^':
        df.loc[update_filter, column] ^= value
    elif op == '%':
        df.loc[update_filter, column] %= value
    elif op == '**':
        df.loc[update_filter, column] **= value
    else:
        raise_malformed('Invalid update operator', (op, value, column))
def _like_filter(df, q):
    """Build the boolean mask for a ``like``/``ilike`` expression.

    ``q`` is ``[op, column, quoted_pattern]``.  A leading ``%`` leaves the
    start of the match unanchored, otherwise ``^`` is prepended; a trailing
    ``%`` leaves the end unanchored, otherwise ``$`` is appended.  The
    resulting pattern is handed to ``Series.str.contains``.
    """
    assert_len(q, 3)
    op, column, raw_expr = q

    if not is_quoted(raw_expr):
        raise_malformed("like expects a quoted string as second argument", q)

    pattern = unquote(raw_expr)

    if pattern.startswith('%'):
        pattern = pattern[1:]
    else:
        pattern = '^' + pattern

    if pattern.endswith('%'):
        pattern = pattern[:-1]
    else:
        pattern += '$'

    # 'like' is case sensitive, 'ilike' is case insensitive
    case_sensitive = (op == 'like')

    try:
        return df[column].str.contains(pattern, case=case_sensitive)
    except AttributeError:
        raise_malformed("Invalid column type for (i)like", q)
def _aggregate(dataframe_group_by, project_q, aggregate_fns): if not aggregate_fns: raise_malformed("Aggregate function required when group_by is specified", project_q) try: return dataframe_group_by.agg(aggregate_fns) except AttributeError as e: functions = [fn_name for fn_name in aggregate_fns.values() if fn_name in str(e)] raise_malformed("Unknown aggregation function '{fn}'".format(fn=functions[0]), project_q)
def _group_by(dataframe, group_by_q): if not group_by_q: return dataframe assert_list("group_by", group_by_q) try: return dataframe.groupby(group_by_q, as_index=False) except KeyError: raise_malformed("Group by column not in table", group_by_q)
def _leaf_node(df, q):
    """Resolve a non-list filter operand.

    Quoted strings are returned with the surrounding quote characters removed
    and encoded as UTF-8, other strings are looked up as columns of ``df``
    (unknown columns are reported through ``raise_malformed``) and any
    non-string value is returned as is.
    """
    is_text = isinstance(q, basestring)

    if is_text and is_quoted(q):
        return q[1:-1].encode('utf-8')

    if is_text:
        try:
            return df[q]
        except KeyError:
            raise_malformed("Unknown column", q)

    return q
def classify_expressions(project_q):
    """Split the projection expressions into aggregations and aliases.

    Returns a tuple ``(aggregate_functions, alias_expressions)`` where the
    first item maps ``expression[1]`` to ``expression[0]`` for every aggregate
    expression and the second item lists the alias expressions in order.  Any
    other list expression is reported through ``raise_malformed``; non-list
    entries are ignored here.
    """
    aggregate_functions, alias_expressions = {}, []

    for expression in project_q:
        if is_aggregate_function(expression):
            fn_name, column = expression[0], expression[1]
            aggregate_functions[column] = fn_name
            continue

        if is_alias_assignment(expression):
            alias_expressions.append(expression)
            continue

        if type(expression) is list:
            raise_malformed("Invalid expression in select", expression)

    return aggregate_functions, alias_expressions
def _join_filter(df, q):
    """Combine the clauses of a joining expression ``[op, clause, ...]``.

    Every clause is evaluated with ``_do_pandas_filter`` and the results are
    folded left to right with the callable ``JOINING_OPERATORS[op]``.
    An expression without any clause is reported through ``raise_malformed``.
    """
    if len(q) < 2:
        raise_malformed("Invalid number of arguments", q)
        return None

    # Conjunctions and disjunctions with only one clause are OK, the loop
    # below simply does not run for them.
    combined = _do_pandas_filter(df, q[1])
    for clause in q[2:]:
        combined = JOINING_OPERATORS[q[0]](combined, _do_pandas_filter(df, clause))
    return combined
def _bitwise_filter(df, q):
    """Build the boolean mask for an ``any_bits``/``all_bits`` expression.

    ``q`` is ``[op, column, arg]`` where ``arg`` must be an integer bit mask.
    The column is and-ed with the mask; ``any_bits`` selects rows where the
    result is greater than zero, any other operator routed here selects rows
    where the result equals the mask.  A non-integer argument or a
    ``TypeError`` from the bitwise operation is reported through
    ``raise_malformed``.
    """
    assert_len(q, 3)
    op, column, arg = q

    if not isinstance(arg, (int, long)):
        # The placeholder was missing from the message, so the offending
        # type never made it into the error.
        raise_malformed('Invalid argument type, must be an integer: {t}'.format(t=type(arg)), q)

    try:
        series = df[column] & arg
        if op == "any_bits":
            return series > 0
        return series == arg
    except TypeError:
        raise_malformed("Invalid column type, must be an integer", q)
def _do_pandas_filter(df, q):
    """Evaluate the filter expression ``q`` against ``df``.

    Non-list values are resolved by ``_leaf_node``.  Lists are dispatched on
    their first element to the matching ``_*_filter`` helper.  Empty lists and
    unknown operators are reported through ``raise_malformed``, as are
    ``KeyError`` and ``TypeError`` raised while evaluating the expression.
    """
    if not isinstance(q, list):
        return _leaf_node(df, q)

    if not q:
        raise_malformed("Empty expression not allowed", q)

    op = q[0]
    try:
        if op in ('any_bits', 'all_bits'):
            return _bitwise_filter(df, q)
        if op == "!":
            return _not_filter(df, q)
        if op == "isnull":
            return _isnull_filter(df, q)
        if op in COMPARISON_OPERATORS:
            return _comparison_filter(df, q)
        if op in JOINING_OPERATORS:
            return _join_filter(df, q)
        if op == 'in':
            return _in_filter(df, q)
        if op in ('like', 'ilike'):
            return _like_filter(df, q)
        raise_malformed("Unknown operator", q)
    except KeyError:
        raise_malformed("Column is not defined", q)
    except TypeError:
        raise_malformed("Invalid type in argument", q)
def filter(self, dataframe, filter_q):
    """Return the rows of ``dataframe`` matching the where clause ``filter_q``.

    An empty clause returns the frame unchanged.  Otherwise the clause is
    translated to a query string by ``self._build_filter`` and evaluated with
    ``dataframe.query``.  ``SyntaxError`` and ``ValueError`` from the
    evaluation are reported through ``raise_malformed``.
    """
    if not filter_q:
        return dataframe

    assert_list('where', filter_q)
    filter_str = self._build_filter(filter_q)

    try:
        # The filter string may contain references to variables in env.
        # That's why it is defined here, as a local of the frame that
        # calls query().
        env = self.env  # noqa
        return dataframe.query(filter_str)
    except SyntaxError:
        raise_malformed('Syntax error in where clause', filter_q)
    except ValueError:
        raise_malformed('Invalid type in comparison in where clause', filter_q)
def _order_by(dataframe, order_q): if not order_q: return dataframe assert_list("order_by", order_q) if not all(isinstance(c, basestring) for c in order_q): raise_malformed("Invalid order by format", order_q) columns = [e[1:] if e.startswith("-") else e for e in order_q] ascending = [not e.startswith("-") for e in order_q] try: return dataframe.sort_values(by=columns, ascending=ascending) except KeyError: raise_malformed("Order by column not in table", columns)
def _build_eval_expression(expr): if type(expr) is list: if len(expr) == 3: arg1 = _build_eval_expression(expr[1]) arg2 = _build_eval_expression(expr[2]) op = expr[0] return "({arg1} {op} {arg2})".format(arg1=arg1, op=op, arg2=arg2) if len(expr) == 2: arg1 = _build_eval_expression(expr[1]) op = expr[0] return "{op}({arg1})".format(op=op, arg1=arg1) raise_malformed("Invalid number of arguments", expr) return expr
def _aggregate_without_group_by(dataframe, project_q, aggregate_fns): if len(aggregate_fns) != len(project_q): raise_malformed("Cannot mix aggregation functions and columns without group_by clause", project_q) results = {} for column_name, fn_name in aggregate_fns.items(): # Intricate, apply the selected function to the selected column temp_dataframe = dataframe[[column_name]] fn = getattr(temp_dataframe, fn_name, None) if not fn or not callable(fn): raise_malformed("Unknown aggregation function", project_q) results[column_name] = [fn(axis=0)[0]] # The result must be a data frame return DataFrame.from_dict(results)
def classify_updates(q):
    """Yield the updates in ``q['update']`` as ordered batches.

    Updates can be either simple assignments or self referring updates
    (e.g. column += 1).  The former can be applied all at once while pandas
    only supports updates of one column at the time for the latter.  All
    updates are yielded in the order they are declared in the query:

    * ``('simple', [update, ...])`` for a run of consecutive two element
      updates
    * ``('self-referring', update)`` for every other update

    Updates that are neither lists nor tuples are reported through
    ``raise_malformed``.
    """
    pending = []
    for update in q['update']:
        if not isinstance(update, (list, tuple)):
            raise_malformed("Invalid update clause", update)

        if len(update) == 2:
            pending.append(update)
            continue

        # Flush the simple assignments collected so far to keep the
        # declared order.
        if pending:
            yield ('simple', pending)
            pending = []

        yield ('self-referring', update)

    if pending:
        yield ('simple', pending)
def _alias(dataframe, expressions): result_frame = dataframe for expression in expressions: destination, source = expression[1], expression[2] if not isinstance(destination, basestring): raise_malformed("Invalid alias, must be a string", expression) if not re.match(ALIAS_RE, destination): raise_malformed("Invalid alias, must match {alias}".format(alias=ALIAS_STRING), expression) eval_expr = _build_eval_expression(source) try: result_frame = result_frame.eval( "{destination} = {expr}".format(destination=destination, expr=eval_expr), inplace=False ) except (SyntaxError, ValueError): raise_malformed("Unknown function in alias", source) return result_frame
def _build_filter(self, q):
    """Translate the filter expression ``q`` to a query string.

    Non-list values are converted with ``unicode``.  List expressions are
    dispatched on their first element and the translated expression is
    wrapped in parentheses.  Values of ``in`` clauses are stored through
    ``self._insert_in_env`` and referenced as ``@env.<name>``.  Empty lists,
    unknown operators and wrong argument counts are reported through
    ``raise_malformed`` / ``assert_len``.
    """
    if type(q) is not list:
        return unicode(q)

    if not q:
        raise_malformed("Empty expression not allowed", q)

    result = None
    op = q[0]

    if op == "!":
        assert_len(q, 2, "! is a single arity operator, invalid number of arguments")
        result = "not " + self._build_filter(q[1])
    elif op == "isnull":
        assert_len(q, 2, "isnull is a single arity operator, invalid number of arguments")
        # Slightly hacky but the only way I've come up with so far.
        result = "({arg} != {arg})".format(arg=q[1])
    elif op in COMPARISON_OPERATORS:
        assert_len(q, 3)
        left = self._build_filter(q[1])
        right = self._build_filter(q[2])
        result = left + " " + op + " " + right
    elif op in JOINING_OPERATORS:
        if len(q) < 2:
            raise_malformed("Invalid number of arguments", q)
        else:
            # Conjunctions and disjunctions with only one clause are OK,
            # joining a single clause yields that clause unchanged.
            separator = ' {op} '.format(op=op)
            result = separator.join(self._build_filter(clause) for clause in q[1:])
    elif op == 'in':
        col_name, args = prepare_in_clause(q, FILTER_ENGINE_NUMEXPR)
        var_name = self._insert_in_env(args)
        result = '{col_name} in @env.{var_name}'.format(col_name=col_name, var_name=var_name)
    else:
        raise_malformed("Unknown operator", q)

    return "({result})".format(result=result)
def _build_update_filter(df, update_q): if type(update_q) is not list: raise_malformed("Expressions must be lists", update_q) if not update_q: raise_malformed("Empty expression not allowed", update_q) operator = update_q[0] if operator == "isnull": assert_len(update_q, 2, 'Invalid length of isnull query') try: return getattr(_prepare_arg(df, update_q[1]), 'isnull')() except AttributeError: raise_malformed("Unknown column for 'isnull'", update_q) if operator == "in": if len(update_q) != 3: raise_malformed("Invalid length of 'in' query", update_q) _, column, values = update_q if column not in df: raise_malformed("First argument to 'in' must be a column present in frame", update_q) if not isinstance(values, (list, tuple)): raise_malformed("Second argument to 'in' must be a list", update_q) return getattr(df, column).isin([_prepare_arg(df, val) for val in values]) if operator in COMPARISON_OPERATORS: arg1 = _prepare_arg(df, update_q[1]) arg2 = _prepare_arg(df, update_q[2]) return COMPARISON_OPERATORS[operator](arg1, arg2) raise_malformed("Unknown operator '{operator}'".format(operator=operator), update_q)