def __getitem__(self, obj):
    """Evaluate expressions against the groups.

    Given a single object or a list, the GroupBy interprets it as a set of
    SELECT expressions to evaluate in the context of the GROUP BY.

    Given a tuple of length 2, the first element is interpreted for group
    selection (i.e., a HAVING clause), while the second element is
    interpreted as the set of expressions to evaluate against the groups.

    Each SELECT element may be a `SelectItem` (used as is), a string
    (wrapped in a `Literal`), or an `Expr`.

    The HAVING clause and SELECT list are set on a copy of the grouped AST,
    which is then wrapped in a new `BigDataFrame` and returned.

    Raises `ValueError` if the group filter is not an `Expr`, or if a
    SELECT element has an unsupported type.
    """
    ast = copy(self._grouped_ast)

    # NOTE: any 2-tuple is treated as (HAVING, SELECT); pass a list to select
    # exactly two expressions without a group filter.
    if isinstance(obj, tuple) and len(obj) == 2:
        if not isinstance(obj[0], Expr):
            raise ValueError("The group filter (obj[0]) must be Expr type")
        ast._having = obj[0]
        obj = obj[1]

    # obj is now the SELECT portion
    if not isinstance(obj, (list, tuple)):
        obj = [obj]

    select_list = []
    for elt in obj:
        if isinstance(elt, SelectItem):
            select_list.append(elt)
        elif isinstance(elt, basestring):
            select_list.append(SelectItem(expr=Literal(elt)))
        elif isinstance(elt, Expr):
            select_list.append(SelectItem(expr=elt))
        else:
            # Fail loudly instead of silently dropping the element, which
            # would produce a query that is missing requested columns.
            raise ValueError(
                "Unsupported SELECT element type: %s" % type(elt).__name__)

    ast._select_list = select_list
    return BigDataFrame(self._ic, ast)
def join(self, other, on=None, how='inner', hint=None):
    """Join this BDF with another BDF.

    `on` is `None`, `string`, `Expr`, or `list[string]`.  `how` and `hint`
    are passed to `JoinTableRef` as its `op` and `hint` arguments.

    The SQL of each BDF is wrapped in an inline view (aliased 'left_tbl'
    and 'right_tbl' respectively), and the result is a new BDF over the
    join of the two views.
    """
    left_view = InlineView(self._query_ast.to_sql(), 'left_tbl')
    right_view = InlineView(other._query_ast.to_sql(), 'right_tbl')

    # SELECT left_tbl.*, right_tbl.*
    star_items = [
        SelectItem(table_name=TableName(view.name))
        for view in (left_view, right_view)
    ]

    joined_ref = JoinTableRef(
        left_view, right_view, on=on, op=how, hint=hint)
    return BigDataFrame(self._ic, SelectStmt(star_items, joined_ref))
def from_sql_table(ic, table):
    """Create a BDF from a table name usable in Impala.

    The table's schema is looked up through `ic._cursor`, and every column
    reported by the schema becomes one item of the SELECT list.
    """
    table_ref = BaseTableRef(_to_TableName(table))
    schema = _get_table_schema_hack(ic._cursor, table_ref.to_sql())
    # Schema entries are (column name, type) pairs; only the name is needed.
    columns = tuple(
        SelectItem(expr=Literal(col_name)) for (col_name, _col_type) in schema)
    return BigDataFrame(ic, SelectStmt(columns, table_ref))
def from_sql_query(ic, query, alias=None):
    """Create a BDF from a SQL query executed by Impala.

    The query is wrapped in an inline view named `alias`; when `alias` is
    not given (or is empty), a random 'inline_'-prefixed id is used.
    """
    view = InlineView(query, alias or _random_id('inline_', 4))
    schema = _get_table_schema_hack(ic._cursor, view.to_sql())
    # Schema entries are (column name, type) pairs; only the name is needed.
    columns = tuple(
        SelectItem(expr=Literal(col_name)) for (col_name, _col_type) in schema)
    return BigDataFrame(ic, SelectStmt(columns, view))
def take(self, n):
    """Return `n` rows as a pandas `DataFrame`.

    The data is distributed and has no notion of order, so the rows
    returned are not guaranteed to be reproducible.
    """
    view_alias = _random_id('inline_', 4)
    view = InlineView(self._query_ast.to_sql(), view_alias)

    # SELECT alias.* ... LIMIT n
    star_items = [SelectItem(table_name=TableName(view.name))]
    limited_stmt = SelectStmt(
        star_items, view, limit=LimitElement(Literal(n), None))

    limited_bdf = BigDataFrame(self._ic, limited_stmt)
    return as_pandas(limited_bdf.__iter__())
def groups(self):
    """Return a BDF that selects the GROUP BY expressions themselves.

    The SELECT list of a copy of the grouped AST is replaced with one
    `SelectItem` per expression in the AST's `_group_by`.
    """
    grouped = copy(self._grouped_ast)
    grouped._select_list = tuple(
        SelectItem(expr=key) for key in self._grouped_ast._group_by)
    return BigDataFrame(self._ic, grouped)