def __init__(self, groupCols: List, parent: DataFrame):
    self.having = []
    self.groupCols = []

    computedAliases = [c.alias for c in parent.computedCols]

    for theCol in groupCols:
        if isinstance(theCol, str):
            # computed columns are referenced by their alias only,
            # plain column names get a reference bound to this operator
            if theCol in computedAliases:
                theRef = ColRef(theCol, None)
            else:
                theRef = ColRef(theCol, self)
            theCol = theRef
        elif isinstance(theCol, ColRef):
            self.updateRef(theCol)
        elif isinstance(theCol, Expr):
            pass
        else:
            raise ExpressionException(f"invalid grouping column type: {type(theCol)}")

        self.groupCols.append(theCol)

    self.aggFunc = []
    super().__init__(self.groupCols, parent, GrizzlyGenerator._incrAndGetTupleVar())
def __init__(self, parent, other, on, how, comp):
    t = GrizzlyGenerator._incrAndGetTupleVar()
    super().__init__(parent.columns + other.columns, parent, t)

    self.right = other
    self.on = on
    self.how = how
    self.comp = comp
def first(self):
    tup = GrizzlyGenerator.fetchone(self)
    if len(tup) >= 1:
        return tup
    # elif len(tup) == 1:
    #     return tup
    else:
        return None
def __init__(self, table, index, schema):
    self.table = table
    alias = GrizzlyGenerator._incrAndGetTupleVar()

    if index and not (isinstance(index, str) or isinstance(index, list)):
        raise ValueError("index definition must be a string or list of strings")

    super().__init__([], None, alias, index)
def __init__(self, file, colDefs, hasHeader, delimiter, format, fdw_extension_name):
    self.filenames = file
    self.colDefs = colDefs
    self.hasHeader = hasHeader
    self.delimiter = delimiter
    self.format = format
    self.fdw_extension_name = fdw_extension_name

    alias = GrizzlyGenerator._incrAndGetTupleVar()
    self.table = f"temp_ext_table{alias}"

    super().__init__([], None, alias)
def itertuples(self, name="Grizzly", index=None):
    '''
    Iterate over DataFrame rows as namedtuples.
    '''
    theIter = GrizzlyGenerator.iterator(self, includeHeader=True)
    headerRow = next(theIter)
    RowType = namedtuple(name, headerRow)

    for row in theIter:
        yield RowType._make(row)
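# Illustrative usage sketch (not part of the original source). It assumes the
# grizzly module-level entry point `read_table` and an "employees" table; both are
# placeholders for whatever connection and schema are actually configured.
def _itertuples_usage_example():
    import grizzly
    df = grizzly.read_table("employees")            # hypothetical table
    for row in df.itertuples(name="Employee"):
        # each row is a namedtuple, so columns are accessible as attributes
        print(row)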
def _generateAggCode(self, df, f) -> Tuple[List[str], str]:
    # aggregation over a table is performed in a way that the actual query
    # that was built is executed as an inner query and around that, we
    # compute the aggregation
    (pre, innerSQL) = self.generate(df)

    if df.parents:
        df.alias = GrizzlyGenerator._incrAndGetTupleVar()
        (fPre, funcCode) = self._generateFuncCall(f)
        aggSQL = f"SELECT {funcCode} FROM ({innerSQL}) as {df.alias}"
    else:
        (fPre, funcCode) = self._generateFuncCall(f)
        aggSQL = f"SELECT {funcCode} FROM {df.table} {df.alias}"

    return (pre + fPre, aggSQL)
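# Example of the SQL shapes produced by _generateAggCode (illustrative only; the
# tuple variables, table and column names are made up). If the DataFrame has parent
# operators, the query built so far becomes an inner query:
#
#   SELECT count(t1.id) FROM (SELECT * FROM employees t0 WHERE t0.salary > 1000) as t1
#
# For a bare table without parents, the aggregate is applied directly:
#
#   SELECT count(t0.id) FROM employees t0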
def shape(self):
    '''
    Return a tuple representing the dimensionality of the DataFrame:
    (number of columns, number of rows).
    '''
    # append a count(*) column to the projection and fetch a single row
    f = FuncCall("count", ["*"], None, "rowcount")
    cc = ComputedCol(f, None)
    shapeDF = self.project(['*', cc])
    shapeDF = shapeDF.limit(1)

    resultRow = GrizzlyGenerator.fetchone(shapeDF)

    numCols = len(resultRow) - 1   # -1 because of the added count column
    numRows = resultRow[-1]        # the last value is the row count

    return (numCols, numRows)
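# Illustrative usage sketch (not part of the original source); `read_table` and the
# "employees" table are placeholders. shape() is called as a plain method, matching
# the definition above.
def _shape_usage_example():
    import grizzly
    df = grizzly.read_table("employees")
    (numCols, numRows) = df.shape()
    print(f"{numCols} columns, {numRows} rows")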
def _gen_agg(self, func, col, df):
    # aggregation over a table is performed in a way that the actual query
    # that was built is executed as an inner query and around that, we
    # compute the aggregation
    if df.parents:
        innerSQL = self.generate(df)
        df.alias = GrizzlyGenerator._incrAndGetTupleVar()
        funcCode = SQLGenerator._getFuncCode(func, col, df)
        aggSQL = f"SELECT {funcCode} FROM ({innerSQL}) as {df.alias}"
        # aggSQL = innerSQL
    else:
        funcCode = SQLGenerator._getFuncCode(func, col, df)
        aggSQL = f"SELECT {funcCode} FROM {df.table} {df.alias}"

    return aggSQL
def __init__(self, by: list, ascending: bool, parent):
    super().__init__(parent.columns, parent, GrizzlyGenerator._incrAndGetTupleVar())

    sortCols = []
    for col in by:
        if isinstance(col, Projection):
            sortCols.append(self.updateRef(col.columns[0]))
        elif isinstance(col, str):
            sortCols.append(ColRef(col, self))
        elif isinstance(col, ColRef):
            sortCols.append(self.updateRef(col))
        else:
            raise ExpressionException(f"unsupported type for order by value: {type(col)}")

    self.by = sortCols
    self.ascending = ascending
def __init__(self, columns, parent: DataFrame, doDistinct=False):
    self.doDistinct = doDistinct

    if columns is None:
        columns = []
    elif not isinstance(columns, list):
        columns = [columns]

    # update references to all columns
    theCols = []
    for col in columns:
        theCol = self.updateRef(col)
        theCols.append(theCol)
    columns = theCols

    super().__init__(columns, parent, GrizzlyGenerator._incrAndGetTupleVar())
def _exec_or_add_aggr(self, f: FuncCall):
    """
    Adaptation to the nested query generation. If there is a grouping in the
    operator tree, the aggregation becomes a transformation. However, it must
    not become a new nested query, but needs to be attached to the grouping.
    If there is no grouping, the aggregation is an action, so execute the query.
    """
    # if isinstance(col, Projection):
    #     assert(len(col.columns) == 1)
    #     col = self.updateRef(col.columns[0])

    # if we are a grouping and the function is not applied on a grouping column,
    # then add the aggregation to the list...
    if isinstance(self, Grouping) and len([1 for fCol in f.inputCols
                                             for groupCol in self.groupCols
                                             if fCol.column == groupCol.column]) == 0:
        self._addAggFunc(f)
        return self

    # ...otherwise execute f as an action
    return GrizzlyGenerator.aggregate(self, f)
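# Illustrative sketch of the two paths described in the docstring above (not part of
# the original source). The entry point `read_table` as well as the table, column,
# and aggregate names are assumptions about the surrounding DataFrame API.
def _aggregation_paths_example():
    import grizzly
    df = grizzly.read_table("employees")

    # no Grouping in the operator tree: the aggregate is an action and the
    # query is executed immediately, yielding a scalar result
    total = df.count("id")

    # with a Grouping, an aggregate over a non-grouping column is attached to
    # the Grouping operator and only evaluated when the result is consumed
    perDept = df.groupby("department").count("id")
    perDept.show()
    return total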
def close():
    GrizzlyGenerator.close()
def _buildFrom(self, df) -> Tuple[List[str], str]:
    if df is not None:
        computedCols = []
        preCode = []
        for x in df.computedCols:
            (exprPre, exprSQL) = self.generator._exprToSQL(x)
            preCode += exprPre
            computedCols.append(exprSQL)
        computedCols = ",".join(computedCols)

        if isinstance(df, Table):
            proj = "*"
            if computedCols:
                proj += "," + computedCols
            return (preCode, f"SELECT {proj} FROM {df.table} {df.alias}")

        elif isinstance(df, ExternalTable):
            proj = "*"
            if computedCols:
                proj += "," + computedCols
            tablePre = SQLGenerator._generateCreateExtTable(df, self.generator.templates)
            qry = f"SELECT {proj} FROM {df.table} {df.alias}"
            return (preCode + tablePre, qry)

        elif isinstance(df, Projection):
            subQry = Query(self.generator)
            (pre, parentSQL) = subQry._buildFrom(df.parents[0])

            prefixed = "*"
            if df.columns:
                prefixed = []
                for attr in df.columns:
                    (ePre, exprSQL) = self.generator._exprToSQL(attr)
                    pre += ePre
                    prefixed.append(exprSQL)
                prefixed = ",".join(prefixed)

            if computedCols:
                prefixed += "," + computedCols

            qry = f"SELECT {'DISTINCT ' if df.doDistinct else ''}{prefixed} FROM ({parentSQL}) {df.alias}"
            return (preCode + pre, qry)

        elif isinstance(df, Filter):
            subQry = Query(self.generator)
            (pre, parentSQL) = subQry._buildFrom(df.parents[0])
            (exprPre, exprStr) = self.generator._exprToSQL(df.expr)

            proj = "*"
            if computedCols:
                proj += "," + computedCols

            qry = f"SELECT {proj} FROM ({parentSQL}) {df.alias} WHERE {exprStr}"
            return (preCode + pre + exprPre, qry)

        elif isinstance(df, Join):
            lQry = Query(self.generator)
            (lpre, lparentSQL) = lQry._buildFrom(df.leftParent())
            rQry = Query(self.generator)
            (rpre, rparentSQL) = rQry._buildFrom(df.rightParent())

            lAlias = df.leftParent().alias
            rAlias = df.rightParent().alias

            if isinstance(df.on, ColRef):
                (exprPre, onSQL) = self.generator._exprToSQL(df.on)
                onSQL = f"USING ({onSQL})"
                preCode += exprPre
            elif isinstance(df.on, (LogicExpr, BoolExpr)):
                (exprPre, onSQL) = self.generator._exprToSQL(df.on)
                onSQL = "ON " + onSQL
                preCode += exprPre
            elif isinstance(df.on, list):
                if len(df.on) != 2:
                    raise ExpressionException("on condition must consist of exactly two columns")
                (lOnPre, lOn) = self.generator._exprToSQL(df.on[0])
                (rOnPre, rOn) = self.generator._exprToSQL(df.on[1])
                onSQL = f"ON {lOn} {df.comp} {rOn}"
                preCode += lOnPre
                preCode += rOnPre
            else:
                onSQL = ""  # let the DB figure it out itself

            proj = "*"
            if computedCols:
                proj += "," + computedCols

            joinSQL = f"SELECT {proj} FROM ({lparentSQL}) {lAlias} {df.how} JOIN ({rparentSQL}) {rAlias} {onSQL}"
            return (preCode + lpre + rpre, joinSQL)

        elif isinstance(df, Grouping):
            subQry = Query(self.generator)
            (pre, parentSQL) = subQry._buildFrom(df.parents[0])

            byCols = []
            for attr in df.groupCols:
                (exprPre, exprSQL) = self.generator._exprToSQL(attr)
                pre += exprPre
                byCols.append(exprSQL)
            by = ",".join(byCols)

            funcCode = ""
            for f in df.aggFunc:
                (fPre, fCode) = self.generator._generateFuncCall(f)
                pre += fPre
                funcCode += ", " + fCode

            groupSQL = f"SELECT {by} {funcCode} FROM ({parentSQL}) {df.alias} GROUP BY {by}"

            havings = []
            if df.having:
                for h in df.having:
                    (hPre, hSQL) = self.generator._exprToSQL(h)
                    pre += hPre
                    havings.append(hSQL)
                exprStr = " AND ".join(havings)
                groupSQL += f" HAVING {exprStr}"

            # if the computed column is an aggregate over the groups,
            # it should not be added as an extra query, but rather
            # merged into this projection list
            if computedCols:
                tVar = GrizzlyGenerator._incrAndGetTupleVar()
                proj = "*," + computedCols
                groupSQL = f"SELECT {proj} FROM ({groupSQL}) {tVar}"

            return (preCode + pre, groupSQL)

        elif isinstance(df, Limit):
            subQry = Query(self.generator)
            (pre, parentSQL) = subQry._buildFrom(df.parents[0])

            limitClause = self.generator.templates["limit"].lower()
            (lPre, limitExpr) = self.generator._exprToSQL(df.limit)
            pre += lPre

            if limitClause == "top":
                limitSQL = f"SELECT TOP {limitExpr} {df.alias}.* FROM ({parentSQL}) {df.alias}"
            elif limitClause == "limit":
                limitSQL = f"SELECT {df.alias}.* FROM ({parentSQL}) {df.alias} LIMIT {limitExpr}"
            else:
                raise ValueError(f"Unknown keyword for LIMIT: {limitClause}")

            if df.offset is not None:
                (oPre, offsetExpr) = self.generator._exprToSQL(df.offset)
                pre += oPre
                limitSQL += f" OFFSET {offsetExpr}"

            return (preCode + pre, limitSQL)

        elif isinstance(df, Ordering):
            subQry = Query(self.generator)
            (pre, parentSQL) = subQry._buildFrom(df.parents[0])

            by = []
            for attr in df.by:
                (exprPre, exprSQL) = self.generator._exprToSQL(attr)
                pre += exprPre
                by.append(exprSQL)
            by = ",".join(by)

            direction = "ASC" if df.ascending else "DESC"

            qry = f"SELECT * FROM ({parentSQL}) {df.alias} ORDER BY {by} {direction}"
            return (preCode + pre, qry)

        else:
            raise ValueError(f"unsupported operator {type(df)}")
    else:
        # no input operator: nothing to generate
        return ([], "")
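# Example of the nested query structure produced by _buildFrom (illustrative only;
# table and column names as well as the tuple variables are made up, and the SQL is
# formatted over several lines for readability). A Table -> Filter -> Projection
# pipeline is rendered inside-out, one subquery per operator:
#
#   SELECT t2.name, t2.salary FROM (
#     SELECT * FROM (
#       SELECT * FROM employees t0
#     ) t1 WHERE t1.salary > 1000
#   ) t2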
def _buildFrom(self, df):
    curr = df
    while curr is not None:
        if isinstance(curr, Table):
            self.table = f"{curr.table} {curr.alias}"
        elif isinstance(curr, Projection):
            if curr.attrs:
                prefixed = [str(attr) for attr in curr.attrs]
                if not self.projections:
                    self.projections = prefixed
                else:
                    # keep only the projections that both lists have in common
                    self.projections = list(set(self.projections).intersection(set(prefixed)))
            if curr.doDistinct:
                self.doDistinct = True
        elif isinstance(curr, Filter):
            exprStr = self._exprToSQL(curr.expr)
            self.filters.append(exprStr)
        elif isinstance(curr, Join):
            if isinstance(curr.right, Table):
                rightSQL = curr.right.table
                rtVar = curr.right.alias
            else:
                subQry = Query()
                rightSQL = f"({subQry._buildFrom(curr.right)})"
                rtVar = GrizzlyGenerator._incrAndGetTupleVar()
                # curr.right.alias = rtVar
                curr.right.setAlias(rtVar)

            if isinstance(curr.on, Expr):
                onSQL = self._exprToSQL(curr.on)
            else:
                onSQL = f"{curr.alias}.{curr.on[0]} {curr.comp} {rtVar}.{curr.on[1]}"

            joinSQL = f"{curr.how} JOIN {rightSQL} {rtVar} ON {onSQL}"
            self.joins.append(joinSQL)
        elif isinstance(curr, Grouping):
            self.groupcols = [str(attr) for attr in curr.groupCols]
            if curr.aggFunc:
                (func, col) = curr.aggFunc
                funcCode = SQLGenerator._getFuncCode(func, col, curr)
                self.groupagg.add(funcCode)

        if curr.parents is None:
            curr = None
        else:
            curr = curr.parents[0]

    joins = ""
    while self.joins:
        joins += " " + self.joins.pop()

    projs = "*"
    if self.projections:
        if self.groupcols and not set(self.projections).issubset(self.groupcols):
            raise ValueError("Projection list must be subset of group columns")
        projs = ', '.join(self.projections)

    grouping = ""
    if self.groupcols:
        theColRefs = ", ".join([str(e) for e in self.groupcols])
        grouping += f" GROUP BY {theColRefs}"

        if projs == "*":
            projs = theColRefs

    if len(self.groupagg) > 0:
        if projs == "*":
            projs = ",".join(self.groupagg)
        else:
            projs = projs + "," + ",".join(self.groupagg)

    if self.doDistinct:
        projs = "distinct " + projs

    where = ""
    if len(self.filters) > 0:
        exprs = " AND ".join([str(e) for e in self.filters])
        where += f" WHERE {exprs}"

    qrySoFar = f"SELECT {projs} FROM {self.table}{joins}{where}{grouping}"
    return qrySoFar
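# Example of the flat query produced by this older, merging variant of _buildFrom
# (illustrative only; names are made up). Instead of one subquery per operator, the
# projections, filters, joins and groupings collected while walking up the tree are
# merged into a single statement:
#
#   SELECT department,count(id) FROM employees t0
#   inner JOIN departments t1 ON t0.dep_id = t1.id
#   WHERE t0.salary > 1000 GROUP BY department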
def __init__(self, limit: int, offset: int, parent):
    super().__init__(parent.columns, parent, GrizzlyGenerator._incrAndGetTupleVar())
    self.limit = limit
    self.offset = offset
def __iter__(self):
    return GrizzlyGenerator.iterator(self)
def collect(self, includeHeader=False):
    return GrizzlyGenerator.collect(self, includeHeader)
def __init__(self, expr: Expr, parent: DataFrame):
    super().__init__(parent.columns, parent, GrizzlyGenerator._incrAndGetTupleVar())
    self.expr = self.updateRef(expr)
def show(self, pretty=False, delim=",", maxColWidth=20, limit=20):
    print(GrizzlyGenerator.toString(self, delim, pretty, maxColWidth, limit))
def generate(self):
    return GrizzlyGenerator.generate(self)