Exemple #1
0
    def __init__(self, groupCols: List, parent: DataFrame):
        """
        Create a grouping operator on top of ``parent``.

        :param groupCols: grouping columns; each entry is a column name
            (str), a ``ColRef`` or an ``Expr``
        :param parent: the DataFrame this grouping is applied to
        :raises ExpressionException: if an entry has any other type
        """
        self.having = []
        self.groupCols = []
        aliasNames = [cc.alias for cc in parent.computedCols]

        for entry in groupCols:
            if isinstance(entry, str):
                # a name matching a computed column's alias gets a reference
                # without an owning DataFrame; any other name is bound to
                # this grouping
                owner = None if entry in aliasNames else self
                entry = ColRef(entry, owner)
            elif isinstance(entry, ColRef):
                self.updateRef(entry)
            elif not isinstance(entry, Expr):
                raise ExpressionException(
                    f"invalid grouping column type: {type(entry)}")

            self.groupCols.append(entry)

        self.aggFunc = []

        tupleVar = GrizzlyGenerator._incrAndGetTupleVar()
        super().__init__(self.groupCols, parent, tupleVar)
Exemple #2
0
 def __init__(self, parent, other, on, how, comp):
     """
     Create a join of ``parent`` (left side) with ``other`` (right side).

     The operator's column list is the concatenation of both inputs'
     columns. ``on``, ``how`` and ``comp`` are stored unchanged.
     """
     tupleVar = GrizzlyGenerator._incrAndGetTupleVar()
     combinedCols = parent.columns + other.columns
     super().__init__(combinedCols, parent, tupleVar)
     self.right, self.on, self.how, self.comp = other, on, how, comp
Exemple #3
0
 def first(self):
     """
     Fetch a single tuple of this DataFrame via ``GrizzlyGenerator.fetchone``.

     :return: the fetched tuple, or ``None`` when nothing was fetched
         (``fetchone`` returned ``None``) or the fetched tuple is empty
     """
     tup = GrizzlyGenerator.fetchone(self)
     # Check for None before calling len(): DB-API style fetchone() reports
     # "no row" as None, which would otherwise raise a TypeError here.
     if tup is None or len(tup) < 1:
         return None
     return tup
Exemple #4
0
    def __init__(self, table, index, schema):
        """
        Create a leaf operator for the table named ``table``.

        :param table: name of the table, stored as ``self.table``
        :param index: falsy, a string or a list; passed on to the base class
        :param schema: accepted but not used by this constructor
        :raises ValueError: if ``index`` is truthy and neither str nor list
        """
        self.table = table
        tupleVar = GrizzlyGenerator._incrAndGetTupleVar()

        indexTypeOk = isinstance(index, (str, list))
        if index and not indexTypeOk:
            raise ValueError(
                "index definition must be a string or list of strings")

        # a table has no columns of its own and no parent operator
        super().__init__([], None, tupleVar, index)
Exemple #5
0
 def __init__(self, file, colDefs, hasHeader, delimiter, format,
              fdw_extension_name):
     """
     Create a leaf operator for an external (file based) table.

     All arguments are stored unchanged (``file`` as ``self.filenames``).
     The table name is derived from the freshly generated tuple variable.
     """
     self.filenames, self.colDefs = file, colDefs
     self.hasHeader, self.delimiter = hasHeader, delimiter
     self.format, self.fdw_extension_name = format, fdw_extension_name

     tupleVar = GrizzlyGenerator._incrAndGetTupleVar()
     self.table = f"temp_ext_table{tupleVar}"

     # no columns of its own and no parent operator
     super().__init__([], None, tupleVar)
Exemple #6
0
    def itertuples(self, name="Grizzly", index=None):
        """
        Yield every row of this DataFrame as a namedtuple.

        The first item delivered by ``GrizzlyGenerator.iterator`` (requested
        with ``includeHeader=True``) supplies the field names; each following
        item becomes one namedtuple instance.

        :param name: type name of the generated namedtuple class
        :param index: accepted but not used by this method
        """
        rows = GrizzlyGenerator.iterator(self, includeHeader=True)

        # the header row defines the namedtuple's fields
        RowType = namedtuple(name, next(rows))

        yield from map(RowType._make, rows)
Exemple #7
0
    def _generateAggCode(self, df, f) -> Tuple[List[str], str]:
        # aggregation over a table is performed in a way that the actual query
        # that was built is executed as an inner query and around that, we
        # compute the aggregation
        (pre, innerSQL) = self.generate(df)
        if df.parents:
            df.alias = GrizzlyGenerator._incrAndGetTupleVar()
            (fPre, funcCode) = self._generateFuncCall(f)
            aggSQL = f"SELECT {funcCode} FROM ({innerSQL}) as {df.alias}"

        else:
            (fPre, funcCode) = self._generateFuncCall(f)
            aggSQL = f"SELECT {funcCode} FROM {df.table} {df.alias}"

        return (pre + fPre, aggSQL)
Exemple #8
0
    def shape(self):
        """
        Return the dimensionality of the DataFrame as a tuple.

        The result is ``(number of columns, number of rows)``. It is obtained
        by fetching one row of a projection that holds all columns plus a
        computed row count.
        """
        countCall = FuncCall("count", ["*"], None, "rowcount")
        countCol = ComputedCol(self.project(countCall), None)

        # a single row is sufficient: its width gives the column count and
        # its last value the row count
        probeDF = self.project(['*', countCol]).limit(1)
        firstRow = GrizzlyGenerator.fetchone(probeDF)

        numRows = firstRow[-1]  # the count is the last column of the row
        numCols = len(firstRow) - 1  # do not count the added count column

        return (numCols, numRows)
    def _gen_agg(self, func, col, df):
        """
        Build the SQL that applies aggregate ``func`` on ``col`` to ``df``.

        If ``df`` has parents, the query generated for ``df`` becomes an
        inner query and the aggregation is computed around it; ``df`` gets a
        new alias for that purpose. Otherwise the aggregation selects
        directly from ``df.table``.

        :return: the aggregation query string
        """
        nested = bool(df.parents)
        if nested:
            innerSQL = self.generate(df)
            # the alias is replaced before the function code is generated
            df.alias = GrizzlyGenerator._incrAndGetTupleVar()

        funcCode = SQLGenerator._getFuncCode(func, col, df)

        if nested:
            source = f"({innerSQL}) as {df.alias}"
        else:
            source = f"{df.table} {df.alias}"

        return f"SELECT {funcCode} FROM {source}"
Exemple #10
0
    def __init__(self, by: list, ascending: bool, parent):
        """
        Create an ordering operator on top of ``parent``.

        :param by: sort keys; each entry is a ``Projection`` (its first
            column is used), a column name (str) or a ``ColRef``
        :param ascending: stored as ``self.ascending``
        :param parent: the DataFrame to sort; its columns are inherited
        :raises ExpressionException: if an entry of ``by`` has another type
        """
        super().__init__(parent.columns, parent,
                         GrizzlyGenerator._incrAndGetTupleVar())

        def toSortRef(entry):
            # turn one sort key into a column reference of this operator
            if isinstance(entry, Projection):
                return self.updateRef(entry.columns[0])
            if isinstance(entry, str):
                return ColRef(entry, self)
            if isinstance(entry, ColRef):
                return self.updateRef(entry)
            raise ExpressionException(
                f"unsupported type for oder by value: {type(entry)}")

        self.by = [toSortRef(entry) for entry in by]
        self.ascending = ascending
Exemple #11
0
    def __init__(self, columns, parent: DataFrame, doDistinct=False):
        """
        Create a projection operator on top of ``parent``.

        :param columns: ``None`` (no columns), a list of columns, or a single
            column which is wrapped into a list
        :param parent: the DataFrame the projection is applied to
        :param doDistinct: stored as ``self.doDistinct``
        """
        self.doDistinct = doDistinct

        # normalise the argument to a list
        if columns is None:
            wanted = []
        elif isinstance(columns, list):
            wanted = columns
        else:
            wanted = [columns]

        # pass every column through updateRef and keep what it returns
        rebound = [self.updateRef(entry) for entry in wanted]

        tupleVar = GrizzlyGenerator._incrAndGetTupleVar()
        super().__init__(rebound, parent, tupleVar)
Exemple #12
0
    def _exec_or_add_aggr(self, f: FuncCall):
        """
        Attach the aggregate ``f`` to a grouping or execute it right away.

        On a ``Grouping`` whose grouping columns are not among the input
        columns of ``f``, the function is added to the grouping and the
        grouping itself is returned, so no new nested query is created.
        In every other case the aggregation is an action and the result of
        ``GrizzlyGenerator.aggregate`` is returned.
        """
        if isinstance(self, Grouping):
            # count the input columns of f that are grouping columns
            overlap = 0
            for fCol in f.inputCols:
                for groupCol in self.groupCols:
                    if fCol.column == groupCol.column:
                        overlap += 1

            if overlap == 0:
                self._addAggFunc(f)
                return self

        # no grouping, or f works on a grouping column: run it as an action
        return GrizzlyGenerator.aggregate(self, f)
Exemple #13
0
def close():
    """Call ``GrizzlyGenerator.close()``; nothing is returned."""
    GrizzlyGenerator.close()
Exemple #14
0
    def _buildFrom(self, df) -> Tuple[List[str], str, str]:

        if df is not None:

            computedCols = []
            preCode = []

            for x in df.computedCols:
                (exprPre, exprSQL) = self.generator._exprToSQL(x)
                preCode += exprPre
                computedCols.append(exprSQL)

            computedCols = ",".join(computedCols)

            if isinstance(df, Table):
                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                return (preCode, f"SELECT {proj} FROM {df.table} {df.alias}")

            elif isinstance(df, ExternalTable):
                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                tablePre = SQLGenerator._generateCreateExtTable(
                    df, self.generator.templates)
                qry = f"SELECT {proj} FROM {df.table} {df.alias}"

                return (preCode + tablePre, qry)

            elif isinstance(df, Projection):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                prefixed = "*"
                if df.columns:
                    prefixed = []

                    for attr in df.columns:
                        (ePre, exprSQL) = self.generator._exprToSQL(attr)

                        pre += ePre
                        prefixed.append(exprSQL)

                    prefixed = ",".join(prefixed)

                if computedCols:
                    prefixed += "," + computedCols

                qry = f"SELECT { 'DISTINCT ' if df.doDistinct else ''}{prefixed} FROM ({parentSQL}) {df.alias}"

                return (preCode + pre, qry)

            elif isinstance(df, Filter):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                (exprPre, exprStr) = self.generator._exprToSQL(df.expr)

                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                qry = f"SELECT {proj} FROM ({parentSQL}) {df.alias} WHERE {exprStr}"

                return (preCode + pre + exprPre, qry)

            elif isinstance(df, Join):

                lQry = Query(self.generator)
                (lpre, lparentSQL) = lQry._buildFrom(df.leftParent())

                rQry = Query(self.generator)
                (rpre, rparentSQL) = rQry._buildFrom(df.rightParent())

                lAlias = df.leftParent().alias
                rAlias = df.rightParent().alias

                if isinstance(df.on, ColRef):
                    (exprPre, onSQL) = self.generator._exprToSQL(df.on)
                    onSQL = f"USING ({onSQL})"
                    preCode += exprPre
                elif isinstance(df.on, LogicExpr) or isinstance(
                        df.on, BoolExpr):
                    (exprPre, onSQL) = self.generator._exprToSQL(df.on)
                    onSQL = "ON " + onSQL
                    preCode += exprPre
                elif isinstance(df.on, list):

                    if len(df.on) != 2:
                        raise ExpressionException(
                            "on condition must consist of exacltly two columns"
                        )

                    (lOnPre, lOn) = self.generator._exprToSQL(df.on[0])
                    (rOnPre, rOn) = self.generator._exprToSQL(df.on[1])

                    onSQL = f"ON {lOn} {df.comp} {rOn}"
                    preCode += lOnPre
                    preCode += rOnPre
                else:
                    onSQL = ""  # let the DB figure it out itself

                proj = "*"
                if computedCols:
                    proj += "," + computedCols

                joinSQL = f"SELECT {proj} FROM ({lparentSQL}) {lAlias} {df.how} JOIN ({rparentSQL}) {rAlias} {onSQL}"

                return (preCode + lpre + rpre, joinSQL)

            elif isinstance(df, Grouping):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                byCols = []
                for attr in df.groupCols:
                    (exprPre, exprSQL) = self.generator._exprToSQL(attr)
                    pre += exprPre
                    byCols.append(exprSQL)

                by = ",".join(byCols)

                funcCode = ""
                for f in df.aggFunc:
                    (fPre, fCode) = self.generator._generateFuncCall(f)
                    pre += fPre
                    funcCode += ", " + fCode

                groupSQL = f"SELECT {by} {funcCode} FROM ({parentSQL}) {df.alias} GROUP BY {by}"

                havings = []
                if df.having:
                    for h in df.having:
                        (hPre, hSQL) = self.generator._exprToSQL(h)
                        pre += hPre
                        havings.append(hSQL)

                    exprStr = " AND ".join(havings)
                    groupSQL += f" HAVING {exprStr}"

                #if the computed column is an aggregate over the groups,
                # it should not be added as an extra query, but rather
                # merged into this projection list
                if computedCols:
                    tVar = GrizzlyGenerator._incrAndGetTupleVar()
                    proj = "*," + computedCols
                    groupSQL = f"SELECT {proj} FROM {groupSQL} {tVar}"

                return (preCode + pre, groupSQL)

            elif isinstance(df, Limit):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                limitClause = self.generator.templates["limit"].lower()

                (lPre, limitExpr) = self.generator._exprToSQL(df.limit)
                pre += lPre

                limitSQL = None
                if limitClause == "top":
                    limitSQL = f"SELECT TOP {limitExpr} {df.alias}.* FROM ({parentSQL}) {df.alias}"
                elif limitClause == "limit":
                    limitSQL = f"SELECT {df.alias}.* FROM ({parentSQL}) {df.alias} LIMIT {limitExpr}"
                else:
                    raise ValueError(
                        f"Unknown keyword for LIMIT: {limitClause}")

                if df.offset is not None:
                    (oPre, offsetExpr) = self.generator._exprToSQL(df.offset)
                    pre += oPre
                    limitSQL += f" OFFSET {offsetExpr}"

                return (preCode + pre, limitSQL)

            elif isinstance(df, Ordering):
                subQry = Query(self.generator)
                (pre, parentSQL) = subQry._buildFrom(df.parents[0])

                by = []
                for attr in df.by:
                    (exprPre, exprSQL) = self.generator._exprToSQL(attr)
                    pre += exprPre
                    by.append(exprSQL)

                by = ",".join(by)
                direction = "ASC" if df.ascending else "DESC"

                qry = f"SELECT * FROM ({parentSQL}) {df.alias} ORDER BY {by} {direction}"

                return (preCode + pre, qry)

            else:
                raise ValueError(f"unsupported operator {type(df)}")

        else:
            return ""
    def _buildFrom(self, df):
        """
        Flatten the operator chain starting at ``df`` into one SELECT query.

        The chain is walked from ``df`` along ``parents[0]`` until an
        operator without parents is reached. While walking, the table,
        projections, filters, joins and grouping information are collected
        in the attributes of this object (``self.table``,
        ``self.projections``, ``self.doDistinct``, ``self.filters``,
        ``self.joins``, ``self.groupcols``, ``self.groupagg``). The query
        string is assembled from them afterwards.

        :param df: the operator to start from
        :return: the SQL query string
        :raises ValueError: if both projections and grouping columns exist
            and the projections are not a subset of the grouping columns
        """

        curr = df
        while curr is not None:

            if isinstance(curr, Table):
                self.table = f"{curr.table} {curr.alias}"

            elif isinstance(curr, Projection):
                if curr.attrs:
                    prefixed = [str(attr) for attr in curr.attrs]
                    if not self.projections:
                        self.projections = prefixed
                    else:
                        # NOTE(review): the result of this intersection is
                        # discarded, so projections found further up the
                        # chain have no effect -- confirm the intended
                        # behaviour before changing it
                        set(self.projections).intersection(set(prefixed))

                if curr.doDistinct:
                    self.doDistinct = True

            elif isinstance(curr, Filter):
                exprStr = self._exprToSQL(curr.expr)
                self.filters.append(exprStr)

            elif isinstance(curr, Join):

                if isinstance(curr.right, Table):
                    rightSQL = curr.right.table
                    rtVar = curr.right.alias
                else:
                    # anything but a plain table is joined as a sub query
                    # with a new alias
                    subQry = Query()
                    rightSQL = f"({subQry._buildFrom(curr.right)})"
                    rtVar = GrizzlyGenerator._incrAndGetTupleVar()
                    curr.right.setAlias(rtVar)

                if isinstance(curr.on, Expr):
                    onSQL = self._exprToSQL(curr.on)
                else:
                    onSQL = f"{curr.alias}.{curr.on[0]} {curr.comp} {rtVar}.{curr.on[1]}"

                joinSQL = f"{curr.how} JOIN {rightSQL} {rtVar} ON {onSQL}"
                self.joins.append(joinSQL)

            elif isinstance(curr, Grouping):
                self.groupcols = [str(attr) for attr in curr.groupCols]

                if curr.aggFunc:
                    (func, col) = curr.aggFunc
                    funcCode = SQLGenerator._getFuncCode(func, col, curr)
                    self.groupagg.add(funcCode)

            if curr.parents is None:
                curr = None
            else:
                curr = curr.parents[0]

        # joins were collected from the end of the chain towards its start,
        # so they are emitted in reverse order
        joins = ""
        while self.joins:
            joins += " " + self.joins.pop()

        projs = "*"
        if self.projections:
            if self.groupcols and not set(self.projections).issubset(
                    self.groupcols):
                raise ValueError(
                    "Projection list must be subset of group columns")

            projs = ', '.join(self.projections)

        grouping = ""
        if self.groupcols:
            theColRefs = ", ".join([str(e) for e in self.groupcols])
            grouping += f" GROUP BY {theColRefs}"

            # without an explicit projection select the grouping columns
            if projs == "*":
                projs = theColRefs

        if len(self.groupagg) > 0:
            if projs == "*":
                # join the collected function codes into a string; using the
                # collection itself would put its repr into the query
                projs = ",".join(self.groupagg)
            else:
                projs = projs + "," + ",".join(self.groupagg)

        if self.doDistinct:
            projs = "distinct " + projs

        where = ""
        if len(self.filters) > 0:
            exprs = " AND ".join([str(e) for e in self.filters])
            where += f" WHERE {exprs}"

        qrySoFar = f"SELECT {projs} FROM {self.table}{joins}{where}{grouping}"
        return qrySoFar
Exemple #16
0
 def __init__(self, limit: int, offset: int, parent):
     """
     Create a limit operator on top of ``parent``.

     The columns of ``parent`` are inherited; ``limit`` and ``offset`` are
     stored unchanged.
     """
     inheritedCols = parent.columns
     tupleVar = GrizzlyGenerator._incrAndGetTupleVar()
     super().__init__(inheritedCols, parent, tupleVar)
     self.limit, self.offset = limit, offset
Exemple #17
0
 def __iter__(self):
     """Return what ``GrizzlyGenerator.iterator`` creates for this object."""
     return GrizzlyGenerator.iterator(self)
Exemple #18
0
 def collect(self, includeHeader=False):
     """
     Return the result of ``GrizzlyGenerator.collect`` for this object.

     :param includeHeader: passed through to ``GrizzlyGenerator.collect``
     """
     return GrizzlyGenerator.collect(self, includeHeader)
Exemple #19
0
 def __init__(self, expr: Expr, parent: DataFrame):
     """
     Create a filter operator on top of ``parent``.

     The columns of ``parent`` are inherited. ``expr`` is passed through
     ``updateRef`` and the result is stored as ``self.expr``.
     """
     inheritedCols = parent.columns
     tupleVar = GrizzlyGenerator._incrAndGetTupleVar()
     super().__init__(inheritedCols, parent, tupleVar)

     reboundExpr = self.updateRef(expr)
     self.expr = reboundExpr
Exemple #20
0
 def show(self, pretty=False, delim=",", maxColWidth=20, limit=20):
     """
     Print the string that ``GrizzlyGenerator.toString`` builds for this
     object.

     All parameters are passed through to ``GrizzlyGenerator.toString``.
     """
     rendered = GrizzlyGenerator.toString(self, delim, pretty, maxColWidth,
                                          limit)
     print(rendered)
Exemple #21
0
 def generate(self):
     """Return the result of ``GrizzlyGenerator.generate`` for this object."""
     return GrizzlyGenerator.generate(self)