def decompSelects(self, selectList):
    """Break every select in ``selectList`` into one Select per decomposed part.

    For each element, its ``selectExpr`` is passed through
    ``ExpressionInfo(...).decomposeCNF()`` and every expression that comes
    back is wrapped in a fresh ``Select`` whose sub-plan is ``None``.

    Returns a flat list of the new selects, in the order the inputs (and
    their decomposed expressions) were visited.
    """
    pieces = []
    for original in selectList:
        conjuncts = ExpressionInfo(original.selectExpr).decomposeCNF()
        # The sub-plan is left unset here; only the expression is carried.
        pieces.extend(Select(None, conjunct) for conjunct in conjuncts)
    return pieces
def placeSelect(self, accuSelect, curr, parent, result):
    """Combine the accumulated predicates into a single Select above ``curr``.

    Args:
        accuSelect: iterable of predicate strings to be and-ed together.
        curr: operator the new Select is placed on top of.
        parent: value recorded next to the new Select in ``result``.
        result: list that receives the ``(newSelect, parent)`` tuple.

    Returns:
        The newly created Select, or ``None`` when ``accuSelect`` is empty
        (in which case ``result`` is left untouched).
    """
    if not accuSelect:
        return None

    # The separator needs surrounding spaces: the previous "and" glued the
    # operands together ("a > 1andb < 2"), which is not a valid expression.
    # Empty strings are skipped so they cannot produce a dangling "and".
    expr = " and ".join(a for a in accuSelect if a)

    newSelect = Select(curr, expr)
    result.append((newSelect, parent))
    return newSelect
def singlePushDown(self, operator):
    """Push the Select/Project at ``operator`` one level further down.

    The sub-plan is processed recursively first.  A Select is then pushed
    through a join (split by which side provides its attributes) or copied
    onto both sides of a UnionAll.  A Project is swapped below a Select,
    split over the two inputs of a join, or copied onto both sides of a
    UnionAll.  UnionAll, join and GroupBy operators only recurse into their
    children; every other operator is returned unchanged.

    Args:
        operator: root of the (sub-)plan to rewrite; it is modified in place.

    Returns:
        The operator that should replace ``operator`` in its parent.
    """
    operatorType = operator.operatorType()

    if operatorType == 'Select':
        selectOperator = operator
        # The result must be stored in ``subPlan``; assigning it to a
        # misspelled ``subplan`` attribute silently discarded the rewritten
        # child whenever the recursion returned a different operator.
        selectOperator.subPlan = self.singlePushDown(selectOperator.subPlan)
        subPlan = selectOperator.subPlan
        subplanType = subPlan.operatorType()

        if subplanType.endswith('Join'):
            lhsFields = subPlan.lhsPlan.schema().fields
            rhsFields = subPlan.rhsPlan.schema().fields
            sendToLeft = []
            sendToRight = []
            kept = []
            for selectExpr in ExpressionInfo(
                    selectOperator.selectExpr).decomposeCNF():
                attributes = ExpressionInfo(selectExpr).getAttributes()
                # Classify each conjunct exactly once, by *all* of its
                # attributes.  Deciding per attribute sent cross-side
                # predicates to both inputs, duplicated conjuncts with
                # several attributes and dropped attribute-free ones.
                if all(attr in lhsFields for attr in attributes):
                    sendToLeft.append(selectExpr)
                elif all(attr in rhsFields for attr in attributes):
                    sendToRight.append(selectExpr)
                else:
                    kept.append(selectExpr)

            if sendToLeft:
                subPlan.lhsPlan = self.singlePushDown(
                    Select(subPlan.lhsPlan, ' and '.join(sendToLeft)))
            if sendToRight:
                subPlan.rhsPlan = self.singlePushDown(
                    Select(subPlan.rhsPlan, ' and '.join(sendToRight)))
            if kept:
                # Conjuncts spanning both inputs stay above the join.
                return Select(subPlan, ' and '.join(kept))
            return subPlan

        if subplanType == 'UnionAll':
            subPlan.lhsPlan = self.singlePushDown(
                Select(subPlan.lhsPlan, selectOperator.selectExpr))
            subPlan.rhsPlan = self.singlePushDown(
                Select(subPlan.rhsPlan, selectOperator.selectExpr))
            return subPlan

        # We only push down selects through joins and unions
        return selectOperator

    if operatorType == 'Project':
        projectOperator = operator
        projectOperator.subPlan = self.singlePushDown(
            projectOperator.subPlan)
        subPlan = projectOperator.subPlan
        subplanType = subPlan.operatorType()

        if subplanType == 'Select':
            selectCriteria = ExpressionInfo(
                subPlan.selectExpr).getAttributes()
            # The select can only move above the projection if every
            # attribute it reads is still produced by the projection.
            # NOTE(review): this checks names only; a projection that
            # redefines one of those names would change the predicate's
            # meaning -- confirm callers never rely on that.
            for selection in selectCriteria:
                if selection not in projectOperator.projectExprs:
                    return projectOperator
            # Swap the two operators: Project(Select(X)) -> Select(Project(X)).
            # The previous code dropped the Select and made the plan cyclic.
            projectOperator.subPlan = subPlan.subPlan
            subPlan.subPlan = self.singlePushDown(projectOperator)
            return subPlan

        if subplanType.endswith('Join'):
            lhsFields = subPlan.lhsPlan.schema().fields
            rhsFields = subPlan.rhsPlan.schema().fields
            sendToLeft = {}
            sendToRight = {}
            projectExprs = projectOperator.projectExprs
            # NOTE(review): sides are chosen by output name, as before, not
            # by the attributes the expression reads -- verify this matches
            # how projectExprs is populated.
            for key in projectExprs:
                if key in lhsFields:
                    sendToLeft[key] = projectExprs[key]
                elif key in rhsFields:
                    sendToRight[key] = projectExprs[key]
                else:
                    # This expression has to stay above the join.  Decide
                    # that before touching the inputs, so a partial
                    # projection cannot remove columns it may still need.
                    return projectOperator
            if sendToLeft:
                subPlan.lhsPlan = self.singlePushDown(
                    Project(subPlan.lhsPlan, sendToLeft))
            if sendToRight:
                subPlan.rhsPlan = self.singlePushDown(
                    Project(subPlan.rhsPlan, sendToRight))
            return subPlan

        if subplanType == 'UnionAll':
            subPlan.lhsPlan = self.singlePushDown(
                Project(subPlan.lhsPlan, projectOperator.projectExprs))
            subPlan.rhsPlan = self.singlePushDown(
                Project(subPlan.rhsPlan, projectOperator.projectExprs))
            return subPlan

        return projectOperator

    if operatorType == 'UnionAll' or operatorType.endswith('Join'):
        operator.lhsPlan = self.singlePushDown(operator.lhsPlan)
        operator.rhsPlan = self.singlePushDown(operator.rhsPlan)
        return operator

    if operatorType == 'GroupBy':
        operator.subPlan = self.singlePushDown(operator.subPlan)
        return operator

    return operator
def where(self, conditionExpr):
    """Return a new PlanBuilder that filters this builder's operator.

    The current operator is wrapped in a ``Select`` using ``conditionExpr``.

    Raises:
        ValueError: if this builder has no (truthy) operator to filter.
    """
    if not self.operator:
        raise ValueError("Invalid where clause")
    filtered = Select(self.operator, conditionExpr)
    return PlanBuilder(operator=filtered)
def pushdownSelect(self, op):
    """Move the select ``op`` as far below its sub-plan as the rules allow.

    The sub-plan is optimized first via ``self.pushdownOperator``.  What
    happens next depends on the type of the (optimized) child:

    * GroupBy / TableScan / Project: ``op`` stays where it is.
    * Select / Sort: ``op`` and the child trade places, and ``op`` is
      pushed down again from its new position.
    * UnionAll: a new Select with ``op``'s expression is placed on each side.
    * any type containing "Join": the decomposed expressions are routed to
      the left or right input when that input provides all their
      attributes; the rest stay in a new Select above the join.

    Raises:
        NotImplementedError: for any other child type (after printing a
            diagnostic line).
    """
    # Everything underneath is handled before deciding where ``op`` goes.
    op.subPlan = self.pushdownOperator(op.subPlan)
    child = op.subPlan
    childType = child.operatorType()

    if childType in ("GroupBy", "TableScan", "Project"):
        return op

    if childType == "Select" or childType == "Sort":
        # Two stacked selects are always reordered (the score-based choice
        # is disabled), and a select always moves below a sort.
        op.subPlan = child.subPlan
        child.subPlan = self.pushdownOperator(op)
        return child

    if childType == "UnionAll":
        # Place a copy of op on each side of the union.
        child.lhsPlan = self.pushdownOperator(
            Select(child.lhsPlan, op.selectExpr))
        child.rhsPlan = self.pushdownOperator(
            Select(child.rhsPlan, op.selectExpr))
        return child

    if "Join" in childType:
        conjuncts = ExpressionInfo(op.selectExpr).decomposeCNF()
        leftAttrs = set(child.lhsPlan.schema().fields)
        rightAttrs = set(child.rhsPlan.schema().fields)

        forLeft, forRight, leftover = [], [], []
        for conjunct in conjuncts:
            used = ExpressionInfo(conjunct).getAttributes()
            if used.issubset(leftAttrs):
                bucket = forLeft
            elif used.issubset(rightAttrs):
                bucket = forRight
            else:
                bucket = leftover
            bucket.append(conjunct)

        if forLeft:
            leftSelect = Select(child.lhsPlan, ' and '.join(forLeft))
            child.lhsPlan = self.pushdownOperator(leftSelect)
        if forRight:
            rightSelect = Select(child.rhsPlan, ' and '.join(forRight))
            child.rhsPlan = self.pushdownOperator(rightSelect)

        if not leftover:
            return child
        return Select(child, ' and '.join(leftover))

    print("Unmatched operatorType in pushdownOperator(): " +
          op.operatorType())
    raise NotImplementedError
def pushdownSelections(self, operator):
    """Push Select operators in the plan rooted at ``operator`` downwards.

    TableScan is returned as is.  Project, GroupBy, UnionAll and join
    operators only recurse into their children.  Any other operator is
    treated as a Select and handled according to its child:

    * TableScan: nothing to do.
    * Select: both predicates are merged into one select.
    * GroupBy: the select stays; only the plan below the GroupBy is rewritten.
    * UnionAll: the predicate is copied onto both inputs.
    * Project: project output names in the predicate are replaced by their
      defining expressions and the select moves below the projection.
    * otherwise (assumed to be a join): each decomposed predicate goes to
      the input that provides all its attributes; the rest stay above.

    Args:
        operator: root of the plan to rewrite; modified in place.

    Returns:
        The root of the rewritten plan.
    """
    import re  # local import: used only for the Project substitution below

    opType = operator.operatorType()

    if opType == "TableScan":
        return operator

    if opType == "Project" or opType == "GroupBy":
        operator.subPlan = self.pushdownSelections(operator.subPlan)
        return operator

    if opType == "UnionAll" or opType[-4:] == "Join":
        newlPlan = self.pushdownSelections(operator.lhsPlan)
        newrPlan = self.pushdownSelections(operator.rhsPlan)
        operator.lhsPlan = newlPlan
        operator.rhsPlan = newrPlan
        return operator

    # Everything else is handled as a Select.
    subPlan = operator.subPlan
    subType = subPlan.operatorType()

    if subType == "TableScan":
        return operator

    if subType == "Select":
        # Merge the two stacked selections into one and try again.
        operator.selectExpr = ("(" + operator.selectExpr + ")" + " and " +
                               "(" + subPlan.selectExpr + ")")
        operator.subPlan = subPlan.subPlan
        return self.pushdownSelections(operator)

    if subType == "GroupBy":
        # Selections are not moved through a GroupBy, since it may create
        # new field names.
        subPlan.subPlan = self.pushdownSelections(subPlan.subPlan)
        return operator

    if subType == "UnionAll":
        subPlan.lhsPlan = Select(subPlan.lhsPlan, operator.selectExpr)
        subPlan.rhsPlan = Select(subPlan.rhsPlan, operator.selectExpr)
        subPlan.validateSchema()
        return self.pushdownSelections(subPlan)

    if subType == "Project":
        # Rewrite the predicate in terms of the projection's inputs.  Only
        # plain textual substitution of output names is supported.
        substitutions = {
            k: "(" + v1 + ")"
            for (k, (v1, _)) in subPlan.projectExprs.items()
        }
        if substitutions:
            # One pass over whole words only: chained str.replace() also
            # rewrote parts of longer names and text that an earlier
            # replacement had just inserted.
            names = sorted(substitutions, key=len, reverse=True)
            pattern = re.compile(
                r"\b(?:" + "|".join(re.escape(n) for n in names) + r")\b")
            # The rewritten predicate has to be stored on the select; the
            # previous code computed it and then threw it away.
            operator.selectExpr = pattern.sub(
                lambda match: substitutions[match.group(0)],
                operator.selectExpr)
        operator.subPlan = subPlan.subPlan
        subPlan.subPlan = operator
        return self.pushdownSelections(subPlan)

    # Remaining case: move the selections down into the join's inputs.
    lhsPlanNames = subPlan.lhsPlan.schema().fields
    rhsPlanNames = subPlan.rhsPlan.schema().fields
    lhsExprs = []
    rhsExprs = []
    remExprs = []
    for expr in ExpressionInfo(operator.selectExpr).decomposeCNF():
        # Keep only names that are fields of one of the inputs (an
        # attribute present on both sides is listed twice, as before).
        attributes = []
        for var in ExpressionInfo(expr).getAttributes():
            if var in lhsPlanNames:
                attributes.append(var)
            if var in rhsPlanNames:
                attributes.append(var)
        if self.isSubList(attributes, lhsPlanNames):
            lhsExprs.append(expr)
        elif self.isSubList(attributes, rhsPlanNames):
            rhsExprs.append(expr)
        else:
            remExprs.append(expr)

    lhsSelectExpr = " and ".join("(" + e + ")" for e in lhsExprs)
    rhsSelectExpr = " and ".join("(" + e + ")" for e in rhsExprs)
    remSelectExpr = " and ".join("(" + e + ")" for e in remExprs)
    pushLeft = lhsSelectExpr != ""
    pushRight = rhsSelectExpr != ""

    if remSelectExpr == "":
        if pushLeft and not pushRight:
            # The whole selection belongs to the left input: move it there.
            operator.subPlan = subPlan.lhsPlan
            operator.selectExpr = lhsSelectExpr
            subPlan.lhsPlan = operator
        elif pushRight and not pushLeft:
            operator.subPlan = subPlan.rhsPlan
            operator.selectExpr = rhsSelectExpr
            subPlan.rhsPlan = operator
        else:
            # Only wrap a side that actually received a predicate, so no
            # Select with an empty expression is ever created.
            if pushLeft:
                subPlan.lhsPlan = Select(subPlan.lhsPlan, lhsSelectExpr)
            if pushRight:
                subPlan.rhsPlan = Select(subPlan.rhsPlan, rhsSelectExpr)
        return self.pushdownSelections(subPlan)

    # Some predicates span both inputs and must stay above the join.
    operator.selectExpr = remSelectExpr
    if pushLeft:
        subPlan.lhsPlan = Select(subPlan.lhsPlan, lhsSelectExpr)
    if pushRight:
        subPlan.rhsPlan = Select(subPlan.rhsPlan, rhsSelectExpr)
    if subPlan.validateJoin():
        subPlan.initializeSchema()
    operator.subPlan = self.pushdownSelections(subPlan)
    return operator
def pickJoinOrder(self, plan):
    """Build a join order greedily, always joining the cheapest pair next.

    One plan per base relation is created (a TableScan, with the
    single-relation selections from ``selectTablesDict`` on top when
    present).  While more than one plan remains, every pair of remaining
    plans is tried in both orders with the "block-nested-loops" and
    "nested-loops" methods; each candidate is prepared, sampled and costed,
    and the cheapest one replaces the two plans it was built from.

    If the input plan's root is a GroupBy it is re-created on top of the
    result, and a Project is added when the resulting schema differs from
    the input plan's schema.

    ``self.reportPlanCount`` is reset and incremented by four per pair.

    Args:
        plan: the plan whose joins should be reordered.

    Returns:
        The new plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # Both dicts map a tuple of relations to the expressions that involve
    # exactly those relations.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    self.reportPlanCount = 0

    worklist = []
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        table.prepare(self.db)
        if (r, ) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r, )])
            select = Select(table, selectString)
            select.prepare(self.db)
            worklist.append(Plan(root=select))
        else:
            worklist.append(Plan(root=table))

    while len(worklist) > 1:
        bestJoin = None
        bestCost = None
        sourcePair = None
        for pair in itertools.combinations(worklist, 2):
            op1 = pair[0].root
            op2 = pair[1].root
            selectExpr = self.createExpression(pair[0].relations(),
                                               pair[1].relations(),
                                               selectTablesDict)
            joinExpr = self.createExpression(pair[0].relations(),
                                             pair[1].relations(),
                                             joinTablesDict)

            # Both input orders for both methods, in a fixed order so that
            # ties keep resolving to the earliest candidate.
            joinOps = [
                Join(lhs, rhs, expr=joinExpr, method=method)
                for method in ("block-nested-loops", "nested-loops")
                for (lhs, rhs) in ((op1, op2), (op2, op1))
            ]
            if selectExpr == "True":
                joinList = joinOps
            else:
                joinList = [Select(j, selectExpr) for j in joinOps]

            for j in joinList:
                joinplan = Plan(root=j)
                joinplan.prepare(self.db)
                joinplan.sample(100)
                # Cost each candidate once, right after sampling; the best
                # plan's cost is remembered rather than re-evaluated after
                # the sample files have been cleared.
                cost = joinplan.cost(True)
                if bestJoin is None or cost < bestCost:
                    bestJoin = joinplan
                    bestCost = cost
                    sourcePair = pair
            self.reportPlanCount += 4
            self.clearSampleFiles()

        worklist.remove(sourcePair[0])
        worklist.remove(sourcePair[1])
        worklist.append(bestJoin)

    # Re-attach whatever sat above the joins in the original plan.
    newPlan = worklist[0]
    if isGroupBy:
        newGroupBy = GroupBy(newPlan.root,
                             groupSchema=plan.root.groupSchema,
                             aggSchema=plan.root.aggSchema,
                             groupExpr=plan.root.groupExpr,
                             aggExprs=plan.root.aggExprs,
                             groupHashFn=plan.root.groupHashFn)
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        # Restore the original output columns with a pass-through Project.
        projectDict = {}
        for f, t in outputSchema.schema():
            projectDict[f] = (f, t)
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def pickJoinOrder(self, plan):
    """Pick a join order by dynamic programming over sets of relations.

    Pass 1 stores one plan per relation in ``optDict`` (a TableScan, with
    the single-relation selections from ``selectTablesDict`` on top when
    present).  Pass ``n`` considers every combination of ``n`` relations:
    for each split returned by ``self.getCombos`` the stored plans of the
    two parts are joined with both the "block-nested-loops" and
    "nested-loops" methods, prepared, sampled and costed; the cheapest plan
    is stored under the sorted relation tuple.

    If the input plan's root is a GroupBy it is re-created on top of the
    result, and a Project is added when the resulting schema differs from
    the input plan's schema.

    ``self.reportPlanCount`` is reset, then incremented by one per base
    relation and by two per split.

    Args:
        plan: the plan whose joins should be reordered.

    Returns:
        The new plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # Both dicts map a tuple of relations to the expressions that involve
    # exactly those relations.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    optDict = {}
    self.reportPlanCount = 0

    for npass in range(1, len(relations) + 1):
        if npass == 1:
            for r in relations:
                table = TableScan(r, self.db.relationSchema(r))
                if (r, ) in selectTablesDict:
                    selectString = self.combineSelects(
                        selectTablesDict[(r, )])
                    optDict[(r, )] = Plan(root=Select(table, selectString))
                else:
                    optDict[(r, )] = Plan(root=table)
                self.reportPlanCount += 1
            continue

        for c in itertools.combinations(relations, npass):
            fullList = sorted(c)
            bestJoin = None
            bestCost = None
            for subcombo in self.getCombos(fullList):
                complement = self.getComplement(fullList, subcombo)
                leftOps = optDict[tuple(complement)].root
                rightOps = optDict[tuple(subcombo)].root
                selectExpr = self.createExpression(complement, subcombo,
                                                   selectTablesDict)
                joinExpr = self.createExpression(complement, subcombo,
                                                 joinTablesDict)

                candidates = []
                for method in ("block-nested-loops", "nested-loops"):
                    root = Join(leftOps, rightOps, expr=joinExpr,
                                method=method)
                    # Only wrap the join when there is a real predicate;
                    # previously a Select was built and then thrown away
                    # for the trivial "True" expression.
                    if selectExpr != "True":
                        root = Select(root, selectExpr)
                    candidate = Plan(root=root)
                    candidate.prepare(self.db)
                    candidate.sample(100)
                    candidates.append(candidate)

                # Cost each candidate once, after both have been sampled.
                joinBnlj, joinNlj = candidates
                bnljCost = joinBnlj.cost(True)
                nljCost = joinNlj.cost(True)
                # On a tie the nested-loops plan is preferred, as before.
                if bnljCost < nljCost:
                    winner, winnerCost = joinBnlj, bnljCost
                else:
                    winner, winnerCost = joinNlj, nljCost
                if bestJoin is None or winnerCost < bestCost:
                    bestJoin = winner
                    bestCost = winnerCost

                self.reportPlanCount += 2
                self.clearSampleFiles()
            optDict[tuple(fullList)] = bestJoin

    # Re-attach whatever sat above the joins in the original plan.
    newPlan = optDict[tuple(sorted(relations))]
    if isGroupBy:
        newGroupBy = GroupBy(newPlan.root,
                             groupSchema=plan.root.groupSchema,
                             aggSchema=plan.root.aggSchema,
                             groupExpr=plan.root.groupExpr,
                             aggExprs=plan.root.aggExprs,
                             groupHashFn=plan.root.groupHashFn)
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        # Restore the original output columns with a pass-through Project.
        projectDict = {}
        for f, t in outputSchema.schema():
            projectDict[f] = (f, t)
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def pushdownHelper(self, operator):
    """Push the Project/Select at ``operator`` below its sub-plan.

    The sub-plan is processed recursively first.  A Project is swapped
    below a Select, split over the two inputs of a join, or copied onto
    both sides of a UnionAll.  A Select is swapped below a Sort or split
    over the two inputs of a join.  UnionAll, join, GroupBy and Sort
    operators only recurse into their children; anything else is returned
    unchanged.

    Args:
        operator: root of the (sub-)plan to rewrite; it is modified in place.

    Returns:
        The operator that should replace ``operator`` in its parent.
    """
    operatorType = operator.operatorType()

    if operatorType == "Project":
        operator.subPlan = self.pushdownHelper(operator.subPlan)
        subPlan = operator.subPlan
        subplanType = subPlan.operatorType()

        if subplanType == "Select":
            # The select can only sit above the projection if every
            # attribute it reads is one of the projection's outputs.
            keys = operator.projectExprs.keys()
            for attr in ExpressionInfo(subPlan.selectExpr).getAttributes():
                if attr not in keys:
                    return operator
            # Swap the operators: Project(Select(X)) -> Select(Project(X)).
            # The select must be kept and returned as the new root.
            operator.subPlan = subPlan.subPlan
            subPlan.subPlan = self.pushdownHelper(operator)
            return subPlan

        if subplanType[-4:] == "Join":
            left = subPlan.lhsPlan.schema().fields
            right = subPlan.rhsPlan.schema().fields
            leftProject = {}
            rightProject = {}
            for (attribute, (expr, _)) in operator.projectExprs.items():
                pros = ExpressionInfo(expr).getAttributes()
                if all(e in left for e in pros):
                    leftProject[attribute] = operator.projectExprs[attribute]
                elif all(e in right for e in pros):
                    rightProject[attribute] = operator.projectExprs[attribute]
                else:
                    # Needs columns from both inputs, so the projection has
                    # to stay above the join.  Decide this before touching
                    # the inputs so nothing it needs gets projected away.
                    return operator
            # An empty dict is falsy: only project a side that got columns.
            if leftProject:
                subPlan.lhsPlan = self.pushdownHelper(
                    Project(subPlan.lhsPlan, leftProject))
            if rightProject:
                subPlan.rhsPlan = self.pushdownHelper(
                    Project(subPlan.rhsPlan, rightProject))
            return subPlan

        if subplanType == "UnionAll":
            # Each side gets its own copy of the projection.
            subPlan.lhsPlan = self.pushdownHelper(
                Project(subPlan.lhsPlan, operator.projectExprs))
            subPlan.rhsPlan = self.pushdownHelper(
                Project(subPlan.rhsPlan, operator.projectExprs))
            return subPlan

        return operator

    if operatorType == "Select":
        operator.subPlan = self.pushdownHelper(operator.subPlan)
        subPlan = operator.subPlan
        subplanType = subPlan.operatorType()

        # `subplanType == "Sort" or "sort"` was always true, which made the
        # join branch below unreachable.
        if subplanType == "Sort":
            # Swap the operators: Select(Sort(X)) -> Sort(Select(X)).
            operator.subPlan = subPlan.subPlan
            subPlan.subPlan = self.pushdownHelper(operator)
            return subPlan

        if subplanType[-4:] == "Join":
            leftAttributes = set(subPlan.lhsPlan.schema().fields)
            rightAttributes = set(subPlan.rhsPlan.schema().fields)
            leftExpress = []
            rightExpress = []
            unpushedExpress = []
            for expr in ExpressionInfo(operator.selectExpr).decomposeCNF():
                attrs = set(ExpressionInfo(expr).getAttributes())
                # Collect the expression itself (not its attribute set) in
                # the list for the side that provides all its attributes.
                if attrs.issubset(leftAttributes):
                    leftExpress.append(expr)
                elif attrs.issubset(rightAttributes):
                    rightExpress.append(expr)
                else:
                    unpushedExpress.append(expr)

            if leftExpress:
                subPlan.lhsPlan = self.pushdownHelper(
                    Select(subPlan.lhsPlan, ' and '.join(leftExpress)))
            if rightExpress:
                subPlan.rhsPlan = self.pushdownHelper(
                    Select(subPlan.rhsPlan, ' and '.join(rightExpress)))
            if unpushedExpress:
                # Predicates spanning both inputs stay above the join.
                return Select(subPlan, ' and '.join(unpushedExpress))
            return subPlan

        return operator

    if operatorType == "UnionAll" or operatorType[-4:] == "Join":
        operator.lhsPlan = self.pushdownHelper(operator.lhsPlan)
        operator.rhsPlan = self.pushdownHelper(operator.rhsPlan)
        return operator

    if operatorType == "GroupBy" or operatorType == "Sort":
        operator.subPlan = self.pushdownHelper(operator.subPlan)
        return operator

    # Any other operator type is left as is.
    return operator