def getExprDicts(self, plan, fieldDict): q = [] q.append(plan.root) selectTablesDict = { } # mapping of relation list to list of exprs using them: [A,B] -> [a < b, etc] joinTablesDict = {} # same thing but for joins, not selects while len(q) > 0: currNode = q.pop() if (currNode.operatorType() == "Select"): selectExprList = ExpressionInfo( currNode.selectExpr).decomposeCNF() for selectExpr in selectExprList: attrList = ExpressionInfo(selectExpr).getAttributes() sourceList = [] for attr in attrList: source = fieldDict[attr] if source not in sourceList: sourceList.append(source) sourceTuple = tuple(sorted(sourceList)) if sourceTuple not in selectTablesDict: selectTablesDict[sourceTuple] = [] selectTablesDict[sourceTuple].append(selectExpr) elif "Join" in currNode.operatorType(): joinExprList = ExpressionInfo(currNode.joinExpr).decomposeCNF() for joinExpr in joinExprList: attrList = ExpressionInfo(joinExpr).getAttributes() sourceList = [] for attr in attrList: source = fieldDict[attr] if source not in sourceList: sourceList.append(source) sourceTuple = tuple(sorted(sourceList)) if sourceTuple not in joinTablesDict: joinTablesDict[sourceTuple] = [] joinTablesDict[sourceTuple].append(joinExpr) if len(currNode.inputs()) > 1: q.append(currNode.lhsPlan) q.append(currNode.rhsPlan) elif len(currNode.inputs()) == 1: q.append(currNode.subPlan) return (joinTablesDict, selectTablesDict)
def reorderSelProj(self, operator): if operator.operatorType() == "TableScan": return operator elif (operator.operatorType() == "Project" or operator.operatorType() == "GroupBy"): operator.subPlan = self.reorderSelProj(operator.subPlan) return operator elif (operator.operatorType() == "UnionAll" or operator.operatorType()[-4:] == "Join"): operator.lhsPlan = self.reorderSelProj(operator.lhsPlan) operator.rhsPlan = self.reorderSelProj(operator.rhsPlan) return operator else: subPlan = operator.subPlan if subPlan.operatorType() == "Project": subSubPlan = subPlan.subPlan subSubOutput = [k for (k, v) in subSubPlan.schema().schema()] selectFields = [ v for v in ExpressionInfo( operator.selectExpr).getAttributes() ] # we can't filter selectFields because of the getAttributes weakness # We assume that we can prohibit Math.sqrt and etc here. if self.isSubList(selectFields, subSubOutput): operator.subPlan = subSubPlan subPlan.subPlan = operator return self.reorderSelProj(subPlan) else: operator.subPlan = self.reorderSelProj(operator.subPlan) return operator else: operator.subPlan = self.reorderSelProj(operator.subPlan) return operator
def decompSelects(self, selectList): decompList = [] for s in selectList: exprList = ExpressionInfo(s.selectExpr).decomposeCNF() for e in exprList: select = Select(None, e) decompList.append(select) return decompList
def relativeRelations(self, relIds, op): fields = ExpressionInfo(op.joinExpr).getAttributes() relativeR = set() for f in fields: for r in relIds: if f in self.db.relationSchema(r).fields: relativeR.add(r) break return frozenset(relativeR)
def get_best_join(self, candidates, required_joins): best_plan_cost = None best_plan = None for left, right in candidates: relevant_expr = None # Find the joinExpr corresponding to the current join candidate. If there is none, it's a # cartesian product. for join in required_joins: names = ExpressionInfo(join.joinExpr).getAttributes() if set(join.rhsSchema.fields).intersection(names) and set(join.lhsSchema.fields).intersection(names): relevant_expr = join.joinExpr break else: relevant_expr = 'True' # Construct a join plan for the current candidate, for each possible join algorithm. # TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods. for algorithm in ["nested-loops", "block-nested-loops"]: test_plan = Plan(root = Join( lhsPlan = left, rhsPlan = right, method = algorithm, expr = relevant_expr )) # Prepare and run the plan in sampling mode, and get the estimated cost. test_plan.prepare(self.db) test_plan.sample(1.0) cost = test_plan.cost(estimated = True) # Update running best. if best_plan_cost is None or cost < best_plan_cost: best_plan_cost = cost best_plan = test_plan # Need to return the root operator rather than the plan itself, since it's going back into the # table. return best_plan.root
def decodeJoinExprs(self, operator): lst = [] if self.isUnaryPath(operator): return lst else: if operator.operatorType()[-4:] == "Join": # The Join type we support: # "nested-loops", "block-nested-loops", "hash" # indexed join cannot work. if operator.joinMethod == "nested-loops" or operator.joinMethod == "block-nested-loops": lst.append( tuple( ExpressionInfo(operator.joinExpr).getAttributes())) elif operator.joinMethod == "hash": lst.append((operator.lhsKeySchema.fields[0], operator.rhsKeySchema.fields[0])) else: raise ValueError( "Join method not supported by the optimizer") lst += self.decodeJoinExprs(operator.lhsPlan) lst += self.decodeJoinExprs(operator.rhsPlan) return lst
def singlePushDown(self, operator): if operator.operatorType() == 'Select': selectOperator = operator selectOperator.subplan = self.singlePushDown( selectOperator.subPlan) subPlan = selectOperator.subPlan subplanType = subPlan.operatorType() if subplanType.endswith('Join'): lhsPlan = subPlan.lhsPlan rhsPlan = subPlan.rhsPlan lhsFields = lhsPlan.schema().fields rhsFields = rhsPlan.schema().fields sendToLeft = '' sendToRight = '' kept = '' selectExprs = ExpressionInfo( selectOperator.selectExpr).decomposeCNF() for selectExpr in selectExprs: attributes = ExpressionInfo(selectExpr).getAttributes() for attr in attributes: if attr in lhsFields: sendToLeft += selectExpr sendToLeft += ' and ' elif attr in rhsFields: sendToRight += selectExpr sendToRight += ' and ' else: kept += selectExpr kept += ' and ' if len(sendToLeft) > 0: sendToLeft = sendToLeft[:-5] selectOperator.subPlan.lhsPlan = self.singlePushDown( Select(selectOperator.subPlan.lhsPlan, sendToLeft)) if len(sendToRight) > 0: sendToRight = sendToRight[:-5] selectOperator.subPlan.rhsPlan = self.singlePushDown( Select(selectOperator.subPlan.rhsPlan, sendToRight)) if len(kept) > 0: kept = kept[:-5] return Select(selectOperator.subplan, kept) elif subplanType == 'UnionAll': subPlan.lhsPlan = self.singlePushDown( Select(subPlan.lhsPlan, selectOperator.selectExpr)) subPlan.rhsPlan = self.singlePushDown( Select(subPlan.rhsPlan, selectOperator.selectExpr)) else: # We only push down selects through joins and unions return selectOperator return selectOperator.subPlan # This is the very last return statement elif operator.operatorType() == 'Project': projectOperator = operator projectOperator.subPlan = self.singlePushDown( projectOperator.subPlan) subPlan = projectOperator.subPlan subplanType = subPlan.operatorType() if subplanType == 'Select': selectCriteria = ExpressionInfo( subPlan.selectExpr).getAttributes() for selection in selectCriteria: if selection not in operator.projectExprs: return operator operator.subPlan = operator.subPlan.subPlan operator.subPlan.subPlan = self.singlePushDown(operator) elif subplanType.endswith('Join'): lhsPlan = subPlan.lhsPlan rhsPlan = subPlan.rhsPlan lhsFields = lhsPlan.schema().fields rhsFields = rhsPlan.schema().fields sendToLeft = {} sendToRight = {} kept = {} projectExprs = projectOperator.projectExprs for key in projectExprs: if key in lhsFields: sendToLeft[key] = projectExprs[key] elif key in rhsFields: sendToRight[key] = projectExprs[key] else: kept[key] = projectExprs[key] if sendToLeft: projectOperator.subPlan.lhsPlan = self.singlePushDown( Project(projectOperator.subPlan.lhsPlan, sendToLeft)) if sendToRight: projectOperator.subPlan.rhsPlan = self.singlePushDown( Project(projectOperator.subPlan.rhsPlan, sendToRight)) if kept: return projectOperator # There are project Exprs that are not join predicates elif subplanType == 'UnionAll': subPlan.lhsPlan = self.singlePushDown( Project(subPlan.lhsPlan, projectOperator.projectExprs)) subPlan.rhsPlan = self.singlePushDown( Project(subPlan.rhsPlan, projectOperator.projectExprs)) else: return operator return projectOperator.subPlan elif operator.operatorType() == 'UnionAll' or operator.operatorType( ).endswith('Join'): operator.lhsPlan = self.singlePushDown(operator.lhsPlan) operator.rhsPlan = self.singlePushDown(operator.rhsPlan) return operator elif operator.operatorType() == 'GroupBy': operator.subPlan = self.singlePushDown(operator.subPlan) return operator else: return operator
def checkJoins(self, joins, allAttributes, cachedLHS, cachedRHS): for join in joins: if join.joinMethod == 'hash': lhsCheck = join.lhsKeySchema.fields # This is a list rhsCheck = join.rhsKeySchema.fields # This is a list joinAttr = lhsCheck + rhsCheck contains = all(x in allAttributes for x in joinAttr) and any( y in cachedRHS.schema().fields for y in joinAttr) # print(lhsCheck, rhsCheck, contains) if contains: keySchema1 = DBSchema('keyschema1', [(lhsCheck[0], 'int')]) keySchema2 = DBSchema('keyschema2', [(rhsCheck[0], 'int')]) hashFn1 = 'hash(' + lhsCheck[0] + ') % 7' hashFn2 = 'hash(' + rhsCheck[0] + ') % 7' joinExpr = lhsCheck[0] + ' == ' + rhsCheck[0] if lhsCheck[0] in cachedLHS.schema().fields: return True, { 'joinMethod': 'hash', 'lhsHashFn': hashFn1, 'lhsKeySchema': keySchema1, 'rhsHashFn': hashFn2, 'rhsKeySchema': keySchema2, 'joinExpr': joinExpr } else: return True, { 'joinMethod': 'hash', 'lhsHashFn': hashFn2, 'lhsKeySchema': keySchema2, 'rhsHashFn': hashFn1, 'rhsKeySchema': keySchema1, 'joinExpr': joinExpr } elif join.joinExpr: joinAttr = ExpressionInfo(join.joinExpr).getAttributes() contains = all(x in allAttributes for x in joinAttr) if contains: joinAttr1 = joinAttr.pop() joinAttr2 = joinAttr.pop() try: joinAttr.pop() except KeyError: pass # print('There are only two attributes in this, as expected') keySchema1 = DBSchema('keySchema1', [([joinAttr1, 'int'])]) keySchema2 = DBSchema('keySchema1', [([joinAttr2, 'int'])]) hashFn1 = 'hash(' + joinAttr1 + ') % 7' hashFn2 = 'hash(' + joinAttr2 + ') % 7' joinExpr = join.joinExpr if joinAttr1 in cachedLHS.schema().fields: return True, { 'joinMethod': 'hash', 'lhsHashFn': hashFn1, 'lhsKeySchema': keySchema1, 'rhsHashFn': hashFn2, 'rhsKeySchema': keySchema2, 'joinExpr': joinExpr } else: return True, { 'joinMethod': 'hash', 'lhsHashFn': hashFn2, 'lhsKeySchema': keySchema2, 'rhsHashFn': hashFn1, 'rhsKeySchema': keySchema1, 'joinExpr': joinExpr } else: return False, None
def pushdownProject(self, op): # First pushdown operators below: op.subPlan = self.pushdownOperator(op.subPlan) if op.subPlan.operatorType() in ["GroupBy", "TableScan"]: return op elif op.subPlan.operatorType() == "Project": # Attempt to remove redundant projections: bools = [ op.subPlan.projectExprs[key][0].isAttribute() for key in op.projectExprs ] if False not in bools: op.subPlan = op.subPlan.subPlan return self.pushdownOperator(op) elif op.subPlan.operatorType() == "Select": # Move op below its subplan if op provides all attributes needed for the selectExpr selectAttrs = ExpressionInfo(op.subPlan.selectExpr).getAttributes() outputAttrs = set(op.projectExprs.keys()) result = op if selectAttrs.issubset(outputAttrs): result = op.subPlan op.subPlan = result.subPlan result.subPlan = self.pushdownOperator(op) return result elif op.subPlan.operatorType() == "Sort": # TODO return op elif op.subPlan.operatorType() == "UnionAll": # Place a copy of op on each side of the union result = op.subPlan result.lhsPlan = self.pushdownOperator( Project(result.lhsPlan, op.projectExprs)) result.rhsPlan = self.pushdownOperator( Project(result.rhsPlan, op.projectExprs)) return result elif "Join" in op.subPlan.operatorType(): # Partition the projections among the input relations, as much as possible lhsAttrs = set(op.subPlan.lhsPlan.schema().fields) rhsAttrs = set(op.subPlan.rhsPlan.schema().fields) lhsProjectExprs = {} rhsProjectExprs = {} remainingProjectExprs = False for attr in op.projectExprs: requiredAttrs = ExpressionInfo( op.projectExprs[attr][0]).getAttributes() if requiredAttrs.issubset(lhsAttrs): lhsProjectExprs[attr] = op.projectExprs[attr] elif requiredAttrs.issubset(rhsAttrs): rhsProjectExprs[attr] = op.projectExprs[attr] else: remainingProjectExprs = True if lhsProjectExprs: op.subPlan.lhsPlan = self.pushdownOperator( Project(op.subPlan.lhsPlan, lhsProjectExprs)) if rhsProjectExprs: op.subPlan.rhsPlan = self.pushdownOperator( Project(op.subPlan.rhsPlan, rhsProjectExprs)) result = op # Remove op from the tree if there are no remaining project expressions, and each side of the join recieved a projection if not remainingProjectExprs and lhsProjectExprs and rhsProjectExprs: result = op.subPlan return result else: print("Unmatched operatorType in pushdownOperator(): " + op.operatorType()) raise NotImplementedError
def pushdownSelect(self, op): # First pushdown operators below: op.subPlan = self.pushdownOperator(op.subPlan) if op.subPlan.operatorType() in ["GroupBy", "TableScan", "Project"]: return op elif op.subPlan.operatorType() == "Select": # Reorder two selects based on 'score' #useEstimated = True #opScore = (1.0 - op.selectivity(useEstimated)) / op.tupleCost #childScore = (1.0 - op.subPlan.selectivity(useEstimated)) / op.tupleCost result = op #if childScore > myScore: result = op.subPlan op.subPlan = result.subPlan result.subPlan = self.pushdownOperator(op) return result elif op.subPlan.operatorType() == "Sort": # Always move a select below a sort result = op.subPlan op.subPlan = result.subPlan result.subPlan = self.pushdownOperator(op) return result elif op.subPlan.operatorType() == "UnionAll": # Place a copy of op on each side of the union result = op.subPlan result.lhsPlan = self.pushdownOperator( Select(result.lhsPlan, op.selectExpr)) result.rhsPlan = self.pushdownOperator( Select(result.rhsPlan, op.selectExpr)) return result elif "Join" in op.subPlan.operatorType(): # Partition the select expr as much as possible exprs = ExpressionInfo(op.selectExpr).decomposeCNF() lhsExprs = [] rhsExprs = [] remainingExprs = [] lhsAttrs = set(op.subPlan.lhsPlan.schema().fields) rhsAttrs = set(op.subPlan.rhsPlan.schema().fields) for e in exprs: attrs = ExpressionInfo(e).getAttributes() if attrs.issubset(lhsAttrs): lhsExprs.append(e) elif attrs.issubset(rhsAttrs): rhsExprs.append(e) else: remainingExprs.append(e) if lhsExprs: newLhsExpr = ' and '.join(lhsExprs) lhsSelect = Select(op.subPlan.lhsPlan, newLhsExpr) op.subPlan.lhsPlan = self.pushdownOperator(lhsSelect) if rhsExprs: newRhsExpr = ' and '.join(rhsExprs) rhsSelect = Select(op.subPlan.rhsPlan, newRhsExpr) op.subPlan.rhsPlan = self.pushdownOperator(rhsSelect) result = None if remainingExprs: newExpr = ' and '.join(remainingExprs) result = Select(op.subPlan, newExpr) else: result = op.subPlan return result else: print("Unmatched operatorType in pushdownOperator(): " + op.operatorType()) raise NotImplementedError
def pushdownProjections(self, operator): if operator.operatorType() == "TableScan": return operator elif (operator.operatorType() == "Select" or operator.operatorType() == "GroupBy"): newSubPlan = self.pushdownProjections(operator.subPlan) operator.subPlan = newSubPlan return operator elif (operator.operatorType() == "UnionAll" or operator.operatorType()[-4:] == "Join"): newlPlan = self.pushdownProjections(operator.lhsPlan) newrPlan = self.pushdownProjections(operator.rhsPlan) operator.lhsPlan = newlPlan operator.rhsPlan = newrPlan return operator else: subPlan = operator.subPlan if subPlan.operatorType() == "TableScan": return operator elif subPlan.operatorType() == "Select": subSubPlan = subPlan.subPlan operator.subPlan = subSubPlan subPlan.subPlan = operator return self.pushdownProjections(subPlan) elif subPlan.operatorType() == "GroupBy": newSubSubPlan = self.pushdownProjections(subPlan.subPlan) subPlan.subPlan = newSubSubPlan return operator elif subPlan.operatorType() == "Project": # Note that here we need to combine two projections # We assume that the upper projection must be based on the outputschema # of the lower one; subRepExp = { k: v1 for (k, (v1, _)) in subPlan.projectExprs.items() } newExpr = dict() # Combine projections # TODO: Here we don't guarantee 100% success of replacement for (k, (v1, v2)) in operator.projectExprs.items(): newV1 = v1 for (key, value) in subRepExp.items(): newV1 = newV1.replace(key, value) newExpr[k] = (newV1, v2) # Reorder the projection operator operator.projectExprs = newExpr operator.outputSchema = DBSchema(operator.relationId(), \ [(k, v[1]) for (k,v) in operator.projectExprs.items()]) operator.subPlan = subPlan.subPlan return self.pushdownProjections(operator) elif subPlan.operatorType() == "UnionAll": # For Union operator, the push down is very simple subPlan.lhsPlan = Project(subPlan.lhsPlan, operator.projectExprs) subPlan.rhsPlan = Project(subPlan.rhsPlan, operator.projectExprs) subPlan.validateSchema() del operator return self.pushdownProjections(subPlan) else: # Here we deal with the Join Case # This is a lot harder than other cases # The first step is to collect input fields needed directly. # We grab out the fields in the projectExprs first # and then filter them with the project inputSchema fields = set() outputNames = [ k for (k, (v1, _)) in operator.projectExprs.items() ] inputNames = operator.inputSchemas()[0].fields lhsPlanNames = subPlan.lhsPlan.schema().fields rhsPlanNames = subPlan.rhsPlan.schema().fields for (k, (v1, _)) in operator.projectExprs.items(): attributes = ExpressionInfo(v1).getAttributes() # filter attributes for name in attributes: if name not in inputNames: attributes.remove(name) fields = fields.union(attributes) # collecting join condition fields; if subPlan.joinMethod == "nested-loops" or subPlan.joinMethod == "block-nested-loops": fields = fields.union( ExpressionInfo(subPlan.joinExpr).getAttributes()) elif subPlan.joinMethod == "hash": fields = fields.union( set(subPlan.lhsKeySchema.fields + subPlan.rhsKeySchema.fields)) else: # We don't support indexed raise NotImplementedError # constructing virtual l and r projections lprojectExpr = dict() rprojectExpr = dict() for (f, v) in subPlan.lhsPlan.schema().schema(): if f in fields: lprojectExpr[f] = (f, v) for (f, v) in subPlan.rhsPlan.schema().schema(): if f in fields: rprojectExpr[f] = (f, v) if len(lprojectExpr) != len(lhsPlanNames): subPlan.lhsPlan = Project(subPlan.lhsPlan, lprojectExpr) subPlan.lhsPlan.outputSchema = DBSchema(subPlan.lhsPlan.relationId(), \ [(k, v[1]) for (k,v) in subPlan.lhsPlan.projectExprs.items()]) if len(rprojectExpr) != len(rhsPlanNames): subPlan.rhsPlan = Project(subPlan.rhsPlan, rprojectExpr) subPlan.rhsPlan.outputSchema = DBSchema(subPlan.rhsPlan.relationId(), \ [(k, v[1]) for (k,v) in subPlan.rhsPlan.projectExprs.items()]) if subPlan.validateJoin(): subPlan.initializeSchema() # push down project through join operator.subPlan = self.pushdownProjections(subPlan) return operator
def selectPushDown(self, plan): root = plan.root selectResult = [] #New a stack and put info about op into it in the form of # (current op, parent op, accumulateSelect) queue = deque([(root, None, None)]) while queue: (curr, parent, accuSelect) = queue.popleft() children = curr.inputs() if children: #When dealing with Select, collect select expressions into accumulate select if isinstance(curr, Select): if not accuSelect: accuSelect = [] for decomp in ExpressionInfo(curr.selectExpr).decomposeCNF(): accuSelect.append(decomp) queue.extendleft([(children[0], curr, accuSelect)]) #Do not pushdown project at this point, so put it into result. #Accumulate select can always pass project elif isinstance(curr, Project): selectResult.append((curr, parent)) queue.extendleft([(children[0], curr, accuSelect)]) #When encountering a join, seperate the accumulate select expressions into three parts, #one part goes to left, one goes to right, and the remaining place above the join operator elif isinstance(curr, Join): leftSelect = [] rightSelect = [] newSelect = None leftFields = curr.lhsSchema.fields rightFields = curr.rhsSchema.fields put = [] if accuSelect: for a in accuSelect: f = ExpressionInfo(a).getAttributes() flag = False if set(f).issubset(set(leftFields)): leftSelect.append(a) flag = True if set(f).issubset(set(rightFields)): rightSelect.append(a) flag = True if not flag: put.append(a) if put: newSelect = self.placeSelect(put, curr, parent, selectResult) if newSelect: selectResult.append((curr, newSelect)) else: selectResult.append((curr, parent)) queue.extendleft([(curr.lhsPlan, curr, leftSelect)]) queue.extendleft([(curr.rhsPlan, curr, rightSelect)]) #When encounter groupby, place all the accumulate select elif isinstance(curr, GroupBy): newSelect = self.placeSelect(accuSelect, curr, parent, selectResult) if newSelect: selectResult.append((curr, newSelect)) else: selectResult.append((curr, parent)) queue.extendleft([(children[0], curr, None)]) #Deal with union similarly to join else: leftSelect = [] rightSelect = [] newSelect = None attrs = curr.unionSchema.fields put = [] if accuSelect: for a in accuSelect: f = ExpressionInfo(a).getAttributes() if set(f).issubset(set(attrs)): leftSelect.append(a) rightSelect.append(a) else: put.append(a) newSelect = self.placeSelect(accuSelect, curr, parent, selectResult) if newSelect: selectResult.append((curr, newSelect)) else: selectResult.append((curr, parent)) queue.extendleft([(curr.lhsPlan, curr, leftSelect)]) queue.extendleft([(curr.rhsPlan, curr, rightSelect)]) #Deal with tablescan, place all the accumulate select else: newSelect = self.placeSelect(accuSelect, curr, parent, selectResult) if newSelect: selectResult.append((curr, newSelect)) else: selectResult.append((curr, parent)) newRoot = selectResult[0][0] return Plan(root=newRoot)
def projectPushDown(self, plan): root = plan.root result = [] #Keep info in the form (current op, parent, accumulate Porject) queue = deque([(root, None, None)]) while queue: (curr, parent, accuProject) = queue.popleft() children = curr.inputs() if children: #Add current project into accumulate project if isinstance(curr, Project): if not accuProject: accuProject = curr.projectExprs else: accuProject.update({curr.projectExprs}) queue.extendleft([(children[0], curr, accuProject)]) elif isinstance(curr, Select): newProject = None if accuProject: selectAttrs = ExpressionInfo(curr.selectExpr).getAttributes() projectAttrs = self.getProjectAttrs(accuProject) newProject = Project(curr, accuProject) if set(selectAttrs).issubset(set(projectAttrs)): result.append((curr, parent)) queue.extendleft([(children[0], curr, accuProject)]) ''' #If considering the order of select and project: #Project can go through select #but if the selectivity of select is smaller, we do not let project pass curr.useSampling(sampled=True, sampleFactor=10.0) newProject.useSampling(sampled=True, sampleFactor=10.0) if curr.selectivity(estimated=True) < newProject.selectivity(estimated=True): result.append((newProject, parent)) result.append((curr, newProject)) queue.extendleft([(children[0], curr, None)]) else: result.append((curr, parent)) queue.extendleft([(children[0], curr, accuProject)]) ''' #If select operation has attributes that don't belongs to project #project has to stop here else: result.append((newProject, parent)) result.append((curr, newProject)) queue.extendleft([(children[0], curr, None)]) else: result.append((curr, parent)) queue.extendleft([(children[0], curr, accuProject)]) elif isinstance(curr, Join): #If we don't decompose project if accuProject: newProject = Project(curr, accuProject) result.append((newProject, parent)) result.append((curr, newProject)) else: result.append((curr, parent)) queue.extendleft([(curr.lhsPlan, curr, None)]) queue.extendleft([(curr.rhsPlan, curr, None)]) ''' #This part can be used to decompose project operation leftProject = {} rightProject = {} newProject = None leftFields = curr.lhsSchema.fields rightFields = curr.rhsSchema.fields put = {} if accuProject: projectAttrs = self.getProjectAttrs(accuProject) joinAttrs = ExpressionInfo(curr.joinExpr).getAttributes() if set(joinAttrs).issubset(set(projectAttrs)): for (k,v) in accuProject.items(): flag = False f = ExpressionInfo(k).getAttributes() if set(f).issubset(set(leftFields)): leftProject.update({k: v}) flag = True if set(f).issubset(set(rightFields)): rightProject.update({k: v}) flag = True if not flag: put.update({k: v}) if put: newProject = Project(curr, put) result.append((newProject, parent)) else: newProject = Project(curr, accuProject) result.append((newProject, parent)) if newProject: result.append((curr, newProject)) else: result.append((curr, parent)) queue.extendleft([(curr.lhsPlan, curr, leftProject)]) queue.extendleft([(curr.rhsPlan, curr, rightProject)]) ''' elif isinstance(curr, GroupBy): newProject = None if accuProject: newProject = Project(curr, accuProject) result.append((newProject, parent)) if newProject: result.append((curr, newProject)) else: result.append((curr, parent)) queue.extendleft([(children[0], curr, None)]) else: #If we don't decompose project if accuProject: newProject = Project(curr, accuProject) result.append((newProject, parent)) result.append((curr, newProject)) else: result.append((curr, parent)) queue.extendleft([(curr.lhsPlan, curr, None)]) queue.extendleft([(curr.rhsPlan, curr, None)]) ''' #This part can be used to decompose project leftProject = {} rightProject = {} newProject = None attrs = curr.unionSchema.fields put = {} if accuProject: projectAttrs = self.getProjectAttrs(accuProject) if set(attrs).issubset(set(projectAttrs)): leftProject = accuProject rightProject = accuProject else: newProject = Project(curr, accuProject) result.append((newProject, parent)) if newProject: result.append((curr, newProject)) else: result.append((curr, parent)) queue.extendleft([(curr.lhsPlan, curr, leftProject)]) queue.extendleft([(curr.rhsPlan, curr, rightProject)]) ''' else: newProject = None if accuProject: newProject = Project(curr, accuProject) if newProject: result.append((newProject, parent)) result.append((curr, newProject)) else: result.append((curr, parent)) newRoot = result[0][0] return Plan(root=newRoot)
def pushdownSelect(self, op): # First pushdown operators below: op.subPlan = self.pushdownOperator(op.subPlan) if op.subPlan.operatorType() in ["GroupBy", "TableScan", "Project"]: return op elif op.subPlan.operatorType() == "Select": # Reorder two selects based on 'score' useEstimated = True opScore = (1 - op.selectivity(useEstimated)) / op.tupleCost childScore = (1 - op.subPlan.selectivity(useEstimated)) / op.tupleCost result = op if childScore > opScore: result = op.subPlan op.subPlan = result.subPlan result.subPlan = self.pushdownOperator(op) return result elif op.subPlan.operatorType() == "Sort": # Always move a select below a sort result = op.subPlan op.subPlan = result.subPlan result.subPlan = self.pushdownOperator(op) return result elif op.subPlan.operatorType() == "UnionAll": # Place a copy of op on each side of the union result = op.subPlan result.lhsPlan = self.pushdownOperator(Select(result.lhsPlan, op.selectExpr)) result.rhsPlan = self.pushdownOperator(Select(result.rhsPlan, op.selectExpr)) return result elif "Join" in op.subPlan.operatorType(): # Partition the select expr as much as possible exprs = ExpressionInfo(op.selectExpr).decomposeCNF() lhsExprs = [] rhsExprs = [] remainingExprs = [] lhsAttrs = set(op.subPlan.lhsPlan.schema().fields) rhsAttrs = set(op.subPlan.rhsPlan.schema().fields) for e in exprs: attrs = ExpressionInfo(e).getAttributes() if attrs.issubset(lhsAttrs): lhsExprs.append(e) elif attrs.issubset(rhsAttrs): rhsExprs.append(e) else: remainingExprs.append(e) if lhsExprs: newLhsExpr = ' and '.join(lhsExprs) lhsSelect = Select(op.subPlan.lhsPlan, newLhsExpr) op.subPlan.lhsPlan = self.pushdownOperator(lhsSelect) if rhsExprs: newRhsExpr = ' and '.join(rhsExprs) rhsSelect = Select(op.subPlan.rhsPlan, newRhsExpr) op.subPlan.rhsPlan = self.pushdownOperator(rhsSelect) result = None if remainingExprs: newExpr = ' and '.join(remainingExprs) result = Select(op.subPlan, newExpr) else: result = op.subPlan return result else: print("Unmatched operatorType in pushdownOperator(): " + op.operatorType()) raise NotImplementedError
def pushdownSelections(self, operator): if operator.operatorType() == "TableScan": return operator elif (operator.operatorType() == "Project" or operator.operatorType() == "GroupBy"): newSubPlan = self.pushdownSelections(operator.subPlan) operator.subPlan = newSubPlan return operator elif (operator.operatorType() == "UnionAll" or operator.operatorType()[-4:] == "Join"): newlPlan = self.pushdownSelections(operator.lhsPlan) newrPlan = self.pushdownSelections(operator.rhsPlan) operator.lhsPlan = newlPlan operator.rhsPlan = newrPlan return operator else: # Here we deal with the Select Case # This is a lot harder than other cases subPlan = operator.subPlan # trivial case if subPlan.operatorType() == "TableScan": return operator # In this case we need to combine two selections elif subPlan.operatorType() == "Select": operator.selectExpr = "(" + operator.selectExpr + ")" + " and " + "(" + subPlan.selectExpr + ")" operator.subPlan = subPlan.subPlan del subPlan return self.pushdownSelections(operator) # We don't have to move selections through groupby since # groupby may create new field names elif subPlan.operatorType() == "GroupBy": newSubSubPlan = self.pushdownSelections(subPlan.subPlan) subPlan.subPlan = newSubSubPlan return operator elif subPlan.operatorType() == "UnionAll": subPlan.lhsPlan = Select(subPlan.lhsPlan, operator.selectExpr) subPlan.rhsPlan = Select(subPlan.rhsPlan, operator.selectExpr) subPlan.validateSchema() del operator return self.pushdownSelections(subPlan) # Some tricky behavior here. # We substitute all some tokens in selectExpr by the projectExpr. # However, here we only support some easy computations. We cannot # exhaustively test all the cases (all the math exprs) elif subPlan.operatorType() == "Project": selectExpr = operator.selectExpr for (k, (v1, _)) in subPlan.projectExprs.items(): selectExpr = selectExpr.replace(k, "(" + v1 + ")") operator.subPlan = subPlan.subPlan subPlan.subPlan = operator return self.pushdownSelections(subPlan) else: # Here we move the selections down to the Join Operator lhsPlanNames = subPlan.lhsPlan.schema().fields rhsPlanNames = subPlan.rhsPlan.schema().fields cnfExprList = ExpressionInfo( operator.selectExpr).decomposeCNF() lhsSelectExpr = "" rhsSelectExpr = "" remSelectExpr = "" for expr in cnfExprList: attributes = [] # filter attributes for var in ExpressionInfo(expr).getAttributes(): if (var in lhsPlanNames): attributes.append(var) if (var in rhsPlanNames): attributes.append(var) if self.isSubList(attributes, lhsPlanNames): if lhsSelectExpr == "": lhsSelectExpr += "(" + expr + ")" else: lhsSelectExpr += " and " + "(" + expr + ")" elif self.isSubList(attributes, rhsPlanNames): if rhsSelectExpr == "": rhsSelectExpr += "(" + expr + ")" else: rhsSelectExpr += " and " + "(" + expr + ")" else: if remSelectExpr == "": remSelectExpr += "(" + expr + ")" else: remSelectExpr += " and " + "(" + expr + ")" # push down selections if remSelectExpr == "": # A case that the selection all comes from lhsPlan if (lhsSelectExpr != "" and rhsSelectExpr == ""): operator.subPlan = subPlan.lhsPlan operator.selectExpr = lhsSelectExpr subPlan.lhsPlan = operator elif (rhsSelectExpr != "" and lhsSelectExpr == ""): operator.subPlan = subPlan.rhsPlan operator.selectExpr = rhsSelectExpr subPlan.rhsPlan = operator else: subPlan.lhsPlan = Select(subPlan.lhsPlan, lhsSelectExpr) subPlan.rhsPlan = Select(subPlan.rhsPlan, rhsSelectExpr) del operator return self.pushdownSelections(subPlan) else: operator.selectExpr = remSelectExpr if (lhsSelectExpr != "" and rhsSelectExpr == ""): subPlan.lhsPlan = Select(subPlan.lhsPlan, lhsSelectExpr) elif (rhsSelectExpr != "" and lhsSelectExpr == ""): subPlan.rhsPlan = Select(subPlan.rhsPlan, rhsSelectExpr) else: subPlan.lhsPlan = Select(subPlan.lhsPlan, lhsSelectExpr) subPlan.rhsPlan = Select(subPlan.rhsPlan, rhsSelectExpr) if subPlan.validateJoin(): subPlan.initializeSchema() operator.subPlan = self.pushdownSelections(subPlan) return operator
def pushdownProject(self, op): # First pushdown operators below: op.subPlan = self.pushdownOperator(op.subPlan) if op.subPlan.operatorType() in ["GroupBy", "TableScan"]: return op elif op.subPlan.operatorType() == "Project": # Attempt to remove redundant projections: bools = [op.subPlan.projectExprs[key][0].isAttribute() for key in op.projectExprs] if False not in bools: op.subPlan = op.subPlan.subPlan return self.pushdownOperator(op) elif op.subPlan.operatorType() == "Select": # Move op below its subplan if op provides all attributes needed for the selectExpr selectAttrs = ExpressionInfo(op.subPlan.selectExpr).getAttributes() outputAttrs = set(op.projectExprs.keys()) result = op if selectAttrs.issubset(outputAttrs): result = op.subPlan op.subPlan = result.subPlan result.subPlan = self.pushdownOperator(op) return result elif op.subPlan.operatorType() == "Sort": return op elif op.subPlan.operatorType() == "UnionAll": # Place a copy of op on each side of the union result = op.subPlan result.lhsPlan = self.pushdownOperator(Project(result.lhsPlan, op.projectExprs)) result.rhsPlan = self.pushdownOperator(Project(result.rhsPlan, op.projectExprs)) return result elif "Join" in op.subPlan.operatorType(): # Partition the projections among the input relations, as much as possible lhsAttrs = set(op.subPlan.lhsPlan.schema().fields) rhsAttrs = set(op.subPlan.rhsPlan.schema().fields) lhsProjectExprs = {} rhsProjectExprs = {} remainingProjectExprs = False for attr in op.projectExprs: requiredAttrs = ExpressionInfo(op.projectExprs[attr][0]).getAttributes() if requiredAttrs.issubset(lhsAttrs): lhsProjectExprs[attr] = op.projectExprs[attr] elif requiredAttrs.issubset(rhsAttrs): rhsProjectExprs[attr] = op.projectExprs[attr] else: remainingProjectExprs = True if lhsProjectExprs: op.subPlan.lhsPlan = self.pushdownOperator(Project(op.subPlan.lhsPlan, lhsProjectExprs)) if rhsProjectExprs: op.subPlan.rhsPlan = self.pushdownOperator(Project(op.subPlan.rhsPlan, rhsProjectExprs)) result = op # Remove op from the tree if there are no remaining project expressions, and each side of the join recieved a projection if not remainingProjectExprs and lhsProjectExprs and rhsProjectExprs: result = op.subPlan return result else: print("Unmatched operatorType in pushdownOperator(): " + op.operatorType()) raise NotImplementedError
def pushdownHelper(self, operator): #first determine operator type opertorType = operator.operatorType() #first check if valid operatorType if operatorType != "Project" and operatorType != "Select" and operatorType != "GroupBy" and operatorType != "Sort" and operatorType != "UnionAll" and operatorType[-4:] != "Join": return operator elif operatorType == "Project": operator.subPlan = self.pushdownHelper(operator.subPlan) subplanType = operator.subPlan.operatorType() #call second helper if subplanType == "Select": ''' Check keys - if not in keys, cannot pushdown anymore ''' for select in ExpressionInfo(operator.subPlan.selectExpr).getAttributes(): keys = operator.projectExprs.keys() if select not in keys: return operator operator.subPlan = operator.subPlan.subPlan operator.subPlan.subPlan = self.pushdownHelper(operator) elif subplanType[-4:] == "Join": items = operator.projectExprs.items() right = operator.subPlan.rhsPlan.schema().fields rightProject = {} left = operator.subPlan.lhsPlan.schema().fields leftProject = {} for (attribute, (expr, rand)) in items: pros = ExpressionInfo(expr)getAttributes() result = True #left for e in pros: if e not in left: result = False # if True if result: leftProject[attribute] = operator.projectExprs[attribute] continue #repeat with right now result = True for e in pros: if e not in right: result = False if result: rightProject[attribute] = operator.projectExprs[attribute] #end for #if left dictionary not empty #remember empty dic evaluates to false if leftProject: lPlan = operator.subPlan.lhsPlan operator.subPlan.lhsPlan = self.pushdownHelper(Project(lPlan, leftProject)) if rightProject: rPlan = operator.subPlan.rhsPlan operator.subPlan.rhsPlan = self.pushdownHelper(Project(rPlan, rightProject)) #length check - must be same size iIOT pushdown fullSize = len(operator.projectExprs) rightSize = len(rightProject) leftSize = len(leftProject) if fullSize != (rightSize + leftSize): return operator #end subPlan "Join" elif subplanType == "UnionAll": tempLeft = Project(operator.subPlan.lhsPlan) tempRight = Project(operator.subPlan.rhsPlan) operator.subPlan.lhsPlan = self.pushdownHelper(tempLeft, operator.projectExprs) operator.subPlan.rhsPlan = self.pushdownHelper(tempRight, operator.projectExprs) #else not Join or Union else: return operator return operator.subPlan #end "Project" #safety check above, so operatorType must be "Select" elif operatorType == "Select": #first part same as with "Project": subPlan pushdown operator.subPlan = self.pushdownHelper(operator.subPlan) subplanType = operator.subPlan.operatorType() if subplanType == "Sort" or "sort": operator.subPlan = operator.subPlan.subPlan operator.subPlan.subPlan = self.pushdownHelper(operator) elif subplanType[-4:] == "Join": selectExpress = ExpressionInfo(operator.selectExpr).decomposeCNF() left = operator.subPlan.lhsPlan.schema().fields right = operator.subPlan.rhsPlan.schema().fields leftExpress = [] leftAttributes = set(operator.subPlan.lhsPlan.schema().fields) rightAttributes = set(operator.subPlan.rhsPlan.schema().fields) rightExpress = [] unpushedExpress = [] for expr in selectExpress: select = ExpressionInfo(selectExpr).getAttributes() if select.issubset(leftAttributes): left.append(select) elif select.issubset(rightAttributes): right.append(select) else: unpushedExpress.append(select) if leftExpress: newExpression = ' and '.join(leftExpress) #lSelect op.subPlan.lhsPlan = self.pushdownHelper(Select(operator.subPlan.lhsPlan, newExpression)) if rightExpress: newExpression = ' and '.join(rightExpress) op.subPlan.rhsPlan = self.pushdownHelper(Select(operator.subPlan.rhsPlan, newExpression)) if unpushedExpress: return Select(operator.subPlan, ' and '.join(unpushedExpress)) else: return operator return operator.subPlan elif operatorType == "UnionAll" or operatorType[-4:] == "Join": operator.lhsPlan = self.pushdownHelper(operator.lhsPlan) operator.rhsPlan = self.pushdownHelper(operator.rhsPlan) return operator elif operatorType == "GroupBy" or operatorType == "Sort": operator.subPlan = self.pushdownHelper(operator.subPlan) return operator