def pickJoinOrder(self, plan):
    """Pick a join order for ``plan`` by dynamic programming over relation subsets.

    The inputs of ``plan.root`` seed the table as single-relation plans. For
    every larger subset, the candidates pair the best plan for "subset minus
    one relation" with that one relation, and ``self.get_best_join`` picks
    among them.

    Args:
        plan: Plan to reorder; only ``plan.root.inputs()`` and
            ``plan.root.joins()`` are read.

    Returns:
        A new ``Plan`` that has been prepared against ``self.db``.
    """
    # Base relations, together with whatever plan.root.inputs() places on top of them.
    base_relations = set(plan.root.inputs())

    # Joins of the original plan; forwarded unchanged to get_best_join.
    joins = plan.root.joins()

    # Dynamic-programming table: frozenset of relations -> best operator.
    optimal_plans = {
        frozenset((relation,)): relation for relation in base_relations
    }

    for size in range(2, len(base_relations) + 1):
        for subset in itertools.combinations(base_relations, size):
            candidate_joins = set()
            for candidate_relation in subset:
                candidate_joins.add((
                    optimal_plans[frozenset(tuple_without(subset, candidate_relation))],
                    optimal_plans[frozenset((candidate_relation,))],
                ))
            optimal_plans[frozenset(subset)] = self.get_best_join(candidate_joins, joins)

    final_plan = Plan(root=optimal_plans[frozenset(base_relations)])
    final_plan.prepare(self.db)
    # The previous version rebound the local name `plan` here; that had no
    # effect on the caller and has been removed.
    return final_plan
def pickJoinOrder(self, plan):
    """Choose a join order for ``plan`` with a subset-based dynamic program.

    Reads ``plan.sources`` (the relations to combine) and ``plan.joins`` (the
    joins handed to ``self.get_best_join``), and returns a new ``Plan``
    prepared against ``self.db``.
    """
    sources = set(plan.sources)
    needed_joins = set(plan.joins)

    # best[frozenset of sources] -> operator chosen for that subset.
    best = {}
    for source in sources:
        best[frozenset((source,))] = source

    for width in range(2, len(sources) + 1):
        for group in itertools.combinations(sources, width):
            # One candidate per member: (best plan without it, the member itself).
            pairs = {
                (
                    best[frozenset(tuple_without(group, member))],
                    best[frozenset((member,))],
                )
                for member in group
            }
            best[frozenset(group)] = self.get_best_join(pairs, needed_joins)

    reordered = Plan(root=best[frozenset(sources)])
    reordered.prepare(self.db)
    return reordered
def pickJoinOrder(self, plan):
    """Build a join plan bottom-up over subsets of ``plan.relations()``.

    Every relation starts as a prepared ``TableScan``. Each ``Join`` found in
    ``plan.flatten()`` is indexed by the relations that
    ``self.relativeRelations`` reports for it. Larger subsets are then built by
    joining the stored plan for "subset minus one relation" with that
    relation, using block-nested-loops and the indexed join's expression, and
    keeping the cheaper alternative by estimated cost.

    Side effects:
        Resets and updates ``self.combsTried`` and ``self.plansProcessed``,
        and records costs through ``self.addPlanCost``.

    Returns:
        ``Plan`` rooted at the operator stored for the full relation set.

    Raises:
        KeyError: If no plan was produced for the full relation set.
    """
    rels = set(plan.relations())
    optPlans = {}  # frozenset of relations -> best operator found so far
    self.combsTried = 0
    self.plansProcessed = 0

    for r in rels:
        newScan = TableScan(r, self.db.relationSchema(r))
        newScan.prepare(self.db)
        optPlans[frozenset({r})] = newScan

    # frozenset({relation}) -> [(relations of the join, join operator), ...]
    joinMap = {}
    for (_, op) in plan.flatten():
        if isinstance(op, Join):
            relativeR = self.relativeRelations(rels, op)
            for r in relativeR:
                joinMap.setdefault(frozenset({r}), []).append((relativeR, op))

    n = len(rels)
    for i in range(2, n + 1):
        for union in [frozenset(union) for union in self.kRelsComb(i, rels)]:
            for right in [frozenset(right) for right in self.kRelsComb(1, union)]:
                left = frozenset(union - right)
                for t in left:
                    self.combsTried += 1
                    # A relation that takes part in no join has no entry. The
                    # old direct indexing raised KeyError there, which made
                    # the intended "skip when nothing to try" path unreachable.
                    for (joinRels, joinOp) in joinMap.get(frozenset({t}), ()):
                        usable = (set(joinRels).issubset(union)
                                  and left in optPlans
                                  and right in optPlans)
                        if not usable:
                            continue
                        self.plansProcessed += 1
                        newJoin = Join(optPlans[left], optPlans[right],
                                       expr=joinOp.joinExpr,
                                       method="block-nested-loops")
                        newJoin.prepare(self.db)
                        if union not in optPlans:
                            optPlans[union] = newJoin
                            self.addPlanCost(newJoin, newJoin.cost(estimated=True))
                        else:
                            formerCost = self.getPlanCost(optPlans[union])
                            if newJoin.cost(estimated=True) < formerCost:
                                optPlans[union] = newJoin
                                self.addPlanCost(newJoin, newJoin.cost(estimated=True))

    newRoot = optPlans[frozenset(rels)]
    return Plan(root=newRoot)
# NOTE(review): the source line ended with a bare ''' token after this return.
# It is not part of the function and is left out here; confirm that no later
# text in the file relied on it.
def allAccessPaths(self, operator):
    """Return the join inputs under ``operator`` that ``self.isUnaryPath`` accepts.

    Every operator in the flattened plan whose ``operatorType()`` ends in
    ``"Join"`` contributes its ``lhsPlan`` and then its ``rhsPlan``; the
    result keeps that order.
    """
    join_ops = []
    for (_, op) in Plan(root=operator).flatten():
        if op.operatorType()[-4:] == "Join":
            join_ops.append(op)

    sides = [side for op in join_ops for side in (op.lhsPlan, op.rhsPlan)]
    return list(filter(self.isUnaryPath, sides))
def pickJoinOrder(self, plan):
    """Grow one join plan greedily, one ``self.processJoin`` step per size.

    Size 2 tries every table scan as the starting point and keeps the
    cheapest result. Each later size extends only the plan kept from the
    previous size.

    Side effects:
        Resets ``self.combsTried`` / ``self.plansProcessed`` and rebuilds
        ``self.rels``, ``self.tableScans`` and ``self.joinMap``.

    Returns:
        ``Plan`` rooted at the plan kept by the last size (``None`` root when
        no size was processed or the last step produced no improvement over
        infinity).
    """
    self.combsTried = 0
    self.plansProcessed = 0
    self.rels = set(plan.relations())

    # One prepared scan per relation, keyed by the singleton relation set.
    self.tableScans = {}
    for rel in self.rels:
        scan = TableScan(rel, self.db.relationSchema(rel))
        scan.prepare(self.db)
        self.tableScans[frozenset({rel})] = scan

    # Singleton relation set -> [(relations of the join, join operator), ...]
    self.joinMap = {}
    for (_, op) in plan.flatten():
        if not isinstance(op, Join):
            continue
        involved = self.relativeRelations(self.rels, op)
        for key in [frozenset({rel}) for rel in involved]:
            self.joinMap.setdefault(key, []).append((involved, op))

    keptPlan = None
    keptRels = None
    for size in range(2, len(self.rels) + 1):
        if size == 2:
            seedKeys = [frozenset({rel}) for rel in self.rels]
            # Lazy on purpose: each scan is looked up right before its step.
            seeds = ((self.tableScans[key], key) for key in seedKeys)
        else:
            seeds = iter([(keptPlan, keptRels)])

        roundCost = float('inf')
        roundPlan = None
        roundRels = None
        for (seedPlan, seedRels) in seeds:
            (newCost, newJoin, newRels) = self.processJoin(seedPlan, seedRels)
            if newCost < roundCost:
                roundRels = newRels
                roundPlan = newJoin
                roundCost = newCost

        keptPlan = roundPlan
        keptRels = roundRels

    return Plan(root=keptPlan)
def get_best_join(self, candidates, required_joins):
    """Return the cheapest join operator among ``candidates``.

    For each ``(left, right)`` pair a join expression is taken from the first
    entry of ``required_joins`` whose attributes touch the schemas of both
    ``left`` and ``right``. When none does, the predicate ``'True'`` is used,
    i.e. a cartesian product. Each listed join algorithm is then prepared,
    sampled and costed, and the overall minimum is kept.

    Args:
        candidates: Iterable of ``(left, right)`` operator pairs.
        required_joins: Join operators of the original plan; only their
            ``joinExpr`` is read.

    Returns:
        The root operator of the lowest-cost plan tried (the operator rather
        than the ``Plan``, because it goes back into the caller's table).

    Raises:
        AttributeError: If ``candidates`` is empty, since no plan was built.
    """
    best_plan_cost = None
    best_plan = None

    for left, right in candidates:
        # Match against the candidate's own sides. The previous check used
        # each required join's own lhs/rhs schemas, which never looks at the
        # candidate at all.
        # NOTE(review): relies on operators exposing schema().fields — confirm.
        left_fields = set(left.schema().fields)
        right_fields = set(right.schema().fields)

        relevant_expr = 'True'  # cartesian product unless a join applies
        for join in required_joins:
            names = ExpressionInfo(join.joinExpr).getAttributes()
            if left_fields.intersection(names) and right_fields.intersection(names):
                relevant_expr = join.joinExpr
                break

        # TODO: Evaluate more than just nested loop joins, and determine
        # feasibility of those methods.
        for algorithm in ["nested-loops", "block-nested-loops"]:
            test_plan = Plan(root=Join(
                lhsPlan=left,
                rhsPlan=right,
                method=algorithm,
                expr=relevant_expr,
            ))

            # Prepare and run in sampling mode to obtain the estimated cost.
            test_plan.prepare(self.db)
            test_plan.sample(1.0)
            cost = test_plan.cost(estimated=True)

            if best_plan_cost is None or cost < best_plan_cost:
                best_plan_cost = cost
                best_plan = test_plan

    return best_plan.root
def get_best_join(self, candidates, required_joins):
    """Return the cheapest join operator among ``candidates``.

    For each ``(left, right)`` pair a join expression is taken from the first
    entry of ``required_joins`` whose attributes touch the schemas of both
    ``left`` and ``right``. When none does, the predicate ``'True'`` is used,
    i.e. a cartesian product. Nested-loops, block-nested-loops and hash joins
    are each prepared, sampled and costed, and the overall minimum is kept.

    Args:
        candidates: Iterable of ``(left, right)`` operator pairs.
        required_joins: Join operators of the original plan; only their
            ``joinExpr`` is read.

    Returns:
        The root operator of the lowest-cost plan tried (the operator rather
        than the ``Plan``, because it goes back into the caller's table).

    Raises:
        AttributeError: If ``candidates`` is empty, since no plan was built.
    """
    best_plan_cost = None
    best_plan = None

    for left, right in candidates:
        # Match against the candidate's own sides. The previous check used
        # each required join's own lhs/rhs schemas, which never looks at the
        # candidate at all.
        # NOTE(review): relies on operators exposing schema().fields — confirm.
        left_fields = set(left.schema().fields)
        right_fields = set(right.schema().fields)

        relevant_expr = 'True'  # cartesian product unless a join applies
        for join in required_joins:
            names = ExpressionInfo(join.joinExpr).getAttributes()
            if left_fields.intersection(names) and right_fields.intersection(names):
                relevant_expr = join.joinExpr
                break

        # TODO: Evaluate more than just nested loop joins, and determine
        # feasibility of those methods.
        # NOTE(review): the "hash" variant is built with only `expr`, without
        # hash functions or key schemas — confirm Join accepts that.
        for algorithm in ["nested-loops", "block-nested-loops", "hash"]:
            test_plan = Plan(root=Join(
                lhsPlan=left,
                rhsPlan=right,
                method=algorithm,
                expr=relevant_expr,
            ))

            # Prepare and run in sampling mode to obtain the estimated cost.
            test_plan.prepare(self.db)
            test_plan.sample(5.0)
            cost = test_plan.cost(estimated=True)

            if best_plan_cost is None or cost < best_plan_cost:
                best_plan_cost = cost
                best_plan = test_plan

    return best_plan.root
def pickJoinOrder(self, plan):
    """Reorder the joins of ``plan`` with a subset-based dynamic program.

    ``self.optimizerSetup(plan)`` supplies:
      * ``joins``: the joins of the plan,
      * ``tableIDs``: ids of the operators sitting on top of each table scan
        (or the scan itself),
      * ``optimalSubPlans``: frozenset of ids -> best operator so far,
      * ``fields``: frozenset of one id -> its fields,
      * ``nubPlan``: the operator whose ``subPlan`` receives the result.

    For every subset of ids, each member is tried as the right-hand side
    against the stored plan for the rest. Hash, nested-loops and
    block-nested-loops joins are sampled and the cheapest is recorded.

    Returns:
        ``plan`` itself, re-prepared (returned untouched when it has no joins).
    """
    joins, tableIDs, optimalSubPlans, fields, nubPlan = self.optimizerSetup(plan)

    if len(joins) == 0:
        return plan

    for numTables in range(2, len(tableIDs) + 1):
        print('NumTables: ', numTables)

        for joinOrdering in itertools.combinations(tableIDs, numTables):
            bestCost = 1e99
            bestPlan = None

            for rhsID in joinOrdering:
                # Everything except rhsID forms the left-hand side.
                lhsIDs = list(joinOrdering)
                lhsIDs.remove(rhsID)
                lhsKey = frozenset(lhsIDs)
                rhsKey = frozenset([rhsID])

                cachedLHS = optimalSubPlans.get(lhsKey)
                cachedRHS = optimalSubPlans[rhsKey]
                if cachedLHS is None or cachedRHS is None:
                    continue

                # All attributes available to this join.
                allAttributes = []
                for lhsID in lhsIDs:
                    allAttributes.extend(fields[frozenset([lhsID])])
                allAttributes.extend(fields[rhsKey])

                contains, planDict = self.checkJoins(
                    joins, allAttributes, cachedLHS, cachedRHS)
                if not contains:
                    continue

                for joinMethod in ["hash", "nested-loops", "block-nested-loops"]:
                    if joinMethod == "hash":
                        lhsHashFn = planDict['lhsHashFn']
                        rhsHashFn = planDict['rhsHashFn']
                        lhsKeySchema = planDict['lhsKeySchema']
                        rhsKeySchema = planDict['rhsKeySchema']
                        joinOp = Join(method=joinMethod,
                                      lhsPlan=cachedLHS,
                                      lhsHashFn=lhsHashFn,
                                      lhsKeySchema=lhsKeySchema,
                                      rhsPlan=cachedRHS,
                                      rhsHashFn=rhsHashFn,
                                      rhsKeySchema=rhsKeySchema)
                    else:
                        joinExpr = planDict['joinExpr']
                        joinOp = Join(lhsPlan=cachedLHS,
                                      rhsPlan=cachedRHS,
                                      method=joinMethod,
                                      expr=joinExpr)

                    tryPlan = Plan(root=joinOp)
                    self.checkPlan = tryPlan
                    tryPlan.prepare(self.db)
                    tryPlan.sample(1.0)
                    cost = tryPlan.cost(estimated=True)

                    if cost < bestCost:
                        bestCost = cost
                        bestPlan = tryPlan

            newKey = frozenset(joinOrdering)
            self.addPlanCost(newKey, bestCost, bestPlan)
            optimalSubPlans[newKey] = None if bestPlan is None else bestPlan.root

    nubPlan.subPlan = self.statsCache[frozenset(tableIDs)][1].root
    plan.prepare(self.db)
    return plan
def pushdownOperators(self, plan):
    """Wrap the result of ``self.singlePushDown(plan.root)`` in a new ``Plan``."""
    return Plan(root=self.singlePushDown(plan.root))
def pushdownOperators(self, plan):
    """Return a new ``Plan`` rooted at ``self.pushdownOperator(plan.root)``."""
    pushed_root = self.pushdownOperator(plan.root)
    return Plan(root=pushed_root)
def joinsOptimizer(self, operator, aPaths):
    """Greedily combine ``aPaths`` into one hash-join plan.

    Each round enumerates every pair of the current plans, prices a pair as
    ``3 * (pages_a + pages_b) + total_a + total_b``, and replaces the cheapest
    pair accepted by ``self.joinable`` with a hash join of the two.

    Args:
        operator: Passed to ``self.joinable`` to decide whether a pair can be
            joined and on which fields.
        aPaths: Access-path operators to combine.

    Returns:
        The first plan left in the working list after ``len(aPaths) - 1``
        rounds.

    Side effects:
        Increments ``self.pcntr`` once per enumerated pair and prints it.
    """
    defaultScaleFactor = 10
    defaultPartiNumber = 5

    n = len(aPaths)
    planList = []
    costList = []  # parallel to planList: (output pages, accumulated cost)

    # Round 1: the access paths themselves. Cost is measured in pages, and
    # only join reordering is priced, so their accumulated cost is 0.
    for aPath in aPaths:
        cards = Plan(root=aPath).sample(defaultScaleFactor)
        pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
        numPages = cards / (pageSize / aPath.schema().size)
        planList.append(aPath)
        costList.append((numPages, 0))

    # Rounds 2..n: merge one pair per round.
    for i in range(1, n):
        m = len(planList)
        potentialP = []
        potentialC = []  # parallel to potentialP
        for j in range(0, m - 1):
            for k in range(j + 1, m):
                self.pcntr += 1
                potentialP.append((planList[j], planList[k]))
                potentialC.append(3 * (costList[j][0] + costList[k][0])
                                  + costList[j][1] + costList[k][1])

        # Try pairs from cheapest to most expensive until one is joinable.
        while potentialC:
            currC = min(potentialC)
            pos = potentialC.index(currC)
            currP = potentialP[pos]
            # Drop by position so both lists stay aligned, whatever equality
            # the operators define.
            del potentialC[pos]
            del potentialP[pos]

            # Evaluated once; the result is both the test and the field pair.
            joinFields = self.joinable(operator, currP)
            if not joinFields:
                continue
            (lField, rField) = joinFields

            lhsSchema = currP[0].schema()
            rhsSchema = currP[1].schema()
            lKeySchema = DBSchema(
                'left',
                [(f, t) for (f, t) in lhsSchema.schema() if f == lField])
            rKeySchema = DBSchema(
                'right',
                [(f, t) for (f, t) in rhsSchema.schema() if f == rField])
            lHashFn = 'hash(' + lField + ') % ' + str(defaultPartiNumber)
            rHashFn = 'hash(' + rField + ') % ' + str(defaultPartiNumber)
            newJoin = Join(currP[0], currP[1], method='hash',
                           lhsHashFn=lHashFn, lhsKeySchema=lKeySchema,
                           rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)
            newJoin.prepare(self.db)

            totalCost = currC
            cards = Plan(root=newJoin).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(newJoin.relationId())
            pages = cards / (pageSize / newJoin.schema().size)

            # The second index is looked up after the first pop, and costList
            # is popped in the same order, so the positions stay consistent.
            id1 = planList.index(currP[0])
            planList.pop(id1)
            id2 = planList.index(currP[1])
            planList.pop(id2)
            planList.append(newJoin)
            costList.pop(id1)
            costList.pop(id2)
            costList.append((pages, totalCost))
            break

    print("GreedyOptimizer plan considered: ", self.pcntr)
    return planList[0]
def pickJoinOrder(self, plan):
    """Choose a join order pass by pass over growing relation subsets.

    ``self.getExprDicts`` yields two dicts mapping relation groups to the join
    expressions and select expressions involving them. Pass 1 stores one scan
    (wrapped in a ``Select`` when select expressions exist for it) per
    relation. Each later pass tries, for every combination, every split
    returned by ``self.getCombos``, as block-nested-loops and as nested-loops,
    and keeps the cheapest plan by ``cost(True)``.

    A ``GroupBy`` at the root of the input plan is re-applied on top of the
    result, and a ``Project`` is added when the output schema would otherwise
    differ from the original.

    Side effects:
        Resets and updates ``self.reportPlanCount``; calls
        ``self.clearSampleFiles()``.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    optDict = {}
    self.reportPlanCount = 0

    # Pass 1: single relations.
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        if (r,) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r,)])
            optDict[(r,)] = Plan(root=Select(table, selectString))
        else:
            optDict[(r,)] = Plan(root=table)
        self.reportPlanCount += 1

    # Passes 2..n: combinations of npass relations.
    for npass in range(2, len(relations) + 1):
        for c in itertools.combinations(relations, npass):
            fullList = sorted(c)
            clist = self.getCombos(fullList)
            bestJoin = None
            for subcombo in clist:
                complement = self.getComplement(fullList, subcombo)
                leftOps = optDict[tuple(complement)].root
                rightOps = optDict[tuple(subcombo)].root
                selectExpr = self.createExpression(complement, subcombo, selectTablesDict)
                joinExpr = self.createExpression(complement, subcombo, joinTablesDict)

                sampled = []
                for method in ("block-nested-loops", "nested-loops"):
                    joinOp = Join(leftOps, rightOps, expr=joinExpr, method=method)
                    filteredOp = Select(joinOp, selectExpr)
                    candidate = Plan(root=joinOp if selectExpr == "True" else filteredOp)
                    candidate.prepare(self.db)
                    candidate.sample(100)
                    sampled.append(candidate)
                joinBnlj, joinNlj = sampled

                # Ties go to nested-loops.
                if joinBnlj.cost(True) < joinNlj.cost(True):
                    winner = joinBnlj
                else:
                    winner = joinNlj
                if bestJoin == None or winner.cost(True) < bestJoin.cost(True):
                    bestJoin = winner

                self.reportPlanCount += 2
                # NOTE(review): source indentation was lost; this call is
                # assumed to run once per split — confirm.
                self.clearSampleFiles()
            optDict[tuple(fullList)] = bestJoin

    newPlan = optDict[tuple(sorted(relations))]

    if isGroupBy:
        oldRoot = plan.root
        newGroupBy = GroupBy(newPlan.root,
                             groupSchema=oldRoot.groupSchema,
                             aggSchema=oldRoot.aggSchema,
                             groupExpr=oldRoot.groupExpr,
                             aggExprs=oldRoot.aggExprs,
                             groupHashFn=oldRoot.groupHashFn)
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def joinsOptimizer(self, operator, aPaths):
    """Build hash-join plans level by level and return the cached plan for ``operator``.

    Level 1 records every access path in ``aPaths`` with its page estimate.
    Each later level extends every plan of the previous level with an access
    path it does not already contain, provided a field pair from
    ``self.decodeJoinExprs(operator)`` links the two schemas. New joins are
    recorded through ``self.addPlanCost`` with ``(pages, totalCost)``.

    Returns:
        The first element of ``self.getPlanCost(operator)``.
    """
    defaultScaleFactor = 50
    defaultPartiNumber = 5
    # build join constraint list;
    joinExprs = self.decodeJoinExprs(operator)
    # build a local plan-cost dict: `prev` holds the previous level, `curr`
    # collects the level being built.
    prev = dict()
    curr = dict()
    n = len(aPaths)
    # i = 1
    for aPath in aPaths:
        # Here we define cost by number of pages.
        cards = Plan(root=aPath).sample(defaultScaleFactor)
        pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
        numPages = cards / (pageSize / aPath.schema().size)
        # Here we only consider reorganizing joins,
        # so we simply put the access paths' total cost as 0.
        self.addPlanCost(aPath, (numPages, 0))
        prev[aPath] = (numPages, 0)
    # i = 2...n
    for i in range(1, n):
        # build current list with prev.
        # For 2-way joins, we don't need to care about left-deep plans
        for p in prev.keys():
            # Access paths already used inside p are excluded from extension.
            accP = self.allAccessPaths(p)
            remL = [item for item in aPaths if item not in accP]
            for base in remL:
                lhsSchema = p.schema()
                rhsSchema = base.schema()
                newJoin = None
                (sCostL, tCostL) = prev[p]
                (rPlan, costR) = self.getPlanCost(base)
                # Here we are using System-R's heuristic to eliminate permutations as
                # much as possible.
                # Reference: Selinger, 1979, http://www.cs.berkeley.edu/~brewer/cs262/3-selinger79.pdf
                for (lField, rField) in joinExprs:
                    if lField in lhsSchema.fields and rField in rhsSchema.fields:
                        # Build Join
                        # We only select hashjoin for building join plans
                        # This is because the nested-loop-join contains a bug
                        lKeySchema = DBSchema('left', [
                            (f, t) for (f, t) in lhsSchema.schema() if f == lField
                        ])
                        rKeySchema = DBSchema('right', [
                            (f, t) for (f, t) in rhsSchema.schema() if f == rField
                        ])
                        lHashFn = 'hash(' + lField + ') % ' + str(
                            defaultPartiNumber)
                        rHashFn = 'hash(' + rField + ') % ' + str(
                            defaultPartiNumber)
                        newJoin = Join(p, rPlan, method='hash', \
                                       lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                       rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)
                    elif lField in rhsSchema.fields and rField in lhsSchema.fields:
                        # Build Join
                        # We only select hashjoin for building join plans
                        # This is because the nested-loop-join contains a bug
                        # The constraint is stated the other way round: here
                        # lKeySchema is built from the rhs schema and
                        # rKeySchema from the lhs schema, and they are passed
                        # to Join swapped so each side gets its own field.
                        lKeySchema = DBSchema('left', [
                            (f, t) for (f, t) in rhsSchema.schema() if f == lField
                        ])
                        rKeySchema = DBSchema('right', [
                            (f, t) for (f, t) in lhsSchema.schema() if f == rField
                        ])
                        lHashFn = 'hash(' + rField + ') % ' + str(
                            defaultPartiNumber)
                        rHashFn = 'hash(' + lField + ') % ' + str(
                            defaultPartiNumber)
                        newJoin = Join(p, rPlan, method='hash', \
                                       lhsHashFn=lHashFn, lhsKeySchema=rKeySchema, \
                                       rhsHashFn=rHashFn, rhsKeySchema=lKeySchema)
                    else:
                        continue
                # NOTE(review): source indentation was lost. This check is
                # assumed to follow the constraint loop (so a later matching
                # constraint replaces an earlier one) — confirm.
                if newJoin is not None:
                    # Let's push newJoin onto the cache and curr list
                    # cost: 3(M+N) + M's totalcost
                    # then we renew newJoin's stepcost
                    newJoin.prepare(self.db)
                    stepCost = 3 * (sCostL + costR[0])
                    totalCost = stepCost + tCostL
                    cards = Plan(
                        root=newJoin).sample(defaultScaleFactor)
                    pageSize, _, _ = self.db.storage.relationStats(
                        newJoin.relationId())
                    pages = cards / (pageSize / newJoin.schema().size)
                    self.addPlanCost(newJoin, (pages, totalCost))
                    curr[newJoin] = (pages, totalCost)
        # The level just built becomes the input of the next one.
        prev = curr
        curr = dict()
    del prev, curr
    return self.getPlanCost(operator)[0]
def pickJoinOrder(self, plan):
    """Greedily merge the two sub-plans whose join is cheapest until one remains.

    The work list starts with one prepared scan per relation (wrapped in a
    ``Select`` when select expressions exist for it). Each round tries every
    pair in both orientations with block-nested-loops and nested-loops, and
    replaces the winning pair with its join.

    A ``GroupBy`` at the root of the input plan is re-applied on top of the
    result, and a ``Project`` is added when the output schema would otherwise
    differ from the original.

    Side effects:
        Resets and updates ``self.reportPlanCount``; calls
        ``self.clearSampleFiles()``.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    self.reportPlanCount = 0

    worklist = []
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        table.prepare(self.db)
        if (r,) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r,)])
            select = Select(table, selectString)
            select.prepare(self.db)
            worklist.append(Plan(root=select))
        else:
            worklist.append(Plan(root=table))

    while len(worklist) > 1:
        combos = itertools.combinations(worklist, 2)
        bestJoin = None
        sourcePair = None
        for pair in combos:
            first, second = pair
            op1 = first.root
            op2 = second.root
            selectExpr = self.createExpression(first.relations(),
                                               second.relations(),
                                               selectTablesDict)
            joinExpr = self.createExpression(first.relations(),
                                             second.relations(),
                                             joinTablesDict)

            # Both orientations for each of the two algorithms.
            joinOps = [
                Join(op1, op2, expr=joinExpr, method="block-nested-loops"),
                Join(op2, op1, expr=joinExpr, method="block-nested-loops"),
                Join(op1, op2, expr=joinExpr, method="nested-loops"),
                Join(op2, op1, expr=joinExpr, method="nested-loops"),
            ]
            if selectExpr == "True":
                joinList = joinOps
            else:
                joinList = [Select(joinOp, selectExpr) for joinOp in joinOps]

            for j in joinList:
                joinplan = Plan(root=j)
                joinplan.prepare(self.db)
                joinplan.sample(100)
                if bestJoin == None or joinplan.cost(True) < bestJoin.cost(True):
                    bestJoin = joinplan
                    sourcePair = pair

            self.reportPlanCount += 4
            # NOTE(review): source indentation was lost; this call is assumed
            # to run once per pair — confirm.
            self.clearSampleFiles()

        worklist.remove(sourcePair[0])
        worklist.remove(sourcePair[1])
        worklist.append(bestJoin)

    newPlan = worklist[0]

    if isGroupBy:
        oldRoot = plan.root
        newGroupBy = GroupBy(newPlan.root,
                             groupSchema=oldRoot.groupSchema,
                             aggSchema=oldRoot.aggSchema,
                             groupExpr=oldRoot.groupExpr,
                             aggExprs=oldRoot.aggExprs,
                             groupHashFn=oldRoot.groupHashFn)
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def selectPushDown(self, plan):
    """Walk ``plan`` from the root, carrying select predicates down the tree.

    Predicates of each ``Select`` are decomposed with ``decomposeCNF`` and
    accumulated. At a ``Join`` they are split by which side's fields cover
    them. At a ``GroupBy``, the remaining binary operator, or a leaf they are
    handed to ``self.placeSelect``. ``(operator, parent)`` pairs are collected
    in ``selectResult``.

    Returns:
        ``Plan`` rooted at the operator of the first ``selectResult`` entry.
    """
    root = plan.root
    selectResult = []
    # Work list used as a stack (extendleft/popleft); entries have the form
    # (current op, parent op, accumulated selects)
    queue = deque([(root, None, None)])
    while queue:
        (curr, parent, accuSelect) = queue.popleft()
        children = curr.inputs()
        if children:
            # When dealing with Select, collect its expressions into the accumulated selects
            if isinstance(curr, Select):
                if not accuSelect:
                    accuSelect = []
                for decomp in ExpressionInfo(curr.selectExpr).decomposeCNF():
                    accuSelect.append(decomp)
                queue.extendleft([(children[0], curr, accuSelect)])
            # Do not push down Project at this point, so put it into the result.
            # Accumulated selects are always passed through a Project
            elif isinstance(curr, Project):
                selectResult.append((curr, parent))
                queue.extendleft([(children[0], curr, accuSelect)])
            # When encountering a join, separate the accumulated select expressions into three parts:
            # one part goes to the left, one goes to the right, and the remainder is placed above the join operator
            elif isinstance(curr, Join):
                leftSelect = []
                rightSelect = []
                newSelect = None
                leftFields = curr.lhsSchema.fields
                rightFields = curr.rhsSchema.fields
                put = []
                if accuSelect:
                    for a in accuSelect:
                        f = ExpressionInfo(a).getAttributes()
                        flag = False
                        # A predicate covered by both sides is sent to both.
                        if set(f).issubset(set(leftFields)):
                            leftSelect.append(a)
                            flag = True
                        if set(f).issubset(set(rightFields)):
                            rightSelect.append(a)
                            flag = True
                        if not flag:
                            put.append(a)
                    if put:
                        newSelect = self.placeSelect(put, curr, parent, selectResult)
                if newSelect:
                    selectResult.append((curr, newSelect))
                else:
                    selectResult.append((curr, parent))
                queue.extendleft([(curr.lhsPlan, curr, leftSelect)])
                queue.extendleft([(curr.rhsPlan, curr, rightSelect)])
            # When encountering a GroupBy, place all the accumulated selects
            elif isinstance(curr, GroupBy):
                newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)
                if newSelect:
                    selectResult.append((curr, newSelect))
                else:
                    selectResult.append((curr, parent))
                queue.extendleft([(children[0], curr, None)])
            # Deal with union similarly to join
            else:
                leftSelect = []
                rightSelect = []
                newSelect = None
                attrs = curr.unionSchema.fields
                put = []
                if accuSelect:
                    for a in accuSelect:
                        f = ExpressionInfo(a).getAttributes()
                        if set(f).issubset(set(attrs)):
                            leftSelect.append(a)
                            rightSelect.append(a)
                        else:
                            put.append(a)
                    # NOTE(review): source indentation was lost; this call is
                    # assumed to sit inside the `if accuSelect` guard. It also
                    # passes accuSelect rather than `put`, unlike the Join
                    # branch, so `put` is unused here — confirm both.
                    newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)
                if newSelect:
                    selectResult.append((curr, newSelect))
                else:
                    selectResult.append((curr, parent))
                queue.extendleft([(curr.lhsPlan, curr, leftSelect)])
                queue.extendleft([(curr.rhsPlan, curr, rightSelect)])
        # Operator without inputs (table scan): place all the accumulated selects
        else:
            newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)
            if newSelect:
                selectResult.append((curr, newSelect))
            else:
                selectResult.append((curr, parent))
    newRoot = selectResult[0][0]
    return Plan(root=newRoot)
def pickJoinOrder(self, plan):
    """Reorder the joins of ``plan`` with a subset-based dynamic program.

    Restrictions:
      1. Hash-joins and index-joins are not supported: a join without a
         ``joinExpr`` makes the method return ``plan`` unchanged.
      2. Only join operations beyond a certain point are handled (no
         join -> aggregation -> join, etc.).

    ``self.extractJoinInfo`` fills ``tableIds``, ``joinOps``, ``optPlans``
    and ``fields`` and returns the operator whose ``subPlan`` receives the
    optimized join tree. For every subset of table ids, each member is tried
    as the right-hand side against the stored plan for the rest, with
    nested-loops and block-nested-loops.

    Returns:
        ``plan`` itself, re-prepared after its join subtree was replaced.
    """
    tableIds = list()
    joinOps = list()
    optPlans = dict()
    fields = dict()
    firstOpWithJoins = self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields)

    if len(joinOps) == 0:
        return plan

    numTables = 2
    while numTables <= len(tableIds):
        for possibleJoinOrder in itertools.combinations(tableIds, numTables):
            minCost = None
            optPlan = None

            for tableId in possibleJoinOrder:
                # The right-hand operand is always a single table.
                lhsIds = list(possibleJoinOrder)
                lhsIds.remove(tableId)
                lhsOpt = optPlans.get(self.getJoinKey(lhsIds))
                rhsOpt = optPlans.get(str(tableId))
                if lhsOpt is None or rhsOpt is None:
                    continue  # Skip irrelevant joins

                # Attributes available to this join.
                allAttrs = list()
                for lhsId in lhsIds:
                    allAttrs.extend(fields[lhsId])
                allAttrs.extend(fields[tableId])

                # First join expression that this join can satisfy.
                currJoinExpr = None
                for join in joinOps:
                    if join.joinExpr:
                        joinAttrs = ExpressionInfo(join.joinExpr).getAttributes()
                        if self.contains(allAttrs, joinAttrs):
                            currJoinExpr = join.joinExpr
                            break
                    else:
                        # Should not involve hash-joins or index-joins (limitation)
                        return plan
                if currJoinExpr is None:
                    continue  # Irrelevant join

                for joinMethod in ["nested-loops", "block-nested-loops"]:
                    possiblePlan = Plan(root=Join(lhsPlan=lhsOpt,
                                                  rhsPlan=rhsOpt,
                                                  method=joinMethod,
                                                  expr=currJoinExpr))
                    possiblePlan.prepare(self.db)
                    possiblePlan.sample(1.0)

                    # Cache per candidate. Keying on the input `plan` made
                    # every candidate after the first reuse the first cost.
                    cost = self.getPlanCost(possiblePlan)
                    if cost is None:
                        cost = possiblePlan.cost(estimated=True)
                    self.addPlanCost(possiblePlan, cost)

                    if minCost is None or cost < minCost:
                        minCost = cost
                        optPlan = possiblePlan

            optPlans[self.getJoinKey(possibleJoinOrder)] = (
                None if optPlan is None else optPlan.root)
        numTables = numTables + 1

    firstOpWithJoins.subPlan = optPlans[self.getJoinKey(tableIds)]
    plan.prepare(self.db)
    return plan
def pickJoinOrder(self, plan):
    """Greedily join the cheapest pair of sub-plans until one remains.

    ``self.extractJoinInfo`` fills ``tableIds``, ``joinOps``, ``optPlans``
    and ``fields``. The work list holds string keys: a table id, or several
    ids joined by ``","`` for an already-built join. Each round tries every
    pair in both orientations with nested-loops and block-nested-loops, and
    replaces the winning pair with the key of its join.

    Side effects:
        Resets and updates ``self.numPlansConsidered``; records costs through
        ``self.addPlanCost``.

    Returns:
        The operator stored in ``optPlans`` for the first key left in the
        work list (an operator, not a ``Plan``).
    """
    self.numPlansConsidered = 0

    tableIds = list()
    joinOps = list()
    optPlans = dict()
    fields = dict()
    self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields)

    worklist = [str(tableId) for tableId in tableIds]

    while len(worklist) >= 2:
        # Cheapest join that can be made over the remaining sub-plans.
        minCost = None
        optPlan = None
        optLhsId = None
        optRhsId = None

        for (lhsId, rhsId) in itertools.combinations(worklist, 2):
            lhsOpt = optPlans.get(lhsId)
            rhsOpt = optPlans.get(rhsId)
            if lhsOpt is None or rhsOpt is None:
                continue  # Skip irrelevant joins

            # A key may stand for several tables once joins are in the list.
            allAttrs = list()
            for lId in lhsId.split(","):
                allAttrs.extend(fields[int(lId)])
            for rId in rhsId.split(","):
                allAttrs.extend(fields[int(rId)])

            # First join expression that this pair can satisfy.
            currJoinExpr = None
            for join in joinOps:
                if join.joinExpr:
                    joinAttrs = ExpressionInfo(join.joinExpr).getAttributes()
                    if self.contains(allAttrs, joinAttrs):
                        currJoinExpr = join.joinExpr
                        break
            if currJoinExpr is None:
                continue  # Skip irrelevant joins

            self.numPlansConsidered += 2

            # Given orientation first, then with the sides swapped.
            orientations = (
                (lhsOpt, rhsOpt, lhsId, rhsId),
                (rhsOpt, lhsOpt, rhsId, lhsId),
            )
            for (leftOp, rightOp, leftId, rightId) in orientations:
                for joinMethod in ["nested-loops", "block-nested-loops"]:
                    possiblePlan = Plan(root=Join(lhsPlan=leftOp,
                                                  rhsPlan=rightOp,
                                                  method=joinMethod,
                                                  expr=currJoinExpr))
                    possiblePlan.prepare(self.db)
                    possiblePlan.sample(1.0)

                    # Cache per candidate. Keying on the input `plan` made
                    # every candidate after the first reuse the first cost.
                    cost = self.getPlanCost(possiblePlan)
                    if cost is None:
                        cost = possiblePlan.cost(estimated=True)
                    self.addPlanCost(possiblePlan, cost)

                    if minCost is None or cost < minCost:
                        minCost = cost
                        optPlan = possiblePlan
                        optLhsId = leftId
                        optRhsId = rightId

        if optPlan is None:
            # No joinable pair is left. The work list would not change, so
            # further rounds cannot make progress.
            break

        joinKey = optLhsId + "," + optRhsId
        optPlans[joinKey] = optPlan.root
        worklist.remove(optLhsId)
        worklist.remove(optRhsId)
        worklist.append(joinKey)

    return optPlans[worklist[0]]
def joinsOptimizer(self, operator, aPaths):
    """Populate the plan cache with hash joins over growing groups of ``aPaths``.

    Every access path is first recorded with its page estimate. For each
    group ``S`` from ``comb(aPaths, i + 1)`` and each ``O`` from
    ``self.powerSet(S)``, the cached plans for ``O`` and for the rest of
    ``S`` are joined when ``self.joinable`` returns a field pair. The join is
    recorded unless ``self.isRightDeep`` rejects it (never at the first
    level).

    Returns:
        The first element of ``self.getPlanCost(operator)``.
    """
    defaultScaleFactor = 10
    defaultPartiNumber = 5

    def cacheKey(plans):
        # Sorted tuple of operator ids identifies a group of access paths.
        return tuple(sorted(member.id() for member in plans))

    joinExprs = self.decodeJoinExprs(operator)
    n = len(aPaths)

    # Level 1: access paths, priced in pages, with accumulated cost 0.
    for aPath in aPaths:
        cards = Plan(root=aPath).sample(defaultScaleFactor)
        pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
        numPages = cards / (pageSize / aPath.schema().size)
        self.addPlanCost(aPath, (numPages, 0))

    for i in range(1, n):
        for S in comb(aPaths, i + 1):
            for O in self.powerSet(S):
                keyL = cacheKey(O)
                keyR = cacheKey([ele for ele in S if ele not in O])

                # A side may be missing from the cache: it was filtered out
                # as right-deep, or no constraint was associated with it.
                if not (keyL in self.statsCache and keyR in self.statsCache):
                    continue
                (planForO, costL) = self.statsCache[keyL]
                (remindPl, costR) = self.statsCache[keyR]

                fields = self.joinable(joinExprs, [planForO, remindPl])
                if fields is None:
                    continue

                lKeySchema = DBSchema(
                    'left',
                    [(f, t) for (f, t) in planForO.schema().schema()
                     if f == fields[0]])
                rKeySchema = DBSchema(
                    'right',
                    [(f, t) for (f, t) in remindPl.schema().schema()
                     if f == fields[1]])
                lHashFn = 'hash(' + fields[0] + ') % ' + str(defaultPartiNumber)
                rHashFn = 'hash(' + fields[1] + ') % ' + str(defaultPartiNumber)
                newJoin = Join(planForO, remindPl, method='hash',
                               lhsHashFn=lHashFn, lhsKeySchema=lKeySchema,
                               rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                if not ((i == 1) or (not self.isRightDeep(newJoin, aPaths))):
                    continue

                newJoin.prepare(self.db)
                # Output pages of the new join.
                cards = Plan(root=newJoin).sample(defaultScaleFactor)
                pageSize, _, _ = self.db.storage.relationStats(newJoin.relationId())
                pages = cards / (pageSize / newJoin.schema().size)
                # Accumulated cost: both inputs plus 3 * (their pages).
                totalCost = costL[1] + costR[1] + 3 * (costL[0] + costR[0])
                self.addPlanCost(newJoin, (pages, totalCost))

    return self.getPlanCost(operator)[0]
def pickJoinOrder(self, plan):
    """Choose a join order by filling a table of best plans per relation subset.

    Pass 1 stores, for every relation, a table scan (wrapped in a Select when
    ``selectTablesDict`` holds predicates for that relation alone).  Each later
    pass considers all relation subsets of the next size: for every split
    returned by ``self.getCombos`` it builds a block-nested-loops and a
    nested-loops join of the two stored sub-plans, samples both, and keeps the
    cheapest plan seen for the subset.  Afterwards the original GroupBy (if the
    input root was one) is re-applied and a Project is added when the resulting
    schema differs from the input plan's schema.

    Side effects: resets and increments ``self.reportPlanCount`` and calls
    ``self.clearSampleFiles()`` after each evaluated split.

    Parameters:
        plan: the plan whose joins are to be reordered.

    Returns:
        The rebuilt Plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # Both dicts map a tuple of relations to the expressions involving them;
    # they supply the join and select predicates used when combining sub-plans.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    optDict = {}
    self.reportPlanCount = 0

    # Pass 1: single-relation access plans.
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        if (r,) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r,)])
            optDict[(r,)] = Plan(root=Select(table, selectString))
        else:
            optDict[(r,)] = Plan(root=table)
        self.reportPlanCount += 1

    # Passes 2..n: best plan for every subset of that many relations.
    for npass in range(2, len(relations) + 1):
        for c in itertools.combinations(relations, npass):
            fullList = sorted(c)
            bestJoin = None
            for subcombo in self.getCombos(fullList):
                complement = self.getComplement(fullList, subcombo)
                leftOps = optDict[tuple(complement)].root
                rightOps = optDict[tuple(subcombo)].root
                selectExpr = self.createExpression(
                    complement, subcombo, selectTablesDict)
                joinExpr = self.createExpression(
                    complement, subcombo, joinTablesDict)

                # Build and sample both physical alternatives before costing
                # either, block-nested-loops first.
                sampled = []
                for method in ("block-nested-loops", "nested-loops"):
                    joinOp = Join(leftOps, rightOps, expr=joinExpr,
                                  method=method)
                    # Only wrap in a Select when there is a real predicate.
                    if selectExpr == "True":
                        candidate = Plan(root=joinOp)
                    else:
                        candidate = Plan(root=Select(joinOp, selectExpr))
                    candidate.prepare(self.db)
                    candidate.sample(100)
                    sampled.append(candidate)
                joinBnlj, joinNlj = sampled

                # Ties between the two methods go to nested-loops.
                bnljCost = joinBnlj.cost(True)
                nljCost = joinNlj.cost(True)
                if bnljCost < nljCost:
                    winner, winnerCost = joinBnlj, bnljCost
                else:
                    winner, winnerCost = joinNlj, nljCost

                # The incumbent's cost is re-read on every comparison rather
                # than cached, since its operators are shared with the
                # candidates sampled above.
                if bestJoin is None or winnerCost < bestJoin.cost(True):
                    bestJoin = winner

                self.reportPlanCount += 2
                self.clearSampleFiles()
            optDict[tuple(fullList)] = bestJoin

    # Best plan covering every relation.
    newPlan = optDict[tuple(sorted(relations))]

    if isGroupBy:
        newGroupBy = GroupBy(newPlan.root,
                             groupSchema=plan.root.groupSchema,
                             aggSchema=plan.root.aggSchema,
                             groupExpr=plan.root.groupExpr,
                             aggExprs=plan.root.aggExprs,
                             groupHashFn=plan.root.groupHashFn)
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    # Restore the original output schema if reordering changed the field set.
    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def pickJoinOrder(self, plan):
    """Choose a join order greedily by repeatedly merging the cheapest pair of plans.

    A worklist starts with one prepared plan per relation (a table scan,
    wrapped in a Select when ``selectTablesDict`` holds predicates for that
    relation alone).  While more than one plan remains, every pair of worklist
    plans is tried in both operand orders with block-nested-loops and
    nested-loops joins; the cheapest sampled candidate replaces its two source
    plans.  Afterwards the original GroupBy (if the input root was one) is
    re-applied and a Project is added when the resulting schema differs from
    the input plan's schema.

    Side effects: resets and increments ``self.reportPlanCount`` and calls
    ``self.clearSampleFiles()`` after each evaluated pair.

    Parameters:
        plan: the plan whose joins are to be reordered.

    Returns:
        The rebuilt Plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # Both dicts map a tuple of relations to the expressions involving them.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    self.reportPlanCount = 0

    # Seed the worklist with one access plan per relation.
    worklist = []
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        table.prepare(self.db)
        if (r,) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r,)])
            select = Select(table, selectString)
            select.prepare(self.db)
            worklist.append(Plan(root=select))
        else:
            worklist.append(Plan(root=table))

    while len(worklist) > 1:
        bestJoin = None
        sourcePair = None
        for pair in itertools.combinations(worklist, 2):
            op1 = pair[0].root
            op2 = pair[1].root
            selectExpr = self.createExpression(
                pair[0].relations(), pair[1].relations(), selectTablesDict)
            joinExpr = self.createExpression(
                pair[0].relations(), pair[1].relations(), joinTablesDict)

            # Candidate order: BNLJ(op1, op2), BNLJ(op2, op1),
            # NLJ(op1, op2), NLJ(op2, op1).
            joinList = [
                Join(lhs, rhs, expr=joinExpr, method=method)
                for method in ("block-nested-loops", "nested-loops")
                for (lhs, rhs) in ((op1, op2), (op2, op1))
            ]
            # Only wrap in a Select when there is a real predicate.
            if selectExpr != "True":
                joinList = [Select(j, selectExpr) for j in joinList]

            for j in joinList:
                joinplan = Plan(root=j)
                joinplan.prepare(self.db)
                joinplan.sample(100)
                # Strict '<' keeps the earliest candidate on ties.
                if bestJoin is None or joinplan.cost(True) < bestJoin.cost(True):
                    bestJoin = joinplan
                    sourcePair = pair

            self.reportPlanCount += 4
            self.clearSampleFiles()

        # Replace the two merged plans by their join.
        worklist.remove(sourcePair[0])
        worklist.remove(sourcePair[1])
        worklist.append(bestJoin)

    newPlan = worklist[0]

    if isGroupBy:
        newGroupBy = GroupBy(newPlan.root,
                             groupSchema=plan.root.groupSchema,
                             aggSchema=plan.root.aggSchema,
                             groupExpr=plan.root.groupExpr,
                             aggExprs=plan.root.aggExprs,
                             groupHashFn=plan.root.groupHashFn)
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    # Restore the original output schema if reordering changed the field set.
    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def projectPushDown(self, plan):
    """Walk the plan from its root, carrying projection expressions toward the leaves.

    Project operators met on the way are not recorded themselves; their
    expressions are accumulated and handed to their child.  Every other
    operator is recorded in an ``(operator, parent)`` list, preceded by a
    newly built ``Project`` over it when the accumulated projection has to
    stop at that operator:

    * Select: the projection passes through when every attribute of the
      select expression is among the projected attributes; otherwise it
      stops above the Select.
    * GroupBy, Join and any other operator with inputs: the projection always
      stops above the operator (it is not decomposed across inputs).
    * Leaf operators: the projection is placed directly above the leaf.

    NOTE(review): only the first recorded operator is used, as the root of
    the returned plan; this method does not update the input links of the
    existing operators.

    Parameters:
        plan: the plan to process; traversal starts at ``plan.root``.

    Returns:
        ``Plan(root=<first recorded operator>)``.
    """
    result = []
    # Work items are (operator, parent, accumulated projection expressions).
    # Items are pushed and popped on the left, so the most recently pushed
    # operator is visited next.
    queue = deque([(plan.root, None, None)])

    def place(op, parent, exprs):
        # Record op under parent, inserting a new Project between them when
        # there are accumulated expressions to apply.
        if exprs:
            wrapper = Project(op, exprs)
            result.append((wrapper, parent))
            result.append((op, wrapper))
        else:
            result.append((op, parent))

    while queue:
        (curr, parent, accuProject) = queue.popleft()
        children = curr.inputs()

        if not children:
            place(curr, parent, accuProject)

        elif isinstance(curr, Project):
            # Merge into a fresh dict: updating with a set literal wrapping
            # the dict raises TypeError, and updating in place would mutate
            # the projectExprs of the operator the dict first came from.
            merged = dict(accuProject or {})
            merged.update(curr.projectExprs or {})
            queue.appendleft((children[0], curr, merged))

        elif isinstance(curr, Select):
            blocked = False
            if accuProject:
                selectAttrs = ExpressionInfo(curr.selectExpr).getAttributes()
                projectAttrs = self.getProjectAttrs(accuProject)
                # The projection cannot pass a Select that reads attributes
                # the projection would drop.
                blocked = not set(selectAttrs).issubset(set(projectAttrs))
            if blocked:
                place(curr, parent, accuProject)
                queue.appendleft((children[0], curr, None))
            else:
                result.append((curr, parent))
                queue.appendleft((children[0], curr, accuProject))

        elif isinstance(curr, GroupBy):
            place(curr, parent, accuProject)
            queue.appendleft((children[0], curr, None))

        else:
            # Join and remaining two-input operators: the projection is kept
            # whole above the operator.  NOTE: splitting it into per-side
            # projections (fields of lhs vs. rhs) would be a possible
            # extension; it is intentionally not done here.
            place(curr, parent, accuProject)
            queue.appendleft((curr.lhsPlan, curr, None))
            queue.appendleft((curr.rhsPlan, curr, None))

    newRoot = result[0][0]
    return Plan(root=newRoot)
def pushdownOperators(self, plan):
    """Return a new Plan rooted at ``self.pushdownHelper(plan.root)``.

    Parameters:
        plan: the plan to rewrite; may be falsy (e.g. ``None``).

    Returns:
        The rewritten Plan, or ``None`` when ``plan`` is falsy.
    """
    if not plan:
        # Nothing to rewrite; make the "no plan" result explicit.
        return None
    planRoot = self.pushdownHelper(plan.root)
    return Plan(root=planRoot)