def processJoin(self, leftPlan, leftRels): newCost = None newJoin = None newRels = None for right in [frozenset({right}) for right in self.rels - leftRels]: value = self.joinMap[right] self.combsTried += 1 union = leftRels.union(right) if not value: continue else: for tuple in value: if not tuple[0].issubset(union): continue self.plansProcessed += 1 newJoin = Join(leftPlan, self.tableScans[right], expr=tuple[1].joinExpr, method="block-nested-loops") newJoin.prepare(self.db) newCost = newJoin.cost(estimated=True) newRels = frozenset(leftRels.union(right)) return (newCost, newJoin, newRels)
def pickJoinOrder(self, plan): rels = set(plan.relations()) optPlans = {} #Map a set of relations to the optimized plan #toBeProcessed = [] #Set of relations pending processing self.combsTried = 0 self.plansProcessed = 0 for r in rels: set_r = frozenset({r}) #toBeProcessed.append(set_r) newScan = TableScan(r, self.db.relationSchema(r)) newScan.prepare(self.db) optPlans[set_r] = newScan #For each join operator, fetch its relative relations #Map a set of relations to (relative relations, operator) joinMap = {} for (_, op) in plan.flatten(): if isinstance(op, Join): relativeR = self.relativeRelations(rels, op) for r in [frozenset({r}) for r in relativeR]: if r in joinMap.keys(): joinMap[r].append((relativeR, op)) else: joinMap[r] = [(relativeR, op)] n = len(rels) for i in range(2, n + 1): for union in [frozenset(union) for union in self.kRelsComb(i, rels)]: for right in [frozenset(right) for right in self.kRelsComb(1, union)]: left = frozenset(union - right) for t in left: self.combsTried += 1 value = joinMap[frozenset({t})] if not value: continue else: for tuple in value: if not (set(tuple[0]).issubset(union) and left in optPlans and right in optPlans): continue self.plansProcessed += 1 newJoin = Join(optPlans[left], optPlans[right], expr=tuple[1].joinExpr, method="block-nested-loops") newJoin.prepare(self.db) if not union in optPlans: optPlans[union] = newJoin self.addPlanCost(newJoin, newJoin.cost(estimated=True)) else: formerCost = self.getPlanCost(optPlans[union]) if newJoin.cost(estimated=True) < formerCost: optPlans[union] = newJoin self.addPlanCost(newJoin, newJoin.cost(estimated=True)) newRoot = optPlans[frozenset(rels)] return Plan(root=newRoot) '''
def joinsOptimizer(self, operator, aPaths): defaultScaleFactor = 10 defaultPartiNumber = 5 n = len(aPaths) planList = [] costList = [] # i = 1 for aPath in aPaths: # Here we define cost by number of pages. cards = Plan(root=aPath).sample(defaultScaleFactor) pageSize, _, _ = self.db.storage.relationStats(aPath.relationId()) numPages = cards / (pageSize / aPath.schema().size) # Here we only consider reorganize joins # so that we simple put accessPaths' total cost as 0. planList.append(aPath) costList.append((numPages, 0)) # i = 2...n for i in range(1, n): # find all possible two way join in current planList # put the potential joins in potentialP # put the potential joins cost in potentialC m = len(planList) potentialP = [] potentialC = [] for j in range(0, m - 1): for k in range(j + 1, m): self.pcntr += 1 potentialP.append((planList[j], planList[k])) potentialC.append(3 * (costList[j][0] + costList[k][0]) + costList[j][1] + costList[k][1]) # find the cheapest joinable join (total cost) # build the join, remove the used two base plan and add the new join to planList # modify the costList as well while (potentialC): currC = min(potentialC) currP = potentialP[potentialC.index(currC)] potentialC.remove(currC) potentialP.remove(currP) if (self.joinable(operator, currP)): (lField, rField) = self.joinable(operator, currP) lhsSchema = currP[0].schema() rhsSchema = currP[1].schema() lKeySchema = DBSchema( 'left', [(f, t) for (f, t) in lhsSchema.schema() if f == lField]) rKeySchema = DBSchema( 'right', [(f, t) for (f, t) in rhsSchema.schema() if f == rField]) lHashFn = 'hash(' + lField + ') % ' + str( defaultPartiNumber) rHashFn = 'hash(' + rField + ') % ' + str( defaultPartiNumber) newJoin = Join(currP[0], currP[1], method='hash', \ lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \ rhsHashFn=rHashFn, rhsKeySchema=rKeySchema) newJoin.prepare(self.db) totalCost = currC cards = Plan(root=newJoin).sample(defaultScaleFactor) pageSize, _, _ = self.db.storage.relationStats( newJoin.relationId()) pages = cards / (pageSize / newJoin.schema().size) id1 = planList.index(currP[0]) _ = planList.pop(id1) id2 = planList.index(currP[1]) _ = planList.pop(id2) planList.append(newJoin) _ = costList.pop(id1) _ = costList.pop(id2) costList.append((pages, totalCost)) break print("GreedyOptimizer plan considered: ", self.pcntr) return planList[0]
def joinsOptimizer(self, operator, aPaths): defaultScaleFactor = 50 defaultPartiNumber = 5 # build join constraint list; joinExprs = self.decodeJoinExprs(operator) # build a local plan-cost dict: prev = dict() curr = dict() n = len(aPaths) # i = 1 for aPath in aPaths: # Here we define cost by number of pages. cards = Plan(root=aPath).sample(defaultScaleFactor) pageSize, _, _ = self.db.storage.relationStats(aPath.relationId()) numPages = cards / (pageSize / aPath.schema().size) # Here we only consider reorganize joins # so that we simple put accessPaths' totalcost as 0. self.addPlanCost(aPath, (numPages, 0)) prev[aPath] = (numPages, 0) # i = 2...n for i in range(1, n): # build current list with prev. # For 2-way joins, we don't need to care left deep plan for p in prev.keys(): accP = self.allAccessPaths(p) remL = [item for item in aPaths if item not in accP] for base in remL: lhsSchema = p.schema() rhsSchema = base.schema() newJoin = None (sCostL, tCostL) = prev[p] (rPlan, costR) = self.getPlanCost(base) # Here we are using System-R 's heuristic to eliminate permutations as # much as possible. # Reference: Selinger, 1979, http://www.cs.berkeley.edu/~brewer/cs262/3-selinger79.pdf for (lField, rField) in joinExprs: if lField in lhsSchema.fields and rField in rhsSchema.fields: # Build Join # We only select hashjoin for building join plans # This is because the nested-loop-join contains a bug lKeySchema = DBSchema('left', [ (f, t) for (f, t) in lhsSchema.schema() if f == lField ]) rKeySchema = DBSchema('right', [ (f, t) for (f, t) in rhsSchema.schema() if f == rField ]) lHashFn = 'hash(' + lField + ') % ' + str( defaultPartiNumber) rHashFn = 'hash(' + rField + ') % ' + str( defaultPartiNumber) newJoin = Join(p, rPlan, method='hash', \ lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \ rhsHashFn=rHashFn, rhsKeySchema=rKeySchema) elif lField in rhsSchema.fields and rField in lhsSchema.fields: # Build Join # We only select hashjoin for building join plans # This is because the nested-loop-join contains a bug lKeySchema = DBSchema('left', [ (f, t) for (f, t) in rhsSchema.schema() if f == lField ]) rKeySchema = DBSchema('right', [ (f, t) for (f, t) in lhsSchema.schema() if f == rField ]) lHashFn = 'hash(' + rField + ') % ' + str( defaultPartiNumber) rHashFn = 'hash(' + lField + ') % ' + str( defaultPartiNumber) newJoin = Join(p, rPlan, method='hash', \ lhsHashFn=lHashFn, lhsKeySchema=rKeySchema, \ rhsHashFn=rHashFn, rhsKeySchema=lKeySchema) else: continue if newJoin is not None: # Let's push newJoin onto the cache and curr list # cost: 3(M+N) + M's totalcost # then we renew newJoin's stepcost newJoin.prepare(self.db) stepCost = 3 * (sCostL + costR[0]) totalCost = stepCost + tCostL cards = Plan( root=newJoin).sample(defaultScaleFactor) pageSize, _, _ = self.db.storage.relationStats( newJoin.relationId()) pages = cards / (pageSize / newJoin.schema().size) self.addPlanCost(newJoin, (pages, totalCost)) curr[newJoin] = (pages, totalCost) prev = curr curr = dict() del prev, curr return self.getPlanCost(operator)[0]
def joinsOptimizer(self, operator, aPaths): defaultScaleFactor = 10 defaultPartiNumber = 5 # build join constraint list; joinExprs = self.decodeJoinExprs(operator) # build a local plan-cost dict: n = len(aPaths) # i = 1 for aPath in aPaths: # Here we define cost by number of pages. cards = Plan(root=aPath).sample(defaultScaleFactor) pageSize, _, _ = self.db.storage.relationStats(aPath.relationId()) numPages = cards / (pageSize / aPath.schema().size) # Here we only consider reorganize joins # so that we simple put accessPaths' totalcost as 0. self.addPlanCost(aPath, (numPages, 0)) for i in range(1, n): for S in comb(aPaths, i + 1): for O in self.powerSet(S): # The following codes are added because some subPlans may # not be present in the self.statsCache as # 1) it was filtered out because it is a right-deep # 2) it has not any constraint associated. keyL = tuple(sorted(list(map(lambda x: x.id(), O)))) keyR = tuple( sorted( list( map(lambda x: x.id(), [ele for ele in S if ele not in O])))) planForO = None remindPl = None costL = None costR = None if keyL in self.statsCache and keyR in self.statsCache: (planForO, costL) = self.statsCache[tuple( sorted(list(map(lambda x: x.id(), O))))] (remindPl, costR) = self.statsCache[tuple( sorted( list( map(lambda x: x.id(), [ele for ele in S if ele not in O]))))] else: continue fields = self.joinable(joinExprs, [planForO, remindPl]) # If we detect constraints, we will create a new join from here. if fields is not None: lKeySchema = DBSchema( 'left', [(f, t) for (f, t) in planForO.schema().schema() if f == fields[0]]) rKeySchema = DBSchema( 'right', [(f, t) for (f, t) in remindPl.schema().schema() if f == fields[1]]) lHashFn = 'hash(' + fields[0] + ') % ' + str( defaultPartiNumber) rHashFn = 'hash(' + fields[1] + ') % ' + str( defaultPartiNumber) newJoin = Join(planForO, remindPl, method='hash', \ lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \ rhsHashFn=rHashFn, rhsKeySchema=rKeySchema) if (i == 1) or (not self.isRightDeep(newJoin, aPaths)): newJoin.prepare(self.db) # Calculate output pages; cards = Plan( root=newJoin).sample(defaultScaleFactor) pageSize, _, _ = self.db.storage.relationStats( newJoin.relationId()) pages = cards / (pageSize / newJoin.schema().size) # Calculate output costs: totalCost = costL[1] + costR[1] + 3 * (costL[0] + costR[0]) # Add new Join to self.statsCache self.addPlanCost(newJoin, (pages, totalCost)) return self.getPlanCost(operator)[0]