def get_best_join(self, candidates, required_joins): best_plan_cost = None best_plan = None for left, right in candidates: relevant_expr = None # Find the joinExpr corresponding to the current join candidate. If there is none, it's a # cartesian product. for join in required_joins: names = ExpressionInfo(join.joinExpr).getAttributes() if set(join.rhsSchema.fields).intersection(names) and set(join.lhsSchema.fields).intersection(names): relevant_expr = join.joinExpr break else: relevant_expr = 'True' # Construct a join plan for the current candidate, for each possible join algorithm. # TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods. for algorithm in ["nested-loops", "block-nested-loops"]: test_plan = Plan(root = Join( lhsPlan = left, rhsPlan = right, method = algorithm, expr = relevant_expr )) # Prepare and run the plan in sampling mode, and get the estimated cost. test_plan.prepare(self.db) test_plan.sample(1.0) cost = test_plan.cost(estimated = True) # Update running best. if best_plan_cost is None or cost < best_plan_cost: best_plan_cost = cost best_plan = test_plan # Need to return the root operator rather than the plan itself, since it's going back into the # table. return best_plan.root
def get_best_join(self, candidates, required_joins): best_plan_cost = None best_plan = None for left, right in candidates: relevant_expr = None # Find the joinExpr corresponding to the current join candidate. If there is none, it's a # cartesian product. for join in required_joins: names = ExpressionInfo(join.joinExpr).getAttributes() if set(join.rhsSchema.fields).intersection(names) and set(join.lhsSchema.fields).intersection(names): relevant_expr = join.joinExpr break else: relevant_expr = 'True' # Construct a join plan for the current candidate, for each possible join algorithm. # TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods. for algorithm in ["nested-loops", "block-nested-loops", "hash"]: test_plan = Plan(root = Join( lhsPlan = left, rhsPlan = right, method = algorithm, expr = relevant_expr )) # Prepare and run the plan in sampling mode, and get the estimated cost. test_plan.prepare(self.db) test_plan.sample(5.0) cost = test_plan.cost(estimated = True) # Update running best. if best_plan_cost is None or cost < best_plan_cost: best_plan_cost = cost best_plan = test_plan # Need to return the root operator rather than the plan itself, since it's going back into the # table. return best_plan.root
def pickJoinOrder(self, plan): joins, tableIDs, optimalSubPlans, fields, nubPlan = self.optimizerSetup( plan) # Joins is a list of joins # TableIDs is a list of the operator on top of a tableScan or the scan itself (Select, Projcect) # optimalSubPlan is a dictionary where the key is the top operator ID (from TableID) and val is the operator # fields is a dictionary where key is top operator ID, val is the dictionary of fields if len(joins) == 0: return plan numTables = 2 while numTables <= len(tableIDs): print('NumTables: ', numTables) joinOrderings = itertools.combinations(tableIDs, numTables) # Check each ordering, check each join method # Start with two tables total # pick one as the LHS, one as the RHS for joinOrdering in joinOrderings: # This iterates through subsets of size numTables bestCost = 1e99 bestPlan = None for rhsID in joinOrdering: # Eventually we'll even iterate through swapping 2-joins lhsIDs = list(joinOrdering) lhsIDs.remove(rhsID) # Make this one the right side join lhsKey = frozenset(lhsIDs) # Key for optimalSubPlan dict rhsKey = frozenset([rhsID]) # Key for optimalSubPlan dict cachedLHS = optimalSubPlans[ lhsKey] if lhsKey in optimalSubPlans else None # Get the optimal subPlan cachedRHS = optimalSubPlans[ rhsKey] # Get the optimal subPlan if cachedLHS is None or cachedRHS is None: continue # Do we even care about doing this join? allAttributes = [] for lhsID in lhsIDs: allAttributes.extend(fields[frozenset([ lhsID ])]) # These are all the attributes in the join allAttributes.extend(fields[rhsKey]) contains, planDict = self.checkJoins( joins, allAttributes, cachedLHS, cachedRHS) # print('Got to Contains') if contains: # print('PlanDict: ', planDict) for joinMethod in [ "hash", "nested-loops", "block-nested-loops" ]: if joinMethod == "hash": # print('HashMethod') lhsPlan = cachedLHS rhsPlan = cachedRHS lhsHashFn = planDict['lhsHashFn'] rhsHashFn = planDict['rhsHashFn'] lhsKeySchema = planDict['lhsKeySchema'] rhsKeySchema = planDict['rhsKeySchema'] tryPlan = Plan( root=Join(method=joinMethod, lhsPlan=cachedLHS, lhsHashFn=lhsHashFn, lhsKeySchema=lhsKeySchema, rhsPlan=cachedRHS, rhsHashFn=rhsHashFn, rhsKeySchema=rhsKeySchema)) self.checkPlan = tryPlan tryPlan.prepare(self.db) tryPlan.sample(1.0) cost = tryPlan.cost(estimated=True) # print('HashCost: ', cost) else: # print(joinMethod) joinExpr = planDict['joinExpr'] tryPlan = Plan(root=Join(lhsPlan=cachedLHS, rhsPlan=cachedRHS, method=joinMethod, expr=joinExpr)) self.checkPlan = tryPlan tryPlan.prepare(self.db) tryPlan.sample(1.0) cost = tryPlan.cost(estimated=True) # print(joinMethod + ' Cost: ', cost) if cost < bestCost: bestCost = cost bestPlan = tryPlan newKey = frozenset(joinOrdering) self.addPlanCost(newKey, bestCost, bestPlan) optimalSubPlans[ newKey] = bestPlan.root if bestPlan is not None else None numTables += 1 nubPlan.subPlan = self.statsCache[frozenset(tableIDs)][1].root plan.prepare(self.db) return plan
def pickJoinOrder(self, plan): relations = plan.relations() fieldDict = self.obtainFieldDict(plan) (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict) # makes dicts that maps a list of relations to exprs involving that list # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C # and on top of it the select exprs that involve 2 tables A,C or B,C isGroupBy = True if plan.root.operatorType() == "GroupBy" else False outputSchema = plan.schema() self.reportPlanCount = 0 worklist = [] for r in relations: table = TableScan(r,self.db.relationSchema(r)) table.prepare(self.db) if (r,) in selectTablesDict: selectExprs = selectTablesDict[(r,)] selectString = self.combineSelects(selectExprs) select = Select(table,selectString) select.prepare(self.db) worklist.append(Plan(root=select)) else: worklist.append(Plan(root=table)) while(len(worklist) > 1): combos = itertools.combinations(worklist,2) bestJoin = None sourcePair = None for pair in combos: op1 = pair[0].root op2 = pair[1].root selectExpr = self.createExpression(pair[0].relations(), pair[1].relations(), selectTablesDict) joinExpr = self.createExpression(pair[0].relations(), pair[1].relations(), joinTablesDict) join1BnljOp = Join(op1, op2, expr=joinExpr, method="block-nested-loops" ) join2BnljOp = Join(op2, op1, expr=joinExpr, method="block-nested-loops" ) join1NljOp = Join(op1, op2, expr=joinExpr, method="nested-loops" ) join2NljOp = Join(op2, op1, expr=joinExpr, method="nested-loops" ) if selectExpr == "True": full1BnljOp = join1BnljOp full2BnljOp = join2BnljOp full1NljOp = join1NljOp full2NljOp = join2NljOp else: full1BnljOp = Select(join1BnljOp, selectExpr) full2BnljOp = Select(join2BnljOp, selectExpr) full1NljOp = Select(join1NljOp, selectExpr) full2NljOp = Select(join2NljOp, selectExpr) joinList = [full1BnljOp, full2BnljOp, full1NljOp, full2NljOp] for j in joinList: joinplan = Plan(root=j) joinplan.prepare(self.db) joinplan.sample(100) if bestJoin == None or joinplan.cost(True) < bestJoin.cost(True): bestJoin = joinplan sourcePair = pair self.reportPlanCount += 4 self.clearSampleFiles() worklist.remove(sourcePair[0]) worklist.remove(sourcePair[1]) worklist.append(bestJoin) # after System R algorithm newPlan = worklist[0] if isGroupBy: newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \ aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \ aggExprs=plan.root.aggExprs, \ groupHashFn=plan.root.groupHashFn) newGroupBy.prepare(self.db) newPlan = Plan(root=newGroupBy) if set(outputSchema.schema()) != set(newPlan.schema().schema()): projectDict = {} for f, t in outputSchema.schema(): projectDict[f] = (f, t) currRoot = newPlan.root project = Project(currRoot, projectDict) project.prepare(self.db) newPlan = Plan(root=project) return newPlan
def pickJoinOrder(self, plan): relations = plan.relations() fieldDict = self.obtainFieldDict(plan) (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict) # makes dicts that maps a list of relations to exprs involving that list # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C # and on top of it the select exprs that involve 2 tables A,C or B,C isGroupBy = True if plan.root.operatorType() == "GroupBy" else False outputSchema = plan.schema() optDict = {} self.reportPlanCount = 0 for npass in range(1, len(relations) + 1): if npass == 1: for r in relations: table = TableScan(r,self.db.relationSchema(r)) if (r,) in selectTablesDict: selectExprs = selectTablesDict[(r,)] selectString = self.combineSelects(selectExprs) select = Select(table,selectString) optDict[(r,)] = Plan(root=select) else: optDict[(r,)] = Plan(root=table) self.reportPlanCount += 1 else: combinations = itertools.combinations(relations,npass) for c in combinations: fullList = sorted(c) clist = self.getCombos(fullList) bestJoin = None for subcombo in clist: complement = self.getComplement(fullList, subcombo) leftOps = optDict[tuple(complement)].root rightOps = optDict[tuple(subcombo)].root selectExpr = self.createExpression(complement, subcombo, selectTablesDict) joinExpr = self.createExpression(complement, subcombo, joinTablesDict) joinBnljOp = Join(leftOps, rightOps, expr=joinExpr, method="block-nested-loops" ) fullBnljOp = Select(joinBnljOp, selectExpr) if selectExpr == "True": joinBnlj = Plan(root=joinBnljOp) else: joinBnlj = Plan(root=fullBnljOp) joinBnlj.prepare(self.db) joinBnlj.sample(100) joinNljOp = Join(leftOps, rightOps, expr=joinExpr, method="nested-loops" ) fullNljOp = Select(joinNljOp, selectExpr) if selectExpr == "True": joinNlj = Plan(root=joinNljOp) else: joinNlj = Plan(root=fullNljOp) joinNlj.prepare(self.db) joinNlj.sample(100) if joinBnlj.cost(True) < joinNlj.cost(True): if bestJoin == None or joinBnlj.cost(True) < bestJoin.cost(True): bestJoin = joinBnlj else: if bestJoin == None or joinNlj.cost(True) < bestJoin.cost(True): bestJoin = joinNlj self.reportPlanCount += 2 self.clearSampleFiles() optDict[tuple(fullList)] = bestJoin # after System R algorithm newPlan = optDict[tuple(sorted(relations))] if isGroupBy: newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \ aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \ aggExprs=plan.root.aggExprs, \ groupHashFn=plan.root.groupHashFn) newGroupBy.prepare(self.db) newPlan = Plan(root=newGroupBy) if set(outputSchema.schema()) != set(newPlan.schema().schema()): projectDict = {} for f, t in outputSchema.schema(): projectDict[f] = (f, t) currRoot = newPlan.root project = Project(currRoot, projectDict) project.prepare(self.db) newPlan = Plan(root=project) return newPlan
def pickJoinOrder(self, plan): self.numPlansConsidered = 0 tableIds = list() joinOps = list() optPlans = dict() fields = dict() self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields) # Create worklist consisting of table IDs worklist = [str(tableId) for tableId in tableIds] # Now work our way down the 'worklist' greedily numTables = len(worklist) while numTables >= 2: # Choose the cheapest join that can be made over the remaining sub-plans minCost = None optPlan = None optLhsId = None optRhsId = None possibleJoinOrders = itertools.combinations(worklist, 2) for possibleJoinOrder in possibleJoinOrders: # Start examining each possible plan lhsId = possibleJoinOrder[0] rhsId = possibleJoinOrder[1] lhsOpt = optPlans[lhsId] if lhsId in optPlans else None rhsOpt = optPlans[rhsId] if rhsId in optPlans else None if lhsOpt is None or rhsOpt is None: continue # Skip irrelevant joins # This is to take care of multi-way joins added to worklist lhsIds = lhsId.split(",") rhsIds = rhsId.split(",") # Form a list of available attributes of this join allAttrs = list() for lId in lhsIds: allAttrs.extend(fields[int(lId)]) for rId in rhsIds: allAttrs.extend(fields[int(rId)]) currJoinExpr = None # Check whether any join expression can be satisfied with this join for join in joinOps: if join.joinExpr: joinAttrs = ExpressionInfo(join.joinExpr).getAttributes() if self.contains(allAttrs, joinAttrs): currJoinExpr = join.joinExpr break if currJoinExpr is None: continue # Skip irrelevant joins self.numPlansConsidered += 2 # Compare costs of different type of joins for joinMethod in ["nested-loops", "block-nested-loops"]: possiblePlan = Plan(root=Join(lhsPlan=lhsOpt, rhsPlan=rhsOpt, method=joinMethod, expr=currJoinExpr)) possiblePlan.prepare(self.db) possiblePlan.sample(1.0) # Sampling causes too much overhead! cost = self.getPlanCost(plan) cost = possiblePlan.cost(estimated=True) if cost is None else cost self.addPlanCost(plan, cost) if minCost is None or cost < minCost: minCost = cost optPlan = possiblePlan optLhsId = lhsId optRhsId = rhsId # Switch left and right and compare again for joinMethod in ["nested-loops", "block-nested-loops"]: possiblePlan = Plan(root=Join(lhsPlan=rhsOpt, rhsPlan=lhsOpt, method=joinMethod, expr=currJoinExpr)) possiblePlan.prepare(self.db) possiblePlan.sample(1.0) # Sampling causes too much overhead! cost = self.getPlanCost(plan) cost = possiblePlan.cost(estimated=True) if cost is None else cost self.addPlanCost(plan, cost) if minCost is None or cost < minCost: minCost = cost optPlan = possiblePlan optLhsId = rhsId optRhsId = lhsId if optPlan is not None: # Update optimal plan joinKey = optLhsId + "," + optRhsId optPlans[joinKey] = optPlan.root # Update worklist worklist.remove(optLhsId) worklist.remove(optRhsId) worklist.append(joinKey) numTables = numTables - 1 # Return single plan left in worklist return optPlans[worklist[0]]
def pickJoinOrder(self, plan): relations = plan.relations() fieldDict = self.obtainFieldDict(plan) (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict) # makes dicts that maps a list of relations to exprs involving that list # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C # and on top of it the select exprs that involve 2 tables A,C or B,C isGroupBy = True if plan.root.operatorType() == "GroupBy" else False outputSchema = plan.schema() self.reportPlanCount = 0 worklist = [] for r in relations: table = TableScan(r, self.db.relationSchema(r)) table.prepare(self.db) if (r, ) in selectTablesDict: selectExprs = selectTablesDict[(r, )] selectString = self.combineSelects(selectExprs) select = Select(table, selectString) select.prepare(self.db) worklist.append(Plan(root=select)) else: worklist.append(Plan(root=table)) while (len(worklist) > 1): combos = itertools.combinations(worklist, 2) bestJoin = None sourcePair = None for pair in combos: op1 = pair[0].root op2 = pair[1].root selectExpr = self.createExpression(pair[0].relations(), pair[1].relations(), selectTablesDict) joinExpr = self.createExpression(pair[0].relations(), pair[1].relations(), joinTablesDict) join1BnljOp = Join(op1, op2, expr=joinExpr, method="block-nested-loops") join2BnljOp = Join(op2, op1, expr=joinExpr, method="block-nested-loops") join1NljOp = Join(op1, op2, expr=joinExpr, method="nested-loops") join2NljOp = Join(op2, op1, expr=joinExpr, method="nested-loops") if selectExpr == "True": full1BnljOp = join1BnljOp full2BnljOp = join2BnljOp full1NljOp = join1NljOp full2NljOp = join2NljOp else: full1BnljOp = Select(join1BnljOp, selectExpr) full2BnljOp = Select(join2BnljOp, selectExpr) full1NljOp = Select(join1NljOp, selectExpr) full2NljOp = Select(join2NljOp, selectExpr) joinList = [full1BnljOp, full2BnljOp, full1NljOp, full2NljOp] for j in joinList: joinplan = Plan(root=j) joinplan.prepare(self.db) joinplan.sample(100) if bestJoin == None or joinplan.cost(True) < bestJoin.cost( True): bestJoin = joinplan sourcePair = pair self.reportPlanCount += 4 self.clearSampleFiles() worklist.remove(sourcePair[0]) worklist.remove(sourcePair[1]) worklist.append(bestJoin) # after System R algorithm newPlan = worklist[0] if isGroupBy: newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \ aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \ aggExprs=plan.root.aggExprs, \ groupHashFn=plan.root.groupHashFn) newGroupBy.prepare(self.db) newPlan = Plan(root=newGroupBy) if set(outputSchema.schema()) != set(newPlan.schema().schema()): projectDict = {} for f, t in outputSchema.schema(): projectDict[f] = (f, t) currRoot = newPlan.root project = Project(currRoot, projectDict) project.prepare(self.db) newPlan = Plan(root=project) return newPlan
def pickJoinOrder(self, plan): relations = plan.relations() fieldDict = self.obtainFieldDict(plan) (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict) # makes dicts that maps a list of relations to exprs involving that list # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C # and on top of it the select exprs that involve 2 tables A,C or B,C isGroupBy = True if plan.root.operatorType() == "GroupBy" else False outputSchema = plan.schema() optDict = {} self.reportPlanCount = 0 for npass in range(1, len(relations) + 1): if npass == 1: for r in relations: table = TableScan(r, self.db.relationSchema(r)) if (r, ) in selectTablesDict: selectExprs = selectTablesDict[(r, )] selectString = self.combineSelects(selectExprs) select = Select(table, selectString) optDict[(r, )] = Plan(root=select) else: optDict[(r, )] = Plan(root=table) self.reportPlanCount += 1 else: combinations = itertools.combinations(relations, npass) for c in combinations: fullList = sorted(c) clist = self.getCombos(fullList) bestJoin = None for subcombo in clist: complement = self.getComplement(fullList, subcombo) leftOps = optDict[tuple(complement)].root rightOps = optDict[tuple(subcombo)].root selectExpr = self.createExpression( complement, subcombo, selectTablesDict) joinExpr = self.createExpression( complement, subcombo, joinTablesDict) joinBnljOp = Join(leftOps, rightOps, expr=joinExpr, method="block-nested-loops") fullBnljOp = Select(joinBnljOp, selectExpr) if selectExpr == "True": joinBnlj = Plan(root=joinBnljOp) else: joinBnlj = Plan(root=fullBnljOp) joinBnlj.prepare(self.db) joinBnlj.sample(100) joinNljOp = Join(leftOps, rightOps, expr=joinExpr, method="nested-loops") fullNljOp = Select(joinNljOp, selectExpr) if selectExpr == "True": joinNlj = Plan(root=joinNljOp) else: joinNlj = Plan(root=fullNljOp) joinNlj.prepare(self.db) joinNlj.sample(100) if joinBnlj.cost(True) < joinNlj.cost(True): if bestJoin == None or joinBnlj.cost( True) < bestJoin.cost(True): bestJoin = joinBnlj else: if bestJoin == None or joinNlj.cost( True) < bestJoin.cost(True): bestJoin = joinNlj self.reportPlanCount += 2 self.clearSampleFiles() optDict[tuple(fullList)] = bestJoin # after System R algorithm newPlan = optDict[tuple(sorted(relations))] if isGroupBy: newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema, \ aggSchema=plan.root.aggSchema, groupExpr=plan.root.groupExpr, \ aggExprs=plan.root.aggExprs, \ groupHashFn=plan.root.groupHashFn) newGroupBy.prepare(self.db) newPlan = Plan(root=newGroupBy) if set(outputSchema.schema()) != set(newPlan.schema().schema()): projectDict = {} for f, t in outputSchema.schema(): projectDict[f] = (f, t) currRoot = newPlan.root project = Project(currRoot, projectDict) project.prepare(self.db) newPlan = Plan(root=project) return newPlan
def pickJoinOrder(self, plan): # Some restrictions apply: # 1. Cannot involve hash-joins or index-joins # 2. Only join operations beyond certain point (i.e. cannot have join -> aggregation -> join, etc.) tableIds = list() joinOps = list() optPlans = dict() fields = dict() firstOpWithJoins = self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields) if len(joinOps) == 0: return plan numTables = 2 while numTables <= len(tableIds): possibleJoinOrders = itertools.combinations(tableIds, numTables) for possibleJoinOrder in possibleJoinOrders: minCost = None optPlan = None for tableId in possibleJoinOrder: # Left-deep-only optimizer (i.e. rhs operand is a base relation) lhsIds = list(possibleJoinOrder) lhsIds.remove(tableId) lhsJoinKey = self.getJoinKey(lhsIds) rhsJoinKey = str(tableId) lhsOpt = optPlans[lhsJoinKey] if lhsJoinKey in optPlans else None rhsOpt = optPlans[rhsJoinKey] if rhsJoinKey in optPlans else None if lhsOpt is None or rhsOpt is None: continue # Skip irrelevant joins # Form a list of available attributes of this join allAttrs = list() for lhsId in lhsIds: allAttrs.extend(fields[lhsId]) allAttrs.extend(fields[tableId]) currJoinExpr = None # Check whether any join expression can be satisfied with this join for join in joinOps: if join.joinExpr: joinAttrs = ExpressionInfo(join.joinExpr).getAttributes() if self.contains(allAttrs, joinAttrs): currJoinExpr = join.joinExpr break else: # Should not involve hash-joins or index-joins (limitation) return plan if currJoinExpr is None: continue # Irrelevant join for joinMethod in ["nested-loops", "block-nested-loops"]: possiblePlan = Plan(root=Join(lhsPlan=lhsOpt, rhsPlan=rhsOpt, method=joinMethod, expr=currJoinExpr)) possiblePlan.prepare(self.db) possiblePlan.sample(1.0) # Sampling causes too much overhead! cost = self.getPlanCost(plan) cost = possiblePlan.cost(estimated=True) if cost is None else cost self.addPlanCost(plan, cost) if minCost is None or cost < minCost: minCost = cost optPlan = possiblePlan optPlans[self.getJoinKey(possibleJoinOrder)] = None if optPlan is None else optPlan.root numTables = numTables + 1 firstOpWithJoins.subPlan = optPlans[self.getJoinKey(tableIds)] plan.prepare(self.db) return plan