def pickJoinOrder(self, plan):
    """Choose a join order by dynamic programming over sets of relations.

    A prepared TableScan is created for every relation in the plan. Then, for
    every relation set of size 2..n produced by self.kRelsComb, the method
    tries to extend an already-stored plan for `left` with a plan for `right`
    using a block-nested-loops Join, and keeps the candidate with the lowest
    estimated cost for that set.

    Side effects: resets and updates self.combsTried and self.plansProcessed,
    and records costs through self.addPlanCost.

    Returns a Plan whose root is the plan stored for the full relation set.
    """
    rels = set(plan.relations())
    optPlans = {}  #Map a set of relations to the optimized plan
    #toBeProcessed = []  #Set of relations pending processing
    self.combsTried = 0
    self.plansProcessed = 0
    # Base case: every single relation is accessed through a prepared scan.
    for r in rels:
        set_r = frozenset({r})
        #toBeProcessed.append(set_r)
        newScan = TableScan(r, self.db.relationSchema(r))
        newScan.prepare(self.db)
        optPlans[set_r] = newScan
    #For each join operator, fetch its relative relations
    #Map a set of relations to (relative relations, operator)
    # Keys are singleton frozensets; each value is a list of
    # (relativeR, join operator) pairs that mention that relation.
    joinMap = {}
    for (_, op) in plan.flatten():
        if isinstance(op, Join):
            relativeR = self.relativeRelations(rels, op)
            for r in [frozenset({r}) for r in relativeR]:
                if r in joinMap.keys():
                    joinMap[r].append((relativeR, op))
                else:
                    joinMap[r] = [(relativeR, op)]
    n = len(rels)
    for i in range(2, n + 1):
        for union in [frozenset(union) for union in self.kRelsComb(i, rels)]:
            # presumably kRelsComb(1, union) yields the single-relation
            # subsets of `union` -- TODO confirm against kRelsComb
            for right in [frozenset(right) for right in self.kRelsComb(1, union)]:
                left = frozenset(union - right)
                for t in left:
                    self.combsTried += 1
                    # NOTE(review): plain indexing raises KeyError when `t`
                    # has no entry in joinMap; the emptiness test below only
                    # catches an empty list, which the code above never stores.
                    value = joinMap[frozenset({t})]
                    if not value:
                        continue
                    else:
                        # NOTE(review): the loop variable shadows the builtin `tuple`.
                        for tuple in value:
                            # Skip joins that reference relations outside `union`
                            # or whose inputs have no stored plan yet.
                            if not (set(tuple[0]).issubset(union) and left in optPlans and right in optPlans):
                                continue
                            self.plansProcessed += 1
                            newJoin = Join(optPlans[left], optPlans[right], expr=tuple[1].joinExpr, method="block-nested-loops")
                            newJoin.prepare(self.db)
                            if not union in optPlans:
                                # First candidate for this relation set.
                                optPlans[union] = newJoin
                                self.addPlanCost(newJoin, newJoin.cost(estimated=True))
                            else:
                                # Replace the stored plan only when strictly cheaper.
                                formerCost = self.getPlanCost(optPlans[union])
                                if newJoin.cost(estimated=True) < formerCost:
                                    optPlans[union] = newJoin
                                    self.addPlanCost(newJoin, newJoin.cost(estimated=True))
    newRoot = optPlans[frozenset(rels)]
    return Plan(root=newRoot)
'''
def pickJoinOrder(self, plan):
    """Greedily pick a join order, growing one plan a relation set at a time.

    A prepared TableScan is created for every relation and stored in
    self.tableScans (keyed by a singleton frozenset). self.joinMap maps each
    singleton relation set to the (relativeR, join operator) pairs that
    mention it. For size 2 every table scan is offered to self.processJoin
    as a seed and the cheapest result is kept; for larger sizes only the
    previous round's best plan is extended.

    Side effects: resets self.combsTried and self.plansProcessed and
    overwrites self.rels, self.tableScans and self.joinMap.

    Returns a Plan rooted at the last round's best plan. When the plan has a
    single relation no join round runs, so the lone table scan becomes the
    root (previously the root was None in that case).
    """
    self.combsTried = 0
    self.plansProcessed = 0
    self.rels = set(plan.relations())

    # Base access path for every relation.
    self.tableScans = {}
    for rel in self.rels:
        scan = TableScan(rel, self.db.relationSchema(rel))
        scan.prepare(self.db)
        self.tableScans[frozenset({rel})] = scan

    # Index the join operators of the input plan by each relation they mention.
    self.joinMap = {}
    for (_, op) in plan.flatten():
        if isinstance(op, Join):
            relativeR = self.relativeRelations(self.rels, op)
            for rel in relativeR:
                self.joinMap.setdefault(frozenset({rel}), []).append((relativeR, op))

    bestPlan = None
    bestRels = None
    for size in range(2, len(self.rels) + 1):
        if size == 2:
            # First round: any single relation may seed the join.
            seedKeys = [frozenset({rel}) for rel in self.rels]
            seeds = ((self.tableScans[key], key) for key in seedKeys)
        else:
            # Later rounds: only the previous winner is extended.
            seeds = [(bestPlan, bestRels)]

        roundCost = float('inf')
        roundPlan = None
        roundRels = None
        for seedPlan, seedRels in seeds:
            (newCost, newJoin, newRels) = self.processJoin(seedPlan, seedRels)
            if newCost < roundCost:
                roundCost = newCost
                roundPlan = newJoin
                roundRels = newRels
        bestPlan = roundPlan
        bestRels = roundRels

    if bestPlan is None and len(self.tableScans) == 1:
        # Nothing to join: the optimal plan is the single table scan.
        (bestPlan,) = self.tableScans.values()

    return Plan(root=bestPlan)
def fromTable(self, relId):
    """Start a plan builder with a scan of relation `relId`.

    Returns a PlanBuilder wrapping a TableScan over the relation, using the
    schema reported by self.database. Returns None when self.database is
    falsy.
    """
    if not self.database:
        return None

    relSchema = self.database.relationSchema(relId)
    scanOp = TableScan(relId, relSchema)
    return PlanBuilder(operator=scanOp)
def pickJoinOrder(self, plan):
    """Pick a join order greedily by repeatedly merging the cheapest pair.

    The worklist starts with one prepared plan per relation (a TableScan,
    wrapped in a Select when single-relation predicates exist for it). While
    more than one plan remains, every pair of worklist plans is tried in both
    operand orders with block-nested-loops and nested-loops joins; the
    candidate with the lowest cost(True) replaces its two source plans.

    If the input plan's root is a GroupBy it is re-applied on top of the
    result, and a Project is added when the resulting schema differs from
    the input plan's schema.

    Side effects: resets self.reportPlanCount and adds 4 per pair examined;
    calls self.clearSampleFiles() after each pair.

    Returns the new Plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # The dicts map a tuple of relations to the exprs involving those
    # relations: join exprs in one, select exprs in the other. A merged plan
    # gets the join exprs spanning its two sides, with the select exprs that
    # span both sides applied on top.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    self.reportPlanCount = 0

    # One access plan per base relation, with its local selections pushed down.
    worklist = []
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        table.prepare(self.db)
        if (r,) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r,)])
            select = Select(table, selectString)
            select.prepare(self.db)
            worklist.append(Plan(root=select))
        else:
            worklist.append(Plan(root=table))

    while len(worklist) > 1:
        bestJoin = None
        sourcePair = None
        for pair in itertools.combinations(worklist, 2):
            op1 = pair[0].root
            op2 = pair[1].root
            selectExpr = self.createExpression(pair[0].relations(), pair[1].relations(), selectTablesDict)
            joinExpr = self.createExpression(pair[0].relations(), pair[1].relations(), joinTablesDict)

            # Candidate order matters for ties (first strictly-cheapest wins):
            # BNLJ(op1, op2), BNLJ(op2, op1), NLJ(op1, op2), NLJ(op2, op1).
            joinOps = [
                Join(lhs, rhs, expr=joinExpr, method=method)
                for method in ("block-nested-loops", "nested-loops")
                for (lhs, rhs) in ((op1, op2), (op2, op1))
            ]
            if selectExpr == "True":
                # No cross-side selection to apply.
                joinList = joinOps
            else:
                joinList = [Select(joinOp, selectExpr) for joinOp in joinOps]

            for j in joinList:
                joinplan = Plan(root=j)
                joinplan.prepare(self.db)
                joinplan.sample(100)
                if bestJoin is None or joinplan.cost(True) < bestJoin.cost(True):
                    bestJoin = joinplan
                    sourcePair = pair
            self.reportPlanCount += 4
            self.clearSampleFiles()

        # Replace the two merged inputs with their join.
        worklist.remove(sourcePair[0])
        worklist.remove(sourcePair[1])
        worklist.append(bestJoin)

    # after System R algorithm
    newPlan = worklist[0]
    if isGroupBy:
        newGroupBy = GroupBy(
            newPlan.root,
            groupSchema=plan.root.groupSchema,
            aggSchema=plan.root.aggSchema,
            groupExpr=plan.root.groupExpr,
            aggExprs=plan.root.aggExprs,
            groupHashFn=plan.root.groupHashFn,
        )
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        # Project back to the fields of the original plan.
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def hashJoin(self):
    """Partition both inputs, then join matching partitions via a hash table.

    Both input plans are partitioned page by page through
    self.buildPartitionL / self.buildPartitionR. For every partition key
    present on both sides, the two partitions are scanned with TableScan,
    left tuples are grouped in `hasher` by their first key field, and each
    right tuple is probed against it. Every match is packed with
    self.joinSchema and inserted into this operator's relation file.

    Side effects: may set self.joinExpr, overwrites self.lhsPlan and
    self.rhsPlan, prints debugging output, and writes output tuples to
    storage. The visible code has no return statement.

    NOTE(review): `pageBlock` and `hasher` are read below but are never
    assigned in this function as shown -- confirm where they come from.
    """
    # Default to an equality predicate on the first field of each key schema.
    if self.joinExpr == None:
        self.joinExpr = self.lhsKeySchema.fields[0] + "==" + self.rhsKeySchema.fields[0];
    # NOTE(review): these attributes are set to empty lists, but the
    # partitioning below fills the local dicts of the same name instead.
    self.tmpFilesL = list();
    self.tmpFilesR = list();
    bufPool = self.storage.bufferPool;
    self.logger("start...");
    self.cleanBufferPool(bufPool);
    # Partition bookkeeping, keyed by partition key; the second element of
    # each value is later used as a relation id.
    tmpFilesL = dict();
    tmpFilesR = dict();
    self.logger("building L partition");
    for (PageId, Page) in iter(self.lhsPlan):
        self.buildPartitionL(PageId, Page, tmpFilesL);
    self.logger("building R partition");
    for (PageId, Page) in iter(self.rhsPlan):
        self.buildPartitionR(PageId, Page, tmpFilesR);
    # Schema prep
    lSchema = self.inputSchemas()[0];
    rSchema = self.inputSchemas()[1];
    for relIdLKey in tmpFilesL.keys():
        # Clean up before running.
        # Only partitions that exist on both sides can produce output.
        if relIdLKey in tmpFilesR:
            (_, relIdTmpR) = tmpFilesR[ relIdLKey ];
            (_, relIdTmpL) = tmpFilesL[ relIdLKey ];
        else:
            continue;
        self.cleanBufferPool( bufPool );
        # Scan the two partition relations with the original input schemas.
        lhsPlan = TableScan(relIdTmpL, self.inputSchemas()[0]);
        rhsPlan = TableScan(relIdTmpR, self.inputSchemas()[1]);
        lhsPlan.storage = self.storage;
        rhsPlan.storage = self.storage;
        self.lhsPlan = lhsPlan;
        self.rhsPlan = rhsPlan;
        # Build phase: group packed left tuples by their first key field.
        # NOTE(review): `pageBlock` and `hasher` are not defined at this point
        # in the visible code.
        for lPageId in pageBlock:
            lhsPage = bufPool.getPage(lPageId);
            for ltuple in iter( lhsPage ):
                tupleObj = lSchema.unpack( ltuple );
                key = lSchema.project( tupleObj, self.lhsKeySchema )[0];
                if key in hasher:
                    hasher[ key ].append( ltuple );
                else:
                    hasher[ key ] = [ ltuple ];
        # iterating all rtuples to pack output
        # Probe phase: look every right tuple up in the hash table.
        for (rPageId, rhsPage) in iter(rhsPlan):
            print( rPageId.pageIndex );
            for rTuple in iter( rhsPage ):
                tupleObj = rSchema.unpack( rTuple );
                print( tupleObj );
                key = rSchema.project( tupleObj, self.rhsKeySchema )[0];
                if key in hasher:
                    for lTuple in hasher[ key ]:
                        # Merge the field values of both sides, then emit them
                        # in joinSchema field order.
                        joinIns = self.loadSchema( lSchema, lTuple )
                        joinIns.update( self.loadSchema( rSchema, rTuple ) );
                        outputTuple = self.joinSchema.instantiate(*[joinIns[f] for f in self.joinSchema.fields]);
                        print( outputTuple );
                        outputTupleP = self.joinSchema.pack(outputTuple);
                        self.storage.fileMgr.relationFile(self.relationId())[1].insertTuple(outputTupleP);
        # Release the left pages used for the build phase.
        for lPageId in pageBlock:
            bufPool.unpinPage(lPageId);
            bufPool.discardPage(lPageId);
        self.cleanBufferPool(bufPool);
        # NOTE(review): after this `del`, a later iteration would need
        # `hasher` to be rebound before the build phase -- confirm.
        del hasher;
def pickJoinOrder(self, plan):
    """Pick a join order greedily by repeatedly merging the cheapest pair.

    The worklist starts with one prepared plan per relation (a TableScan,
    wrapped in a Select when single-relation predicates exist for it). While
    more than one plan remains, every pair of worklist plans is tried in both
    operand orders with block-nested-loops and nested-loops joins; the
    candidate with the lowest cost(True) replaces its two source plans.

    If the input plan's root is a GroupBy it is re-applied on top of the
    result, and a Project is added when the resulting schema differs from
    the input plan's schema.

    Side effects: resets self.reportPlanCount and adds 4 per pair examined;
    calls self.clearSampleFiles() after each pair.

    Returns the new Plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # The dicts map a tuple of relations to the exprs involving those
    # relations: join exprs in one, select exprs in the other. A merged plan
    # gets the join exprs spanning its two sides, with the select exprs that
    # span both sides applied on top.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    self.reportPlanCount = 0

    # One access plan per base relation, with its local selections pushed down.
    worklist = []
    for r in relations:
        table = TableScan(r, self.db.relationSchema(r))
        table.prepare(self.db)
        if (r, ) in selectTablesDict:
            selectString = self.combineSelects(selectTablesDict[(r, )])
            select = Select(table, selectString)
            select.prepare(self.db)
            worklist.append(Plan(root=select))
        else:
            worklist.append(Plan(root=table))

    joinMethods = ("block-nested-loops", "nested-loops")
    while len(worklist) > 1:
        bestJoin = None
        sourcePair = None
        for pair in itertools.combinations(worklist, 2):
            op1 = pair[0].root
            op2 = pair[1].root
            selectExpr = self.createExpression(pair[0].relations(), pair[1].relations(), selectTablesDict)
            joinExpr = self.createExpression(pair[0].relations(), pair[1].relations(), joinTablesDict)

            # Candidate order matters for ties (first strictly-cheapest wins):
            # BNLJ(op1, op2), BNLJ(op2, op1), NLJ(op1, op2), NLJ(op2, op1).
            joinOps = []
            for method in joinMethods:
                joinOps.append(Join(op1, op2, expr=joinExpr, method=method))
                joinOps.append(Join(op2, op1, expr=joinExpr, method=method))

            if selectExpr == "True":
                # No cross-side selection to apply.
                joinList = joinOps
            else:
                joinList = [Select(joinOp, selectExpr) for joinOp in joinOps]

            for j in joinList:
                joinplan = Plan(root=j)
                joinplan.prepare(self.db)
                joinplan.sample(100)
                if bestJoin is None or joinplan.cost(True) < bestJoin.cost(True):
                    bestJoin = joinplan
                    sourcePair = pair
            self.reportPlanCount += 4
            self.clearSampleFiles()

        # Replace the two merged inputs with their join.
        worklist.remove(sourcePair[0])
        worklist.remove(sourcePair[1])
        worklist.append(bestJoin)

    # after System R algorithm
    newPlan = worklist[0]
    if isGroupBy:
        newGroupBy = GroupBy(
            newPlan.root,
            groupSchema=plan.root.groupSchema,
            aggSchema=plan.root.aggSchema,
            groupExpr=plan.root.groupExpr,
            aggExprs=plan.root.aggExprs,
            groupHashFn=plan.root.groupHashFn,
        )
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        # Project back to the fields of the original plan.
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan
def pickJoinOrder(self, plan):
    """Pick a join order by dynamic programming over relation combinations.

    Pass 1 stores one plan per relation in optDict under the key (r,): a
    TableScan, wrapped in a Select when single-relation predicates exist for
    it. Pass k (k >= 2) visits every k-combination of relations; for each
    split returned by self.getCombos it joins the stored plans of the two
    sides with block-nested-loops and nested-loops, and stores the candidate
    with the lowest cost(True) under the sorted tuple of the combination.

    If the input plan's root is a GroupBy it is re-applied on top of the
    result, and a Project is added when the resulting schema differs from
    the input plan's schema.

    Side effects: resets self.reportPlanCount, adding 1 per base relation
    and 2 per split examined; calls self.clearSampleFiles() after each split.

    Returns the new Plan.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    # The dicts map a tuple of relations to the exprs involving those
    # relations: join exprs in one, select exprs in the other. A joined plan
    # gets the join exprs spanning its two sides, with the select exprs that
    # span both sides applied on top.
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    optDict = {}
    self.reportPlanCount = 0

    for npass in range(1, len(relations) + 1):
        if npass == 1:
            # Base case: one access plan per relation.
            for r in relations:
                table = TableScan(r, self.db.relationSchema(r))
                if (r, ) in selectTablesDict:
                    selectString = self.combineSelects(selectTablesDict[(r, )])
                    optDict[(r, )] = Plan(root=Select(table, selectString))
                else:
                    optDict[(r, )] = Plan(root=table)
                self.reportPlanCount += 1
            continue

        for c in itertools.combinations(relations, npass):
            fullList = sorted(c)
            clist = self.getCombos(fullList)
            bestJoin = None
            for subcombo in clist:
                complement = self.getComplement(fullList, subcombo)
                leftOps = optDict[tuple(complement)].root
                rightOps = optDict[tuple(subcombo)].root
                selectExpr = self.createExpression(complement, subcombo, selectTablesDict)
                joinExpr = self.createExpression(complement, subcombo, joinTablesDict)

                # Build, prepare and sample one candidate per join method.
                # The Select wrapper is only created when there is a
                # cross-side predicate to apply.
                candidates = []
                for method in ("block-nested-loops", "nested-loops"):
                    joinOp = Join(leftOps, rightOps, expr=joinExpr, method=method)
                    if selectExpr == "True":
                        candidate = Plan(root=joinOp)
                    else:
                        candidate = Plan(root=Select(joinOp, selectExpr))
                    candidate.prepare(self.db)
                    candidate.sample(100)
                    candidates.append(candidate)
                (joinBnlj, joinNlj) = candidates

                # On a tie the nested-loops candidate is chosen, as before.
                if joinBnlj.cost(True) < joinNlj.cost(True):
                    cheaper = joinBnlj
                else:
                    cheaper = joinNlj
                if bestJoin is None or cheaper.cost(True) < bestJoin.cost(True):
                    bestJoin = cheaper

                self.reportPlanCount += 2
                self.clearSampleFiles()
            optDict[tuple(fullList)] = bestJoin

    # after System R algorithm
    newPlan = optDict[tuple(sorted(relations))]
    if isGroupBy:
        newGroupBy = GroupBy(
            newPlan.root,
            groupSchema=plan.root.groupSchema,
            aggSchema=plan.root.aggSchema,
            groupExpr=plan.root.groupExpr,
            aggExprs=plan.root.aggExprs,
            groupHashFn=plan.root.groupHashFn,
        )
        newGroupBy.prepare(self.db)
        newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
        # Project back to the fields of the original plan.
        projectDict = {f: (f, t) for f, t in outputSchema.schema()}
        project = Project(newPlan.root, projectDict)
        project.prepare(self.db)
        newPlan = Plan(root=project)

    return newPlan