コード例 #1
0
ファイル: Optimizer.py プロジェクト: Brinews/jacky
	def pickJoinOrder(self, plan):
		"""Pick a join order for `plan` with a dynamic-programming table.

		The table maps each frozenset of base relations to the best operator
		found for exactly that subset. Subsets are filled in by increasing
		size; each candidate for a subset pairs the table entry for the
		subset minus one relation (left) with that relation's own entry
		(right), and `self.get_best_join` picks among the candidates.

		Returns a new Plan rooted at the entry for the full relation set,
		already prepared against `self.db`. The `plan` argument itself is
		not modified here.
		"""
		# Extract all base relations, along with any unary operators immediately above.
		base_relations = set(plan.root.inputs())

		# Extract all joins in original plan, they serve as the set of joins actually necessary.
		joins = plan.root.joins()

		# Define the dynamic programming table.
		optimal_plans = {}

		# Establish optimal access paths.
		for relation in base_relations:
			optimal_plans[frozenset((relation,))] = relation

		# Fill in the table.
		for i in range(2, len(base_relations) + 1):
			for subset in itertools.combinations(base_relations, i):

				# Build the set of candidate joins.
				candidate_joins = set()
				for candidate_relation in subset:
					candidate_joins.add((
						optimal_plans[frozenset(tuple_without(subset, candidate_relation))],
						optimal_plans[frozenset((candidate_relation,))]
					))

				# Find the best of the candidate joins.
				optimal_plans[frozenset(subset)] = self.get_best_join(candidate_joins, joins)

		# Reconstruct the best plan, prepare and return.
		# (The original also rebound the local name `plan` here, which had no
		# effect on the caller and has been dropped.)
		final_plan = Plan(root = optimal_plans[frozenset(base_relations)])
		final_plan.prepare(self.db)
		return final_plan
コード例 #2
0
  def pickJoinOrder(self, plan):
    """Build a join order for `plan` bottom-up over subsets of its sources.

    A table keyed by frozenset of sources holds the best operator found for
    each subset. Single sources map to themselves; larger subsets are
    resolved by handing `self.get_best_join` every (subset-minus-one,
    removed-source) pairing together with the plan's original joins.

    Returns a new Plan rooted at the entry for all sources, prepared against
    `self.db`.
    """
    sources = set(plan.sources)
    required = set(plan.joins)

    # Seed the table with the access path of every individual source.
    table = {frozenset((source,)): source for source in sources}

    for size in range(2, len(sources) + 1):
      for combo in itertools.combinations(sources, size):
        # One candidate per source that could be joined in last.
        pairings = {
          (
            table[frozenset(tuple_without(combo, last))],
            table[frozenset((last,))]
          )
          for last in combo
        }
        table[frozenset(combo)] = self.get_best_join(pairings, required)

    # The entry covering every source is the overall winner.
    best = Plan(root = table[frozenset(sources)])
    best.prepare(self.db)
    return best
コード例 #3
0
  def pickJoinOrder(self, plan):
    """Choose a join order for `plan` by dynamic programming over relation sets.

    Every relation starts as a prepared TableScan. For each larger set of
    relations (as produced by `self.kRelsComb`), each single relation is tried
    as the right input against the best plan for the rest, using the join
    expressions of the original plan's joins whose relations all fall inside
    the set. Joins are built as block-nested-loops and compared by estimated
    cost.

    Side effects: resets and updates `self.combsTried` and
    `self.plansProcessed`, and records costs through `self.addPlanCost`.

    Returns a new Plan rooted at the best operator covering all relations.
    Raises KeyError if no plan covering all relations could be built.
    """
    rels = set(plan.relations())
    optPlans = {}  # frozenset of relations -> best operator found so far

    self.combsTried = 0
    self.plansProcessed = 0

    # Base case: one prepared table scan per relation.
    for r in rels:
      newScan = TableScan(r, self.db.relationSchema(r))
      newScan.prepare(self.db)
      optPlans[frozenset({r})] = newScan

    # Index the original joins by each relation they involve:
    #   frozenset({relation}) -> [(relations of the join, join operator), ...]
    joinMap = {}
    for (_, op) in plan.flatten():
      if isinstance(op, Join):
        relativeR = self.relativeRelations(rels, op)
        for key in [frozenset({r}) for r in relativeR]:
          joinMap.setdefault(key, []).append((relativeR, op))

    n = len(rels)
    for i in range(2, n + 1):
      for union in [frozenset(u) for u in self.kRelsComb(i, rels)]:
        for right in [frozenset(x) for x in self.kRelsComb(1, union)]:
          left = frozenset(union - right)
          for t in left:
            self.combsTried += 1

            # A relation that takes part in no join has no joinMap entry;
            # treat that as "no candidate joins" instead of failing with
            # KeyError (the original's `if not value` check shows this intent).
            for (joinRels, joinOp) in joinMap.get(frozenset({t}), ()):
              if not (set(joinRels).issubset(union) and left in optPlans and right in optPlans):
                continue

              self.plansProcessed += 1
              newJoin = Join(optPlans[left], optPlans[right], expr=joinOp.joinExpr, method="block-nested-loops")
              newJoin.prepare(self.db)
              newCost = newJoin.cost(estimated=True)

              # Keep the first plan seen for this set, or a strictly cheaper one.
              if union not in optPlans or newCost < self.getPlanCost(optPlans[union]):
                optPlans[union] = newJoin
                self.addPlanCost(newJoin, newCost)

    newRoot = optPlans[frozenset(rels)]
    return Plan(root=newRoot)

    '''
コード例 #4
0
ファイル: Optimizer.py プロジェクト: yliu120/dbsystem
    def allAccessPaths(self, operator):
        """Collect the join inputs under `operator` that are unary paths.

        Flattens the plan rooted at `operator`, takes both inputs of every
        operator whose type name ends in "Join", and returns those inputs for
        which `self.isUnaryPath` is true, in traversal order (left input
        before right input for each join).
        """
        flattened = Plan(root=operator).flatten()
        joinOps = [op for (_, op) in flattened
                   if op.operatorType()[-4:] == "Join"]

        inputs = []
        for joinOp in joinOps:
            inputs.extend((joinOp.lhsPlan, joinOp.rhsPlan))

        return [path for path in inputs if self.isUnaryPath(path)]
コード例 #5
0
    def pickJoinOrder(self, plan):
        """Greedily grow a join plan one `self.processJoin` step per round.

        Sets up `self.rels`, `self.tableScans` (one prepared TableScan per
        relation) and `self.joinMap` (joins of the original plan indexed by
        each relation they involve), and resets the `combsTried` /
        `plansProcessed` counters.

        Round 2 calls `self.processJoin` once per table scan and keeps the
        cheapest result; every later round calls it once on the previous
        round's winner. `processJoin` is expected to return a
        (cost, join operator, relations) triple.

        Returns a Plan rooted at the last round's winner (root is None when
        there are fewer than two relations, since no round runs).
        """
        self.combsTried = 0
        self.plansProcessed = 0

        self.rels = set(plan.relations())

        # One prepared scan per base relation, keyed by singleton frozenset.
        self.tableScans = {}
        for rel in self.rels:
            scan = TableScan(rel, self.db.relationSchema(rel))
            scan.prepare(self.db)
            self.tableScans[frozenset({rel})] = scan

        # frozenset({relation}) -> [(relations of the join, join operator), ...]
        self.joinMap = {}
        for (_, op) in plan.flatten():
            if not isinstance(op, Join):
                continue
            involved = self.relativeRelations(self.rels, op)
            for key in [frozenset({r}) for r in involved]:
                if key in self.joinMap:
                    self.joinMap[key].append((involved, op))
                else:
                    self.joinMap[key] = [(involved, op)]

        previousPlan = None
        previousRels = None

        for size in range(2, len(self.rels) + 1):
            if size == 2:
                # Try every single relation as the starting point.
                seedKeys = [frozenset({rel}) for rel in self.rels]
                seeds = ((self.tableScans[key], key) for key in seedKeys)
            else:
                # Extend only the winner of the previous round.
                seeds = [(previousPlan, previousRels)]

            lowestCost = float('inf')
            roundPlan = None
            roundRels = None
            for (seedPlan, seedRels) in seeds:
                (cost, joined, joinedRels) = self.processJoin(seedPlan, seedRels)
                if cost < lowestCost:
                    roundRels = joinedRels
                    roundPlan = joined
                    lowestCost = cost

            previousPlan = roundPlan
            previousRels = roundRels

        return Plan(root=previousPlan)
コード例 #6
0
  def get_best_join(self, candidates, required_joins):
    """Return the root operator of the cheapest join plan among `candidates`.

    Each candidate is a (left, right) operator pair. For every pair a join is
    built once per algorithm tried, prepared against `self.db`, sampled, and
    its estimated cost compared with the running minimum; the first plan seen
    wins ties.
    """
    cheapest = None
    cheapest_cost = None

    for lhs, rhs in candidates:
      # Start from a cartesian product; replaced by the expression of the
      # first required join that passes the check below.
      # NOTE(review): the check only inspects each required join's own schemas
      # and expression -- it never consults `lhs` or `rhs`. Confirm intent.
      predicate = 'True'
      for required in required_joins:
        attrs = ExpressionInfo(required.joinExpr).getAttributes()
        if not (set(required.rhsSchema.fields).intersection(attrs) and set(required.lhsSchema.fields).intersection(attrs)):
          continue
        predicate = required.joinExpr
        break

      # Cost the candidate under each join algorithm.
      # TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods.
      for method in ("nested-loops", "block-nested-loops"):
        join_op = Join(
          lhsPlan = lhs,
          rhsPlan = rhs,
          method = method,
          expr = predicate
        )
        trial = Plan(root = join_op)

        # Prepare and run the plan in sampling mode, then read the estimate.
        trial.prepare(self.db)
        trial.sample(1.0)
        trial_cost = trial.cost(estimated = True)

        # Skip anything that is not strictly better than the current best.
        if cheapest_cost is not None and not (trial_cost < cheapest_cost):
          continue
        cheapest_cost = trial_cost
        cheapest = trial

    # The DP table stores operators, so hand back the plan's root.
    return cheapest.root
コード例 #7
0
ファイル: Optimizer.py プロジェクト: Brinews/jacky
	def get_best_join(self, candidates, required_joins):
		"""Pick the cheapest join over all candidate pairs and return its root.

		`candidates` yields (left, right) operator pairs. Every pair is tried
		with each join algorithm listed below; each trial plan is prepared
		against `self.db`, sampled, and ranked by estimated cost. Ties keep
		the plan that was seen first.
		"""
		algorithms = ["nested-loops", "block-nested-loops", "hash"]
		winner = None
		winner_cost = None

		for left_op, right_op in candidates:
			# Fall back to a cartesian product unless a required join passes
			# the check below.
			# NOTE(review): the check uses only the required join's own
			# schemas and expression; `left_op` / `right_op` are not
			# consulted. Confirm this is intended.
			join_expr = 'True'
			for needed in required_joins:
				referenced = ExpressionInfo(needed.joinExpr).getAttributes()
				if set(needed.rhsSchema.fields).intersection(referenced) and set(needed.lhsSchema.fields).intersection(referenced):
					join_expr = needed.joinExpr
					break

			# TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods.
			for algorithm in algorithms:
				operator = Join(
					lhsPlan = left_op,
					rhsPlan = right_op,
					method = algorithm,
					expr = join_expr
				)
				attempt = Plan(root = operator)

				# Sample the prepared plan to obtain an estimated cost.
				attempt.prepare(self.db)
				attempt.sample(5.0)
				attempt_cost = attempt.cost(estimated = True)

				improved = winner_cost is None or attempt_cost < winner_cost
				if improved:
					winner_cost = attempt_cost
					winner = attempt

		# Callers store operators in their table, so return the root operator.
		return winner.root
コード例 #8
0
ファイル: Optimizer.py プロジェクト: asubra12/DatabaseSystems
    def pickJoinOrder(self, plan):
        """Reorder the joins of `plan` using a subset dynamic program.

        `self.optimizerSetup(plan)` supplies the joins, the table IDs, the
        initial table of optimal sub-plans, the per-table fields and the
        operator under which the reordered join tree is attached.

        For every subset of table IDs (by increasing size) each member is
        tried as the right input against the cached best plan for the rest.
        When `self.checkJoins` reports the pair as joinable, hash,
        nested-loops and block-nested-loops variants are sampled and the
        cheapest estimated cost wins. Results are recorded through
        `self.addPlanCost` and in the sub-plan table.

        Returns `plan` itself: unchanged when it has no joins, otherwise
        re-prepared after the new join tree has been attached.
        """
        (joins, tableIDs, optimalSubPlans, fields,
         nubPlan) = self.optimizerSetup(plan)

        # Nothing to reorder without joins.
        if len(joins) == 0:
            return plan

        for numTables in range(2, len(tableIDs) + 1):
            print('NumTables: ', numTables)

            # Each combination is one subset of tables of the current size.
            for joinOrdering in itertools.combinations(tableIDs, numTables):
                bestCost = 1e99
                bestPlan = None

                for rhsID in joinOrdering:
                    # rhsID becomes the right input; the rest form the left.
                    lhsIDs = list(joinOrdering)
                    lhsIDs.remove(rhsID)
                    lhsKey = frozenset(lhsIDs)
                    rhsKey = frozenset([rhsID])

                    cachedLHS = optimalSubPlans.get(lhsKey)
                    cachedRHS = optimalSubPlans[rhsKey]
                    if cachedLHS is None or cachedRHS is None:
                        continue

                    # All attributes available to a join over this subset.
                    allAttributes = [
                        attribute
                        for lhsID in lhsIDs
                        for attribute in fields[frozenset([lhsID])]
                    ]
                    allAttributes.extend(fields[rhsKey])

                    contains, planDict = self.checkJoins(
                        joins, allAttributes, cachedLHS, cachedRHS)
                    if not contains:
                        continue

                    for joinMethod in ("hash", "nested-loops",
                                       "block-nested-loops"):
                        if joinMethod == "hash":
                            lhsHashFn = planDict['lhsHashFn']
                            rhsHashFn = planDict['rhsHashFn']
                            lhsKeySchema = planDict['lhsKeySchema']
                            rhsKeySchema = planDict['rhsKeySchema']
                            joinOp = Join(method=joinMethod,
                                          lhsPlan=cachedLHS,
                                          lhsHashFn=lhsHashFn,
                                          lhsKeySchema=lhsKeySchema,
                                          rhsPlan=cachedRHS,
                                          rhsHashFn=rhsHashFn,
                                          rhsKeySchema=rhsKeySchema)
                        else:
                            joinExpr = planDict['joinExpr']
                            joinOp = Join(lhsPlan=cachedLHS,
                                          rhsPlan=cachedRHS,
                                          method=joinMethod,
                                          expr=joinExpr)

                        # Sample the trial plan and read its estimated cost.
                        tryPlan = Plan(root=joinOp)
                        self.checkPlan = tryPlan
                        tryPlan.prepare(self.db)
                        tryPlan.sample(1.0)
                        cost = tryPlan.cost(estimated=True)

                        if cost < bestCost:
                            bestCost, bestPlan = cost, tryPlan

                # Record the outcome for this subset, even when none was found.
                newKey = frozenset(joinOrdering)
                self.addPlanCost(newKey, bestCost, bestPlan)
                optimalSubPlans[newKey] = (None if bestPlan is None
                                           else bestPlan.root)

        # Attach the best plan over all tables below the saved operator.
        nubPlan.subPlan = self.statsCache[frozenset(tableIDs)][1].root

        plan.prepare(self.db)

        return plan
コード例 #9
0
ファイル: Optimizer.py プロジェクト: asubra12/DatabaseSystems
    def pushdownOperators(self, plan):
        """Return a new Plan rooted at `self.singlePushDown(plan.root)`."""
        return Plan(root=self.singlePushDown(plan.root))
コード例 #10
0
ファイル: Optimizer2.py プロジェクト: SamBrayman/DB
 def pushdownOperators(self, plan):
     """Return a new Plan rooted at `self.pushdownOperator(plan.root)`."""
     pushedRoot = self.pushdownOperator(plan.root)
     return Plan(root=pushedRoot)
コード例 #11
0
ファイル: GreedyOptimizer.py プロジェクト: yliu120/dbsystem
    def joinsOptimizer(self, operator, aPaths):
        """Greedily combine the access paths in `aPaths` into one join tree.

        `planList` and `costList` are kept in lockstep: entry i of `costList`
        is a `(pages, totalCost)` pair for entry i of `planList`. Each round
        enumerates every pair of current plans, costs the pair as
        `3 * (pages_j + pages_k) + total_j + total_k`, and then walks the
        pairs in ascending cost until one is accepted by
        `self.joinable(operator, pair)`. That pair is replaced in `planList`
        by a hash join of the two plans.

        Side effects: increments `self.pcntr` once per pair enumerated and
        prints the counter before returning.

        Returns `planList[0]` (raises IndexError when `aPaths` is empty).
        """
        defaultScaleFactor = 10
        defaultPartiNumber = 5

        n = len(aPaths)
        planList = []
        costList = []
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            # The value returned by sample() is used as a row count here.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganizing joins,
            # so we simply set each access path's total cost to 0.
            planList.append(aPath)
            costList.append((numPages, 0))
        # i = 2...n
        # One round per join to build; the loop variable itself is unused.
        for i in range(1, n):
            # find all possible two way join in current planList
            # put the potential joins in potentialP
            # put the potential joins cost in potentialC
            m = len(planList)
            potentialP = []
            potentialC = []
            for j in range(0, m - 1):
                for k in range(j + 1, m):
                    self.pcntr += 1
                    potentialP.append((planList[j], planList[k]))
                    potentialC.append(3 * (costList[j][0] + costList[k][0]) +
                                      costList[j][1] + costList[k][1])
            # find the cheapest joinable join (total cost)
            # build the join, remove the used two base plan and add the new join to planList
            # modify the costList as well
            while (potentialC):
                # Take the cheapest remaining pair and drop it from both lists.
                currC = min(potentialC)
                currP = potentialP[potentialC.index(currC)]
                potentialC.remove(currC)
                potentialP.remove(currP)
                # NOTE(review): joinable() is evaluated twice for an accepted
                # pair (once as the test, once for its result).
                if (self.joinable(operator, currP)):
                    (lField, rField) = self.joinable(operator, currP)
                    lhsSchema = currP[0].schema()
                    rhsSchema = currP[1].schema()
                    # Key schemas contain only the join field of each side.
                    lKeySchema = DBSchema(
                        'left',
                        [(f, t)
                         for (f, t) in lhsSchema.schema() if f == lField])
                    rKeySchema = DBSchema(
                        'right',
                        [(f, t)
                         for (f, t) in rhsSchema.schema() if f == rField])
                    lHashFn = 'hash(' + lField + ') % ' + str(
                        defaultPartiNumber)
                    rHashFn = 'hash(' + rField + ') % ' + str(
                        defaultPartiNumber)
                    newJoin = Join(currP[0], currP[1], method='hash', \
                                   lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                   rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                    newJoin.prepare(self.db)
                    totalCost = currC
                    cards = Plan(root=newJoin).sample(defaultScaleFactor)
                    pageSize, _, _ = self.db.storage.relationStats(
                        newJoin.relationId())
                    pages = cards / (pageSize / newJoin.schema().size)

                    # id2 is looked up AFTER the first pop, so it is an index
                    # into the already-shortened list; costList is popped in
                    # the same order to stay aligned with planList.
                    id1 = planList.index(currP[0])
                    _ = planList.pop(id1)
                    id2 = planList.index(currP[1])
                    _ = planList.pop(id2)
                    planList.append(newJoin)
                    _ = costList.pop(id1)
                    _ = costList.pop(id2)
                    costList.append((pages, totalCost))
                    # Only one join is built per round.
                    break
        print("GreedyOptimizer plan considered: ", self.pcntr)
        return planList[0]
コード例 #12
0
    def pickJoinOrder(self, plan):
        """Pick a join order for `plan` with a pass-per-size dynamic program.

        Pass 1 builds one plan per relation: a TableScan, wrapped in a Select
        when `selectTablesDict` has expressions for that relation alone.
        Each later pass considers every combination of that many relations
        and, for each split returned by `self.getCombos`, samples a
        block-nested-loops and a nested-loops join of the two cached
        sub-plans (with a Select on top when the combined select expression
        is not "True"). The cheapest plan by estimated cost is cached under
        the sorted tuple of relations.

        The original plan's root GroupBy, if any, is re-applied on top, and a
        Project is added when the resulting schema differs from the original
        output schema.

        Side effects: resets and updates `self.reportPlanCount`; calls
        `self.clearSampleFiles()` after every split evaluated.

        Returns the new Plan.
        """
        relations = plan.relations()
        fieldDict = self.obtainFieldDict(plan)

        (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)
        # makes dicts that maps a list of relations to exprs involving that list
        # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C
        # and on top of it the select exprs that involve 2 tables A,C or B,C

        isGroupBy = plan.root.operatorType() == "GroupBy"
        outputSchema = plan.schema()
        optDict = {}  # sorted tuple of relations -> best Plan for them
        self.reportPlanCount = 0

        # Pass 1: access path for every single relation.
        for r in relations:
            table = TableScan(r, self.db.relationSchema(r))
            if (r, ) in selectTablesDict:
                selectString = self.combineSelects(selectTablesDict[(r, )])
                optDict[(r, )] = Plan(root=Select(table, selectString))
            else:
                optDict[(r, )] = Plan(root=table)
            self.reportPlanCount += 1

        # Passes 2..n: best join for every combination of npass relations.
        for npass in range(2, len(relations) + 1):
            for c in itertools.combinations(relations, npass):
                fullList = sorted(c)
                bestJoin = None

                for subcombo in self.getCombos(fullList):
                    complement = self.getComplement(fullList, subcombo)

                    leftOps = optDict[tuple(complement)].root
                    rightOps = optDict[tuple(subcombo)].root

                    selectExpr = self.createExpression(
                        complement, subcombo, selectTablesDict)
                    joinExpr = self.createExpression(
                        complement, subcombo, joinTablesDict)

                    # Build and sample one plan per join method, in the same
                    # order as before: block-nested-loops, then nested-loops.
                    sampled = []
                    for method in ("block-nested-loops", "nested-loops"):
                        joinOp = Join(leftOps,
                                      rightOps,
                                      expr=joinExpr,
                                      method=method)
                        # A trivially-true selection needs no Select wrapper.
                        if selectExpr == "True":
                            candidate = Plan(root=joinOp)
                        else:
                            candidate = Plan(root=Select(joinOp, selectExpr))
                        candidate.prepare(self.db)
                        candidate.sample(100)
                        sampled.append(candidate)
                    joinBnlj, joinNlj = sampled

                    # Nested-loops wins ties between the two methods.
                    if joinBnlj.cost(True) < joinNlj.cost(True):
                        cheaper = joinBnlj
                    else:
                        cheaper = joinNlj

                    if bestJoin is None or cheaper.cost(True) < bestJoin.cost(
                            True):
                        bestJoin = cheaper

                    self.reportPlanCount += 2
                    self.clearSampleFiles()

                optDict[tuple(fullList)] = bestJoin

        # after System R algorithm
        newPlan = optDict[tuple(sorted(relations))]

        # Re-apply the original aggregation on top of the reordered joins.
        if isGroupBy:
            newGroupBy = GroupBy(newPlan.root,
                                 groupSchema=plan.root.groupSchema,
                                 aggSchema=plan.root.aggSchema,
                                 groupExpr=plan.root.groupExpr,
                                 aggExprs=plan.root.aggExprs,
                                 groupHashFn=plan.root.groupHashFn)
            newGroupBy.prepare(self.db)
            newPlan = Plan(root=newGroupBy)

        # Restore the original output schema if reordering changed it.
        if set(outputSchema.schema()) != set(newPlan.schema().schema()):
            projectDict = {f: (f, t) for f, t in outputSchema.schema()}

            project = Project(newPlan.root, projectDict)
            project.prepare(self.db)
            newPlan = Plan(root=project)

        return newPlan
コード例 #13
0
ファイル: Optimizer.py プロジェクト: yliu120/dbsystem
    def joinsOptimizer(self, operator, aPaths):
        """Build hash-join plans over `aPaths` level by level and cache costs.

        `prev` holds the plans of the previous level mapped to
        `(pages, totalCost)`; level 1 is the access paths themselves. For
        each plan in `prev`, every access path not already used by it (per
        `self.allAccessPaths`) is tried as the right input, for every
        `(lField, rField)` pair from `self.decodeJoinExprs(operator)` whose
        fields are split across the two schemas. Each join built is prepared,
        sampled, and stored via `self.addPlanCost` and in `curr`.

        Returns the first element of `self.getPlanCost(operator)`.
        NOTE(review): this presumably relies on the plan cache resolving
        `operator` to the best reordered plan -- confirm against
        addPlanCost/getPlanCost.
        """
        defaultScaleFactor = 50
        defaultPartiNumber = 5
        # build join constraint list;
        joinExprs = self.decodeJoinExprs(operator)
        # build a local plan-cost dict:
        prev = dict()
        curr = dict()
        n = len(aPaths)
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            # The value returned by sample() is used as a row count here.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganizing joins,
            # so we simply set each access path's total cost to 0.
            self.addPlanCost(aPath, (numPages, 0))
            prev[aPath] = (numPages, 0)
        # i = 2...n
        # One level per additional relation; the loop variable is unused.
        for i in range(1, n):
            # build current list with prev.
            # For 2-way joins, we don't need to care left deep plan
            for p in prev.keys():
                # Only extend p with access paths it does not already contain.
                accP = self.allAccessPaths(p)
                remL = [item for item in aPaths if item not in accP]
                for base in remL:
                    lhsSchema = p.schema()
                    rhsSchema = base.schema()
                    newJoin = None
                    (sCostL, tCostL) = prev[p]
                    (rPlan, costR) = self.getPlanCost(base)
                    # Here we are using System-R 's heuristic to eliminate permutations as
                    # much as possible.
                    # Reference: Selinger, 1979, http://www.cs.berkeley.edu/~brewer/cs262/3-selinger79.pdf
                    for (lField, rField) in joinExprs:
                        if lField in lhsSchema.fields and rField in rhsSchema.fields:
                            # Build Join
                            # We only select hashjoin for building join plans
                            # This is because the nested-loop-join contains a bug
                            lKeySchema = DBSchema('left', [
                                (f, t)
                                for (f, t) in lhsSchema.schema() if f == lField
                            ])
                            rKeySchema = DBSchema('right', [
                                (f, t)
                                for (f, t) in rhsSchema.schema() if f == rField
                            ])
                            lHashFn = 'hash(' + lField + ') % ' + str(
                                defaultPartiNumber)
                            rHashFn = 'hash(' + rField + ') % ' + str(
                                defaultPartiNumber)
                            newJoin = Join(p, rPlan, method='hash', \
                                           lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                           rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                        elif lField in rhsSchema.fields and rField in lhsSchema.fields:
                            # Build Join
                            # We only select hashjoin for building join plans
                            # This is because the nested-loop-join contains a bug
                            # The fields are on the opposite sides here: the
                            # 'left' key schema is taken from rhsSchema and
                            # the 'right' one from lhsSchema, and they are
                            # passed to Join swapped to match p / rPlan.
                            lKeySchema = DBSchema('left', [
                                (f, t)
                                for (f, t) in rhsSchema.schema() if f == lField
                            ])
                            rKeySchema = DBSchema('right', [
                                (f, t)
                                for (f, t) in lhsSchema.schema() if f == rField
                            ])
                            lHashFn = 'hash(' + rField + ') % ' + str(
                                defaultPartiNumber)
                            rHashFn = 'hash(' + lField + ') % ' + str(
                                defaultPartiNumber)
                            newJoin = Join(p, rPlan, method='hash', \
                                           lhsHashFn=lHashFn, lhsKeySchema=rKeySchema, \
                                           rhsHashFn=rHashFn, rhsKeySchema=lKeySchema)
                        else:
                            # This field pair does not connect p and base.
                            continue

                        # Reached only after one of the two branches above
                        # assigned newJoin, so this test always passes here.
                        if newJoin is not None:
                            # Let's push newJoin onto the cache and curr list
                            # cost: 3(M+N) + M's totalcost
                            # then we renew newJoin's stepcost
                            newJoin.prepare(self.db)
                            stepCost = 3 * (sCostL + costR[0])
                            totalCost = stepCost + tCostL
                            cards = Plan(
                                root=newJoin).sample(defaultScaleFactor)
                            pageSize, _, _ = self.db.storage.relationStats(
                                newJoin.relationId())
                            pages = cards / (pageSize / newJoin.schema().size)
                            self.addPlanCost(newJoin, (pages, totalCost))
                            curr[newJoin] = (pages, totalCost)

            # The joins built in this level become the inputs of the next.
            prev = curr
            curr = dict()

        del prev, curr

        return self.getPlanCost(operator)[0]
コード例 #14
0
    def pickJoinOrder(self, plan):
        """Greedily pick a join order for `plan` using a shrinking worklist.

        The worklist starts with one prepared plan per relation: a TableScan,
        wrapped in a Select when `selectTablesDict` has expressions for that
        relation alone. While more than one plan remains, every pair of
        plans is tried in both input orders with block-nested-loops and
        nested-loops joins (plus a Select on top when the combined select
        expression is not "True"); each variant is prepared and sampled, and
        the cheapest by estimated cost replaces its two source plans.

        The original plan's root GroupBy, if any, is re-applied on top, and a
        Project is added when the resulting schema differs from the original
        output schema.

        Side effects: resets and updates `self.reportPlanCount`; calls
        `self.clearSampleFiles()` after every pair evaluated.

        Returns the new Plan.
        """
        relations = plan.relations()
        fieldDict = self.obtainFieldDict(plan)
        (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)
        # makes dicts that maps a list of relations to exprs involving that list
        # then in system R we will build opt(A,B) Join C using join exprs involving A,C and B,C
        # and on top of it the select exprs that involve 2 tables A,C or B,C

        isGroupBy = plan.root.operatorType() == "GroupBy"
        outputSchema = plan.schema()
        self.reportPlanCount = 0

        # One prepared access plan per relation.
        worklist = []
        for r in relations:
            table = TableScan(r, self.db.relationSchema(r))
            table.prepare(self.db)
            if (r, ) in selectTablesDict:
                selectString = self.combineSelects(selectTablesDict[(r, )])
                select = Select(table, selectString)
                select.prepare(self.db)
                worklist.append(Plan(root=select))
            else:
                worklist.append(Plan(root=table))

        while len(worklist) > 1:
            bestJoin = None
            sourcePair = None

            for pair in itertools.combinations(worklist, 2):
                op1 = pair[0].root
                op2 = pair[1].root

                selectExpr = self.createExpression(pair[0].relations(),
                                                   pair[1].relations(),
                                                   selectTablesDict)
                joinExpr = self.createExpression(pair[0].relations(),
                                                 pair[1].relations(),
                                                 joinTablesDict)

                # Four variants, in the original evaluation order:
                # BNLJ(op1, op2), BNLJ(op2, op1), NLJ(op1, op2), NLJ(op2, op1).
                joinList = []
                for method in ("block-nested-loops", "nested-loops"):
                    for (lhs, rhs) in ((op1, op2), (op2, op1)):
                        joinOp = Join(lhs, rhs, expr=joinExpr, method=method)
                        # A trivially-true selection needs no Select wrapper.
                        if selectExpr == "True":
                            joinList.append(joinOp)
                        else:
                            joinList.append(Select(joinOp, selectExpr))

                for j in joinList:
                    joinplan = Plan(root=j)
                    joinplan.prepare(self.db)
                    joinplan.sample(100)

                    if bestJoin is None or joinplan.cost(True) < bestJoin.cost(
                            True):
                        bestJoin = joinplan
                        sourcePair = pair

                self.reportPlanCount += 4
                self.clearSampleFiles()

            # Replace the two joined plans with their join.
            worklist.remove(sourcePair[0])
            worklist.remove(sourcePair[1])
            worklist.append(bestJoin)

        # after the greedy join selection
        newPlan = worklist[0]

        # Re-apply the original aggregation on top of the reordered joins.
        if isGroupBy:
            newGroupBy = GroupBy(newPlan.root,
                                 groupSchema=plan.root.groupSchema,
                                 aggSchema=plan.root.aggSchema,
                                 groupExpr=plan.root.groupExpr,
                                 aggExprs=plan.root.aggExprs,
                                 groupHashFn=plan.root.groupHashFn)
            newGroupBy.prepare(self.db)
            newPlan = Plan(root=newGroupBy)

        # Restore the original output schema if reordering changed it.
        if set(outputSchema.schema()) != set(newPlan.schema().schema()):
            projectDict = {f: (f, t) for f, t in outputSchema.schema()}

            project = Project(newPlan.root, projectDict)
            project.prepare(self.db)
            newPlan = Plan(root=project)

        return newPlan
コード例 #15
0
  def selectPushDown(self, plan):
    """
    Push selection predicates down the operator tree rooted at plan.root.

    The tree is walked depth-first. The predicate of every Select met on the
    way is split into its CNF conjuncts, which are carried towards the leaves
    and handed to self.placeSelect() at the lowest operator they can sit on:

      * Project - conjuncts pass through unchanged.
      * Join    - a conjunct whose attributes all belong to one input is sent
                  to that input; the remaining ones are placed at the join.
      * GroupBy - all pending conjuncts are placed at the group-by.
      * any other operator with inputs (handled as a union) - conjuncts over
                  the union schema are sent to both inputs; the remaining
                  ones are placed at the union.
      * leaf    - all pending conjuncts are placed at the leaf.

    Every non-Select operator is recorded in a list of (operator, parent)
    pairs, where the parent is the operator returned by placeSelect() when
    that result is truthy and the original parent otherwise.

    Returns a Plan rooted at the operator of the first recorded pair.
    """
    root = plan.root
    selectResult = []

    # Work items are (current op, parent op, pending conjuncts). Pushing and
    # popping on the left end makes the deque behave as a stack.
    pending = deque([(root, None, None)])

    while pending:
      (curr, parent, accuSelect) = pending.popleft()
      children = curr.inputs()

      # Leaf (e.g. a table scan): place everything that is still pending.
      if not children:
        newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)
        selectResult.append((curr, newSelect if newSelect else parent))
        continue

      if isinstance(curr, Select):
        # Collect the conjuncts of this select; the Select itself is not
        # recorded, its predicate travels down instead.
        if not accuSelect:
          accuSelect = []
        accuSelect.extend(ExpressionInfo(curr.selectExpr).decomposeCNF())
        pending.appendleft((children[0], curr, accuSelect))

      elif isinstance(curr, Project):
        # Projects are not pushed down here; pending conjuncts pass through.
        selectResult.append((curr, parent))
        pending.appendleft((children[0], curr, accuSelect))

      elif isinstance(curr, Join):
        leftFields = set(curr.lhsSchema.fields)
        rightFields = set(curr.rhsSchema.fields)
        leftSelect = []
        rightSelect = []
        put = []

        for conjunct in accuSelect or []:
          attrs = set(ExpressionInfo(conjunct).getAttributes())
          pushed = False
          if attrs.issubset(leftFields):
            leftSelect.append(conjunct)
            pushed = True
          if attrs.issubset(rightFields):
            rightSelect.append(conjunct)
            pushed = True
          if not pushed:
            # Needs attributes from both sides: must stay at the join.
            put.append(conjunct)

        newSelect = None
        if put:
          newSelect = self.placeSelect(put, curr, parent, selectResult)
        selectResult.append((curr, newSelect if newSelect else parent))

        pending.appendleft((curr.lhsPlan, curr, leftSelect))
        pending.appendleft((curr.rhsPlan, curr, rightSelect))

      elif isinstance(curr, GroupBy):
        # Nothing is carried below a group-by.
        newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)
        selectResult.append((curr, newSelect if newSelect else parent))
        pending.appendleft((children[0], curr, None))

      else:
        # Union: handled like a join, except that a pushable conjunct goes
        # to both inputs.
        unionFields = set(curr.unionSchema.fields)
        leftSelect = []
        rightSelect = []
        put = []

        for conjunct in accuSelect or []:
          attrs = set(ExpressionInfo(conjunct).getAttributes())
          if attrs.issubset(unionFields):
            leftSelect.append(conjunct)
            rightSelect.append(conjunct)
          else:
            put.append(conjunct)

        # Only the conjuncts that could not be pushed stay at the union;
        # the pushed ones are already evaluated on both inputs.
        newSelect = None
        if put:
          newSelect = self.placeSelect(put, curr, parent, selectResult)
        selectResult.append((curr, newSelect if newSelect else parent))

        pending.appendleft((curr.lhsPlan, curr, leftSelect))
        pending.appendleft((curr.rhsPlan, curr, rightSelect))

    newRoot = selectResult[0][0]
    return Plan(root=newRoot)
コード例 #16
0
ファイル: Optimizer.py プロジェクト: danielc518/dbsys
  def pickJoinOrder(self, plan):
    """
    Choose a left-deep join order by dynamic programming over table subsets.

    self.extractJoinInfo() fills tableIds, joinOps, optPlans and fields for
    the plan. For every subset of tables, from size 2 up to all tables, each
    member is tried as the right-hand base relation joined to the best known
    plan for the remaining members, using nested-loops and block-nested-loops
    joins. The cheapest candidate's root is stored in optPlans under
    self.getJoinKey(subset), or None when no candidate exists.

    Restrictions:
      1. Cannot involve hash-joins or index-joins; a join operator without a
         joinExpr makes the method return the plan as given.
      2. Only join operations beyond a certain point are handled (i.e. no
         join -> aggregation -> join, etc.).

    Returns the given plan, with the best join tree installed as the subPlan
    of the operator returned by extractJoinInfo() and the plan re-prepared.
    The plan is returned without that change when it has no joins or when no
    join order could be built.
    """
    tableIds = list()
    joinOps  = list()
    optPlans = dict()
    fields   = dict()

    firstOpWithJoins = self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields)

    if len(joinOps) == 0:
      return plan

    for numTables in range(2, len(tableIds) + 1):
      for possibleJoinOrder in itertools.combinations(tableIds, numTables):
        minCost = None
        optPlan = None
        for tableId in possibleJoinOrder:
          # Left-deep-only optimizer (i.e. rhs operand is a base relation)
          lhsIds = list(possibleJoinOrder)
          lhsIds.remove(tableId)

          lhsOpt = optPlans.get(self.getJoinKey(lhsIds))
          rhsOpt = optPlans.get(str(tableId))

          if lhsOpt is None or rhsOpt is None:
            continue # Skip irrelevant joins

          # Form a list of available attributes of this join
          allAttrs = list()
          for lhsId in lhsIds:
            allAttrs.extend(fields[lhsId])

          allAttrs.extend(fields[tableId])

          currJoinExpr = None

          # Check whether any join expression can be satisfied with this join
          for join in joinOps:
            if join.joinExpr:
              joinAttrs = ExpressionInfo(join.joinExpr).getAttributes()
              if self.contains(allAttrs, joinAttrs):
                currJoinExpr = join.joinExpr
                break
            else:
              # Should not involve hash-joins or index-joins (limitation)
              return plan

          if currJoinExpr is None:
            continue # Irrelevant join

          for joinMethod in ["nested-loops", "block-nested-loops"]:
            possiblePlan = Plan(root=Join(lhsPlan=lhsOpt, rhsPlan=rhsOpt, method=joinMethod, expr=currJoinExpr))

            possiblePlan.prepare(self.db)
            possiblePlan.sample(1.0) # Sampling causes too much overhead!

            # The cost cache must be addressed with the candidate itself.
            # Using the enclosing plan would hand every candidate after the
            # first the same cached cost, so the minimum would never change.
            cost = self.getPlanCost(possiblePlan)
            if cost is None:
              cost = possiblePlan.cost(estimated=True)
            self.addPlanCost(possiblePlan, cost)

            if minCost is None or cost < minCost:
              minCost = cost
              optPlan = possiblePlan

        optPlans[self.getJoinKey(possibleJoinOrder)] = None if optPlan is None else optPlan.root

    bestRoot = optPlans[self.getJoinKey(tableIds)]
    if bestRoot is None:
      # No join order covers all tables; keep the plan as it was rather than
      # installing an empty sub-plan.
      return plan

    firstOpWithJoins.subPlan = bestRoot

    plan.prepare(self.db)

    return plan
コード例 #17
0
ファイル: GreedyOptimizer.py プロジェクト: danielc518/dbsys
  def pickJoinOrder(self, plan):
    """
    Choose a join order greedily.

    self.extractJoinInfo() fills tableIds, joinOps, optPlans and fields for
    the plan. The worklist starts with one entry per table id (as a string).
    In each round every pair of worklist entries that some join expression
    can connect is evaluated in both operand orders with nested-loops and
    block-nested-loops joins; the cheapest candidate replaces its two inputs
    on the worklist under the key "<lhs key>,<rhs key>".

    self.numPlansConsidered is reset to 0 and advanced by 2 for every
    joinable pair examined.

    Returns optPlans[worklist[0]], i.e. the entry stored for the first key
    left on the worklist. Entries created here hold the root operator of the
    chosen candidate (not a Plan).
    """
    self.numPlansConsidered = 0

    tableIds = list()
    joinOps  = list()
    optPlans = dict()
    fields   = dict()

    self.extractJoinInfo(plan, tableIds, joinOps, optPlans, fields)

    # Create worklist consisting of table IDs
    worklist = [str(tableId) for tableId in tableIds]

    # Now work our way down the 'worklist' greedily
    numTables = len(worklist)
    while numTables >= 2:
      # Choose the cheapest join that can be made over the remaining sub-plans
      minCost  = None
      optPlan  = None
      optLhsId = None
      optRhsId = None

      for (lhsId, rhsId) in itertools.combinations(worklist, 2):
        lhsOpt = optPlans.get(lhsId)
        rhsOpt = optPlans.get(rhsId)

        if lhsOpt is None or rhsOpt is None:
          continue # Skip irrelevant joins

        # Worklist keys of multi-way joins are comma-separated table ids.
        # Form a list of available attributes of this join.
        allAttrs = list()
        for tableId in lhsId.split(",") + rhsId.split(","):
          allAttrs.extend(fields[int(tableId)])

        currJoinExpr = None

        # Check whether any join expression can be satisfied with this join
        for join in joinOps:
          if join.joinExpr:
            joinAttrs = ExpressionInfo(join.joinExpr).getAttributes()
            if self.contains(allAttrs, joinAttrs):
              currJoinExpr = join.joinExpr
              break

        if currJoinExpr is None:
          continue # Skip irrelevant joins

        self.numPlansConsidered += 2

        # Compare costs of the join methods, first as (lhs, rhs) and then
        # with the operands switched.
        operandOrders = (
          (lhsId, lhsOpt, rhsId, rhsOpt),
          (rhsId, rhsOpt, lhsId, lhsOpt),
        )
        for (outerId, outerOpt, innerId, innerOpt) in operandOrders:
          for joinMethod in ["nested-loops", "block-nested-loops"]:
            possiblePlan = Plan(root=Join(lhsPlan=outerOpt, rhsPlan=innerOpt, method=joinMethod, expr=currJoinExpr))

            possiblePlan.prepare(self.db)
            possiblePlan.sample(1.0) # Sampling causes too much overhead!

            # The cost cache must be addressed with the candidate itself.
            # Using the enclosing plan would hand every candidate after the
            # first the same cached cost, so the minimum would never change.
            cost = self.getPlanCost(possiblePlan)
            if cost is None:
              cost = possiblePlan.cost(estimated=True)
            self.addPlanCost(possiblePlan, cost)

            if minCost is None or cost < minCost:
              minCost  = cost
              optPlan  = possiblePlan
              optLhsId = outerId
              optRhsId = innerId

      if optPlan is not None:
        # Update optimal plan
        joinKey = optLhsId + "," + optRhsId
        optPlans[joinKey] = optPlan.root

        # Update worklist
        worklist.remove(optLhsId)
        worklist.remove(optRhsId)
        worklist.append(joinKey)

      numTables = numTables - 1

    # Return single plan left in worklist
    return optPlans[worklist[0]]
コード例 #18
0
    def joinsOptimizer(self, operator, aPaths):
        """
        Build hash-join plans over the access paths and cost them bottom-up.

        Each access path is first registered through self.addPlanCost() with
        the cost pair (estimated pages, 0). Then, for every combination S of
        2..len(aPaths) access paths and every subset O produced by
        self.powerSet(S), the cached plans for O and for S minus O are looked
        up in self.statsCache (keyed by the sorted tuple of operator ids).
        When self.joinable() reports a pair of join fields for them, a hash
        join is created and registered with the pair
        (estimated output pages, total cost), where
        total cost = costL[1] + costR[1] + 3 * (costL[0] + costR[0]).

        Joins over more than two access paths are only kept when
        self.isRightDeep() is false for them.

        Returns the first element of self.getPlanCost(operator).
        """
        defaultScaleFactor = 10
        defaultPartiNumber = 5
        # build join constraint list;
        joinExprs = self.decodeJoinExprs(operator)
        n = len(aPaths)

        for aPath in aPaths:
            # Here we define cost by number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganize joins
            # so that we simple put accessPaths' totalcost as 0.
            self.addPlanCost(aPath, (numPages, 0))

        for i in range(1, n):
            for S in comb(aPaths, i + 1):
                for O in self.powerSet(S):
                    keyL = tuple(sorted(x.id() for x in O))
                    keyR = tuple(
                        sorted(ele.id() for ele in S if ele not in O))

                    # Some sub-plans may be absent from self.statsCache
                    # because
                    # 1) they were filtered out as right-deep, or
                    # 2) they have no constraint associated.
                    if keyL not in self.statsCache or keyR not in self.statsCache:
                        continue

                    (planForO, costL) = self.statsCache[keyL]
                    (remindPl, costR) = self.statsCache[keyR]

                    fields = self.joinable(joinExprs, [planForO, remindPl])
                    # Only sides connected by a constraint get a new join.
                    if fields is None:
                        continue

                    lKeySchema = DBSchema(
                        'left', [(f, t)
                                 for (f, t) in planForO.schema().schema()
                                 if f == fields[0]])
                    rKeySchema = DBSchema(
                        'right', [(f, t)
                                  for (f, t) in remindPl.schema().schema()
                                  if f == fields[1]])
                    lHashFn = 'hash(' + fields[0] + ') % ' + str(
                        defaultPartiNumber)
                    rHashFn = 'hash(' + fields[1] + ') % ' + str(
                        defaultPartiNumber)
                    newJoin = Join(planForO, remindPl, method='hash',
                                   lhsHashFn=lHashFn, lhsKeySchema=lKeySchema,
                                   rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                    if i != 1 and self.isRightDeep(newJoin, aPaths):
                        continue

                    newJoin.prepare(self.db)
                    # Calculate output pages;
                    cards = Plan(root=newJoin).sample(defaultScaleFactor)
                    pageSize, _, _ = self.db.storage.relationStats(
                        newJoin.relationId())
                    pages = cards / (pageSize / newJoin.schema().size)
                    # Calculate output costs:
                    totalCost = costL[1] + costR[1] + 3 * (costL[0] +
                                                           costR[0])
                    # Add new Join to self.statsCache
                    self.addPlanCost(newJoin, (pages, totalCost))

        return self.getPlanCost(operator)[0]
コード例 #19
0
ファイル: Optimizer.py プロジェクト: elanas/DB_HW3
  def pickJoinOrder(self, plan):
    """
    Choose a join order by dynamic programming over sets of relations.

    self.getExprDicts() yields two dicts that map tuples of relations to the
    join expressions / select expressions involving them. Pass 1 builds one
    plan per relation (a TableScan, wrapped in a Select when the relation
    has its own select expressions). Every later pass builds, for each
    combination of that many relations, the cheapest join between the best
    plan of a sub-combination (from self.getCombos()) and the best plan of
    its complement, trying block-nested-loops and nested-loops joins. The
    select expressions spanning both sides are applied on top of the join.

    self.reportPlanCount is reset and counts the plans that were built.

    Returns a new Plan over all relations. A GroupBy copied from the
    original root is put back on top when the original root was a GroupBy,
    and a Project restoring the original output fields is added when the
    schema differs from the original one.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    optDict = {}
    self.reportPlanCount = 0

    # Pass 1: best access plan for every single relation.
    for r in relations:
      table = TableScan(r, self.db.relationSchema(r))
      if (r,) in selectTablesDict:
        selectString = self.combineSelects(selectTablesDict[(r,)])
        optDict[(r,)] = Plan(root=Select(table, selectString))
      else:
        optDict[(r,)] = Plan(root=table)
      self.reportPlanCount += 1

    # Passes 2..n: best plan for every combination of npass relations.
    for npass in range(2, len(relations) + 1):
      for c in itertools.combinations(relations, npass):
        fullList = sorted(c)
        bestJoin = None
        bestCost = None

        for subcombo in self.getCombos(fullList):
          complement = self.getComplement(fullList, subcombo)

          leftOps = optDict[tuple(complement)].root
          rightOps = optDict[tuple(subcombo)].root

          selectExpr = self.createExpression(complement, subcombo, selectTablesDict)
          joinExpr = self.createExpression(complement, subcombo, joinTablesDict)

          # Build, prepare and sample both join variants before costing
          # either of them.
          candidates = []
          for method in ("block-nested-loops", "nested-loops"):
            candidateOp = Join(leftOps, rightOps, expr=joinExpr, method=method)
            # "True" means there is nothing to filter on top of the join.
            if selectExpr != "True":
              candidateOp = Select(candidateOp, selectExpr)
            candidate = Plan(root=candidateOp)
            candidate.prepare(self.db)
            candidate.sample(100)
            candidates.append(candidate)

          (joinBnlj, joinNlj) = candidates
          bnljCost = joinBnlj.cost(True)
          nljCost = joinNlj.cost(True)

          # Ties go to the nested-loops variant.
          if bnljCost < nljCost:
            (contender, contenderCost) = (joinBnlj, bnljCost)
          else:
            (contender, contenderCost) = (joinNlj, nljCost)

          # The winner's cost is remembered so that it is not evaluated
          # again after the sample files have been removed.
          if bestJoin is None or contenderCost < bestCost:
            bestJoin = contender
            bestCost = contenderCost

          self.reportPlanCount += 2
          self.clearSampleFiles()

        optDict[tuple(fullList)] = bestJoin

    # after System R algorithm
    newPlan = optDict[tuple(sorted(relations))]

    if isGroupBy:
      newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema,
                           aggSchema=plan.root.aggSchema,
                           groupExpr=plan.root.groupExpr,
                           aggExprs=plan.root.aggExprs,
                           groupHashFn=plan.root.groupHashFn)
      newGroupBy.prepare(self.db)
      newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
      # Restore the original output fields.
      projectDict = {f: (f, t) for (f, t) in outputSchema.schema()}

      project = Project(newPlan.root, projectDict)
      project.prepare(self.db)
      newPlan = Plan(root=project)

    return newPlan
コード例 #20
0
ファイル: Optimizer.py プロジェクト: elanas/DB_HW3
  def pickJoinOrder(self, plan):
    """
    Choose a join order greedily.

    self.getExprDicts() yields two dicts that map tuples of relations to the
    join expressions / select expressions involving them. The worklist
    starts with one prepared plan per relation (a TableScan, wrapped in a
    Select when the relation has its own select expressions). While more
    than one plan is left, every pair of worklist plans is joined in both
    operand orders with block-nested-loops and nested-loops joins; the
    cheapest of all candidates replaces its two inputs on the worklist. The
    select expressions spanning both sides are applied on top of the join.

    self.reportPlanCount is reset and advanced by 4 for every pair examined.

    Returns a new Plan over all relations. A GroupBy copied from the
    original root is put back on top when the original root was a GroupBy,
    and a Project restoring the original output fields is added when the
    schema differs from the original one.
    """
    relations = plan.relations()
    fieldDict = self.obtainFieldDict(plan)
    (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

    isGroupBy = plan.root.operatorType() == "GroupBy"
    outputSchema = plan.schema()
    self.reportPlanCount = 0

    worklist = []
    for r in relations:
      table = TableScan(r, self.db.relationSchema(r))
      table.prepare(self.db)
      if (r,) in selectTablesDict:
        selectString = self.combineSelects(selectTablesDict[(r,)])
        select = Select(table, selectString)
        select.prepare(self.db)
        worklist.append(Plan(root=select))
      else:
        worklist.append(Plan(root=table))

    while len(worklist) > 1:
      bestJoin = None
      bestCost = None
      sourcePair = None

      for pair in itertools.combinations(worklist, 2):
        op1 = pair[0].root
        op2 = pair[1].root

        selectExpr = self.createExpression(pair[0].relations(), pair[1].relations(), selectTablesDict)
        joinExpr = self.createExpression(pair[0].relations(), pair[1].relations(), joinTablesDict)

        # Candidate order: BNLJ(op1, op2), BNLJ(op2, op1),
        #                  NLJ(op1, op2),  NLJ(op2, op1).
        candidateOps = [
          Join(lhs, rhs, expr=joinExpr, method=method)
          for method in ("block-nested-loops", "nested-loops")
          for (lhs, rhs) in ((op1, op2), (op2, op1))
        ]

        # "True" means there is nothing to filter on top of the join.
        if selectExpr != "True":
          candidateOps = [Select(joinOp, selectExpr) for joinOp in candidateOps]

        for candidateOp in candidateOps:
          joinplan = Plan(root=candidateOp)
          joinplan.prepare(self.db)
          joinplan.sample(100)

          # The winner's cost is remembered so that it is not evaluated
          # again after the sample files have been removed.
          cost = joinplan.cost(True)
          if bestJoin is None or cost < bestCost:
            bestJoin = joinplan
            bestCost = cost
            sourcePair = pair

        self.reportPlanCount += 4
        self.clearSampleFiles()

      # Replace the two joined inputs by the join itself.
      worklist.remove(sourcePair[0])
      worklist.remove(sourcePair[1])
      worklist.append(bestJoin)

    # after System R algorithm
    newPlan = worklist[0]

    if isGroupBy:
      newGroupBy = GroupBy(newPlan.root, groupSchema=plan.root.groupSchema,
                           aggSchema=plan.root.aggSchema,
                           groupExpr=plan.root.groupExpr,
                           aggExprs=plan.root.aggExprs,
                           groupHashFn=plan.root.groupHashFn)
      newGroupBy.prepare(self.db)
      newPlan = Plan(root=newGroupBy)

    if set(outputSchema.schema()) != set(newPlan.schema().schema()):
      # Restore the original output fields.
      projectDict = {f: (f, t) for (f, t) in outputSchema.schema()}

      project = Project(newPlan.root, projectDict)
      project.prepare(self.db)
      newPlan = Plan(root=project)

    return newPlan
コード例 #21
0
  def projectPushDown(self, plan):
    """
    Push projections down the operator tree rooted at plan.root.

    The tree is walked depth-first. The expressions of every Project met on
    the way are accumulated and carried towards the leaves; the Project
    itself is not recorded. The accumulated projection is re-created as a
    new Project directly above the first operator it cannot pass:

      * Select  - passed only when all attributes of the select expression
                  are among the attributes reported by
                  self.getProjectAttrs() for the accumulated projection.
      * Join, GroupBy and any other operator with inputs - never passed;
                  the projection is not decomposed across their inputs.
      * leaf    - the projection is placed above the leaf.

    Operators are recorded in a list of (operator, parent) pairs; the
    returned Plan is rooted at the operator of the first pair.
    """
    root = plan.root
    result = []

    def place(op, parent, projectExprs):
      # Record op under parent, inserting a new Project in between when a
      # projection is pending.
      if projectExprs:
        newProject = Project(op, projectExprs)
        result.append((newProject, parent))
        result.append((op, newProject))
      else:
        result.append((op, parent))

    # Work items are (current op, parent op, accumulated projection).
    # Pushing and popping on the left end makes the deque behave as a stack.
    queue = deque([(root, None, None)])

    while queue:
      (curr, parent, accuProject) = queue.popleft()
      children = curr.inputs()

      # Leaf: the pending projection ends up directly above it.
      if not children:
        place(curr, parent, accuProject)
        continue

      if isinstance(curr, Project):
        if not accuProject:
          accuProject = curr.projectExprs
        else:
          # Merge into a fresh dict. Passing a set literal that contains
          # the dict to update() raises TypeError (dicts are unhashable),
          # and an in-place update would alter the expressions of the
          # Project met earlier.
          # NOTE(review): entries of the lower Project override equally
          # named entries of the upper one - confirm this is intended.
          merged = dict(accuProject)
          merged.update(curr.projectExprs)
          accuProject = merged

        queue.appendleft((children[0], curr, accuProject))

      elif isinstance(curr, Select):
        passes = True
        if accuProject:
          selectAttrs = ExpressionInfo(curr.selectExpr).getAttributes()
          projectAttrs = self.getProjectAttrs(accuProject)
          passes = set(selectAttrs).issubset(set(projectAttrs))

        if passes:
          result.append((curr, parent))
          queue.appendleft((children[0], curr, accuProject))
        else:
          # The select needs attributes the projection would drop, so the
          # projection has to stop above it.
          place(curr, parent, accuProject)
          queue.appendleft((children[0], curr, None))

      elif isinstance(curr, GroupBy):
        place(curr, parent, accuProject)
        queue.appendleft((children[0], curr, None))

      else:
        # Join, or another binary operator such as a union: the projection
        # is not decomposed, so nothing is carried into the inputs.
        place(curr, parent, accuProject)
        queue.appendleft((curr.lhsPlan, curr, None))
        queue.appendleft((curr.rhsPlan, curr, None))

    newRoot = result[0][0]
    return Plan(root=newRoot)
コード例 #22
0
 def pushdownOperators(self, plan):
   """
   Return a new Plan whose root is the result of
   self.pushdownHelper(plan.root).

   The helper is only invoked when plan is truthy.
   """
   if plan:
     # The rewrite of the operator tree is delegated to pushdownHelper.
     planRoot = self.pushdownHelper(plan.root)
     return Plan(root=planRoot)