コード例 #1
0
  def pickJoinOrder(self, plan):
    """Pick a join order for `plan` with a bottom-up dynamic program.

    The table maps a frozenset of base relations to the best operator found
    for joining exactly those relations. Size-1 entries are the relations
    themselves; every larger subset is solved by trying each member as the
    right-hand input against the best plan for the remaining members and
    letting `get_best_join` choose among those candidates.

    Returns a new, prepared `Plan` rooted at the entry for all relations.
    """
    sources = set(plan.sources)

    # The joins of the original plan are the ones that must be preserved.
    required_joins = set(plan.joins)

    # Seed the table with the single-relation access paths.
    best = {frozenset((rel,)): rel for rel in sources}

    for size in range(2, len(sources) + 1):
      for group in itertools.combinations(sources, size):
        # One candidate per choice of right-hand relation:
        # (best plan for the rest of the group, the relation itself).
        pairs = {
          (best[frozenset(tuple_without(group, rel))], best[frozenset((rel,))])
          for rel in group
        }
        best[frozenset(group)] = self.get_best_join(pairs, required_joins)

    # The entry covering every relation is the root of the final plan.
    result = Plan(root = best[frozenset(sources)])
    result.prepare(self.db)
    return result
コード例 #2
0
  def pickJoinOrder(self, plan):
    """Pick a left-deep join order for `plan` by dynamic programming.

    `optPlans` maps a frozenset of relations to the cheapest operator found
    so far that covers them. Single relations are seeded with prepared table
    scans; larger sets are built by joining the best plan of `union - right`
    with the scan of a single relation `right`, using the expression of a
    join operator from the original plan whose relations all lie in `union`.

    Side effects: resets and updates `self.combsTried` and
    `self.plansProcessed`, and records costs through `self.addPlanCost`.

    Returns a `Plan` rooted at the entry for all relations. Raises KeyError
    if no plan could be built for the full relation set.
    """
    rels = set(plan.relations())
    optPlans = {}  # Maps a set of relations to the optimized plan.

    self.combsTried = 0
    self.plansProcessed = 0

    for r in rels:
      newScan = TableScan(r, self.db.relationSchema(r))
      newScan.prepare(self.db)
      optPlans[frozenset({r})] = newScan

    # For each join operator, fetch the relations it involves and index the
    # pair (involved relations, operator) under every one of those relations.
    joinMap = {}
    for (_, op) in plan.flatten():
      if isinstance(op, Join):
        relativeR = self.relativeRelations(rels, op)
        for r in relativeR:
          joinMap.setdefault(frozenset({r}), []).append((relativeR, op))

    n = len(rels)
    for i in range(2, n + 1):
      for union in [frozenset(combo) for combo in self.kRelsComb(i, rels)]:
        for right in [frozenset(combo) for combo in self.kRelsComb(1, union)]:
          left = union - right
          for t in left:
            self.combsTried += 1

            # A relation that takes part in no join has no entry; treat it as
            # "no candidate joins" instead of failing with KeyError.
            # NOTE(review): the chosen join is only required to involve `t`
            # and stay within `union`; it is not checked against `right`.
            # Confirm that is intended.
            for (joinRels, joinOp) in joinMap.get(frozenset({t}), ()):
              if not (set(joinRels).issubset(union) and left in optPlans and right in optPlans):
                continue

              self.plansProcessed += 1
              newJoin = Join(optPlans[left], optPlans[right], expr=joinOp.joinExpr, method="block-nested-loops")
              newJoin.prepare(self.db)
              newCost = newJoin.cost(estimated=True)

              # Keep the first plan found, or replace a costlier one.
              if union not in optPlans or newCost < self.getPlanCost(optPlans[union]):
                optPlans[union] = newJoin
                self.addPlanCost(newJoin, newCost)

    newRoot = optPlans[frozenset(rels)]
    return Plan(root=newRoot)

    '''
コード例 #3
0
ファイル: Optimizer.py プロジェクト: yliu120/dbsystem
    def allAccessPaths(self, operator):
        """Collect the unary access paths that feed joins below `operator`.

        Flattens the plan rooted at `operator`, takes both inputs of every
        operator whose type name ends in "Join", and returns those inputs for
        which `self.isUnaryPath` is true (left input before right input, in
        flatten order).
        """
        joinInputs = []
        for (_, op) in Plan(root=operator).flatten():
            if op.operatorType().endswith("Join"):
                joinInputs.extend((op.lhsPlan, op.rhsPlan))

        return [path for path in joinInputs if self.isUnaryPath(path)]
コード例 #4
0
    def pickJoinOrder(self, plan):
        """Build a join order greedily, one relation set at a time.

        Prepares a table scan per relation (`self.tableScans`) and an index
        from each relation to the joins that involve it (`self.joinMap`).
        The first round tries every single relation as a starting point and
        keeps the cheapest result of `self.processJoin`; each later round
        extends only the plan kept by the previous round.

        `self.processJoin(plan, rels)` is expected to return a
        `(cost, join, relations)` triple. Also resets `self.combsTried` and
        `self.plansProcessed`.

        Returns a `Plan` rooted at the plan kept by the last round (the root
        is None when no round ran or the last round produced no finite cost).
        """
        self.combsTried = 0
        self.plansProcessed = 0

        self.rels = set(plan.relations())

        # One prepared scan per base relation, keyed by singleton set.
        self.tableScans = {}
        for rel in self.rels:
            scan = TableScan(rel, self.db.relationSchema(rel))
            scan.prepare(self.db)
            self.tableScans[frozenset({rel})] = scan

        # relation -> [(relations involved in the join, join operator), ...]
        self.joinMap = {}
        for (_, op) in plan.flatten():
            if not isinstance(op, Join):
                continue
            involved = self.relativeRelations(self.rels, op)
            for rel in involved:
                self.joinMap.setdefault(frozenset({rel}), []).append(
                    (involved, op))

        keptPlan = None
        keptRels = None

        for size in range(2, len(self.rels) + 1):
            if size == 2:
                # First round: every single relation is a possible start.
                starts = ((self.tableScans[key], key)
                          for key in [frozenset({rel}) for rel in self.rels])
            else:
                # Later rounds: only extend what the previous round kept.
                starts = [(keptPlan, keptRels)]

            roundCost = float('inf')
            roundPlan = None
            roundRels = None
            for (startPlan, startRels) in starts:
                (cost, join, joinedRels) = self.processJoin(startPlan,
                                                            startRels)
                if cost < roundCost:
                    roundCost, roundPlan, roundRels = cost, join, joinedRels

            keptPlan, keptRels = roundPlan, roundRels

        return Plan(root=keptPlan)
コード例 #5
0
  def get_best_join(self, candidates, required_joins):
    """Return the root operator of the cheapest join among `candidates`.

    candidates:     iterable of (left, right) operator pairs to be joined.
    required_joins: join operators of the original plan; their expressions
                    are the only predicates that may be attached.

    For each candidate the predicate used is the expression of the first
    required join that references attributes on BOTH sides of that candidate
    (judged by the candidate inputs' own schemas); if none does, the join is
    a cartesian product ('True'). Each candidate is costed with every listed
    join algorithm by preparing and sampling a throwaway plan.

    Raises ValueError when `candidates` is empty.
    """
    # The attributes of each required join do not depend on the candidate,
    # so extract them once.
    join_attributes = [
      (join.joinExpr, set(ExpressionInfo(join.joinExpr).getAttributes()))
      for join in required_joins
    ]

    best_plan_cost = None
    best_plan = None
    for left, right in candidates:
      left_fields = set(left.schema().fields)
      right_fields = set(right.schema().fields)

      # Find the joinExpr that actually connects the two sides of this
      # candidate. If there is none, it's a cartesian product.
      relevant_expr = 'True'
      for expr, names in join_attributes:
        if names & left_fields and names & right_fields:
          relevant_expr = expr
          break

      # Construct a join plan for the current candidate, for each possible join algorithm.
      # TODO: Evaluate more than just nested loop joins, and determine feasibility of those methods.
      for algorithm in ["nested-loops", "block-nested-loops"]:
        test_plan = Plan(root = Join(
          lhsPlan = left,
          rhsPlan = right,
          method = algorithm,
          expr = relevant_expr
        ))

        # Prepare and run the plan in sampling mode, and get the estimated cost.
        test_plan.prepare(self.db)
        test_plan.sample(1.0)
        cost = test_plan.cost(estimated = True)

        # Update running best.
        if best_plan_cost is None or cost < best_plan_cost:
          best_plan_cost = cost
          best_plan = test_plan

    if best_plan is None:
      raise ValueError("get_best_join requires at least one candidate join")

    # Need to return the root operator rather than the plan itself, since it's going back into the
    # table.
    return best_plan.root
コード例 #6
0
ファイル: Optimizer.py プロジェクト: asubra12/DatabaseSystems
    def pushdownOperators(self, plan):
        """Return a new Plan whose root is `self.singlePushDown(plan.root)`."""
        return Plan(root=self.singlePushDown(plan.root))
コード例 #7
0
ファイル: Optimizer.py プロジェクト: asubra12/DatabaseSystems
    def pickJoinOrder(self, plan):
        """Choose join order and join method by dynamic programming.

        `self.optimizerSetup(plan)` supplies:
          joins           - the joins of the plan
          tableIDs        - ids of the operator on top of each table scan
                            (or the scan itself)
          optimalSubPlans - frozenset of ids -> best operator for that set
          fields          - frozenset of one id -> that input's fields
          nubPlan         - operator whose `subPlan` receives the result

        For every subset of inputs, each member is tried as the right-hand
        side against the cached plan for the rest, with every join method;
        the cheapest sampled plan is recorded via `self.addPlanCost` and in
        `optimalSubPlans` (None when nothing was joinable).

        Returns `plan` itself: unchanged when it has no joins, otherwise
        re-prepared after `nubPlan.subPlan` was replaced.
        """
        joins, tableIDs, optimalSubPlans, fields, nubPlan = self.optimizerSetup(
            plan)

        if len(joins) == 0:
            return plan

        for numTables in range(2, len(tableIDs) + 1):
            print('NumTables: ', numTables)

            # Every subset of `numTables` inputs.
            for joinOrdering in itertools.combinations(tableIDs, numTables):
                bestCost = 1e99
                bestPlan = None

                # Try each member as the right side; the rest is the left.
                for rhsID in joinOrdering:
                    lhsIDs = list(joinOrdering)
                    lhsIDs.remove(rhsID)
                    lhsKey = frozenset(lhsIDs)
                    rhsKey = frozenset([rhsID])

                    cachedLHS = optimalSubPlans.get(lhsKey)
                    cachedRHS = optimalSubPlans[rhsKey]
                    if cachedLHS is None or cachedRHS is None:
                        continue

                    # All attributes available to this join, left then right.
                    allAttributes = []
                    for lhsID in lhsIDs:
                        allAttributes.extend(fields[frozenset([lhsID])])
                    allAttributes.extend(fields[rhsKey])

                    contains, planDict = self.checkJoins(
                        joins, allAttributes, cachedLHS, cachedRHS)
                    if not contains:
                        continue

                    for joinMethod in ("hash", "nested-loops",
                                       "block-nested-loops"):
                        if joinMethod == "hash":
                            lhsHashFn = planDict['lhsHashFn']
                            rhsHashFn = planDict['rhsHashFn']
                            lhsKeySchema = planDict['lhsKeySchema']
                            rhsKeySchema = planDict['rhsKeySchema']
                            joinOp = Join(method=joinMethod,
                                          lhsPlan=cachedLHS,
                                          lhsHashFn=lhsHashFn,
                                          lhsKeySchema=lhsKeySchema,
                                          rhsPlan=cachedRHS,
                                          rhsHashFn=rhsHashFn,
                                          rhsKeySchema=rhsKeySchema)
                        else:
                            joinExpr = planDict['joinExpr']
                            joinOp = Join(lhsPlan=cachedLHS,
                                          rhsPlan=cachedRHS,
                                          method=joinMethod,
                                          expr=joinExpr)

                        # Cost the candidate by sampling a throwaway plan.
                        tryPlan = Plan(root=joinOp)
                        self.checkPlan = tryPlan
                        tryPlan.prepare(self.db)
                        tryPlan.sample(1.0)
                        cost = tryPlan.cost(estimated=True)

                        if cost < bestCost:
                            bestCost = cost
                            bestPlan = tryPlan

                newKey = frozenset(joinOrdering)
                self.addPlanCost(newKey, bestCost, bestPlan)
                optimalSubPlans[newKey] = (bestPlan.root
                                           if bestPlan is not None else None)

        # Splice the best plan for all inputs back under the nub operator.
        nubPlan.subPlan = self.statsCache[frozenset(tableIDs)][1].root

        plan.prepare(self.db)

        return plan
コード例 #8
0
ファイル: GreedyOptimizer.py プロジェクト: yliu120/dbsystem
    def joinsOptimizer(self, operator, aPaths):
        """Greedily combine the access paths `aPaths` into one hash-join tree.

        Every plan carries a cost pair (pages, total cost so far); access
        paths start at total cost 0. Each round enumerates all pairs of the
        current plans, estimates a pair as
        3 * (pages_l + pages_r) + total_l + total_r, and joins the cheapest
        pair for which `self.joinable(operator, (lhs, rhs))` yields a
        (left field, right field) pair. The two inputs are then replaced by
        the new join. If no pair is joinable the round changes nothing.

        Side effects: increments `self.pcntr` once per pair considered and
        prints the final counter. Returns the first remaining plan; an empty
        `aPaths` therefore raises IndexError.
        """
        defaultScaleFactor = 10
        defaultPartiNumber = 5

        n = len(aPaths)
        planList = []
        costList = []
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Only join reordering is considered, so an access path's own
            # total cost is taken as 0.
            planList.append(aPath)
            costList.append((numPages, 0))

        # i = 2...n
        for _ in range(1, n):
            # Enumerate every two-way join of the current plans, remembering
            # the positions of both inputs.
            m = len(planList)
            candidates = []
            for j in range(0, m - 1):
                for k in range(j + 1, m):
                    self.pcntr += 1
                    pairCost = (3 * (costList[j][0] + costList[k][0]) +
                                costList[j][1] + costList[k][1])
                    candidates.append((pairCost, j, k))

            # Cheapest first; the sort is stable, so ties keep enumeration
            # order.
            candidates.sort(key=lambda candidate: candidate[0])

            for (totalCost, j, k) in candidates:
                lhs = planList[j]
                rhs = planList[k]
                joinFields = self.joinable(operator, (lhs, rhs))
                if not joinFields:
                    continue

                (lField, rField) = joinFields
                lKeySchema = DBSchema(
                    'left',
                    [(f, t) for (f, t) in lhs.schema().schema() if f == lField])
                rKeySchema = DBSchema(
                    'right',
                    [(f, t) for (f, t) in rhs.schema().schema() if f == rField])
                lHashFn = 'hash(' + lField + ') % ' + str(defaultPartiNumber)
                rHashFn = 'hash(' + rField + ') % ' + str(defaultPartiNumber)
                newJoin = Join(lhs, rhs, method='hash',
                               lhsHashFn=lHashFn, lhsKeySchema=lKeySchema,
                               rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                newJoin.prepare(self.db)
                cards = Plan(root=newJoin).sample(defaultScaleFactor)
                pageSize, _, _ = self.db.storage.relationStats(
                    newJoin.relationId())
                pages = cards / (pageSize / newJoin.schema().size)

                # Replace the two inputs by the join. k > j, so removing k
                # first leaves position j valid.
                del planList[k], costList[k]
                del planList[j], costList[j]
                planList.append(newJoin)
                costList.append((pages, totalCost))
                break

        print("GreedyOptimizer plan considered: ", self.pcntr)
        return planList[0]
コード例 #9
0
ファイル: Optimizer2.py プロジェクト: SamBrayman/DB
 def pushdownOperators(self, plan):
     """Return a new Plan rooted at the pushed-down version of plan.root."""
     newRoot = self.pushdownOperator(plan.root)
     return Plan(root=newRoot)
コード例 #10
0
ファイル: Optimizer.py プロジェクト: yliu120/dbsystem
    def joinsOptimizer(self, operator, aPaths):
        """Grow left-deep hash-join plans level by level and cache their costs.

        Costs are (pages, total cost) pairs. Every access path is registered
        through `self.addPlanCost` with total cost 0. Each level extends every
        plan of the previous level by one access path it does not already use
        (per `self.allAccessPaths`), for every join constraint from
        `self.decodeJoinExprs(operator)` whose two fields lie on opposite
        sides. A new join costs 3 * (left pages + right pages) plus the left
        plan's total cost and is registered through `self.addPlanCost`.

        Returns the first element of `self.getPlanCost(operator)`.
        """
        defaultScaleFactor = 50
        defaultPartiNumber = 5

        def pagesOf(op):
            # Cost is measured in pages: sampled cardinality over tuples/page.
            cards = Plan(root=op).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(op.relationId())
            return cards / (pageSize / op.schema().size)

        def keySchema(name, schema, field):
            # Schema holding only `field`, with its type taken from `schema`.
            return DBSchema(name, [(f, t) for (f, t) in schema.schema()
                                   if f == field])

        def hashFn(field):
            return 'hash(' + field + ') % ' + str(defaultPartiNumber)

        # Join constraints as (field, field) pairs.
        joinExprs = self.decodeJoinExprs(operator)

        # Plans of the previous level and of the level being built.
        prev = {}
        curr = {}
        n = len(aPaths)

        # Level 1: access paths. Only join reordering is considered, so their
        # total cost is taken as 0.
        for aPath in aPaths:
            baseCost = (pagesOf(aPath), 0)
            self.addPlanCost(aPath, baseCost)
            prev[aPath] = baseCost

        # Levels 2..n.
        for _ in range(1, n):
            for p in prev:
                used = self.allAccessPaths(p)
                for base in [path for path in aPaths if path not in used]:
                    lhsSchema = p.schema()
                    rhsSchema = base.schema()
                    (sCostL, tCostL) = prev[p]
                    (rPlan, costR) = self.getPlanCost(base)

                    # System-R heuristic: only left-deep extensions, and only
                    # where a join constraint connects the two sides.
                    # Reference: Selinger, 1979, http://www.cs.berkeley.edu/~brewer/cs262/3-selinger79.pdf
                    for (lField, rField) in joinExprs:
                        # Only hash joins are built here.
                        if lField in lhsSchema.fields and rField in rhsSchema.fields:
                            leftKey = keySchema('left', lhsSchema, lField)
                            rightKey = keySchema('right', rhsSchema, rField)
                            newJoin = Join(p, rPlan, method='hash',
                                           lhsHashFn=hashFn(lField),
                                           lhsKeySchema=leftKey,
                                           rhsHashFn=hashFn(rField),
                                           rhsKeySchema=rightKey)
                        elif lField in rhsSchema.fields and rField in lhsSchema.fields:
                            # Constraint is written the other way round: the
                            # left plan is keyed on rField, the right on lField.
                            keyOnRhs = keySchema('left', rhsSchema, lField)
                            keyOnLhs = keySchema('right', lhsSchema, rField)
                            newJoin = Join(p, rPlan, method='hash',
                                           lhsHashFn=hashFn(rField),
                                           lhsKeySchema=keyOnLhs,
                                           rhsHashFn=hashFn(lField),
                                           rhsKeySchema=keyOnRhs)
                        else:
                            continue

                        # Register the join: cost 3(M+N) + M's total cost.
                        newJoin.prepare(self.db)
                        totalCost = 3 * (sCostL + costR[0]) + tCostL
                        pages = pagesOf(newJoin)
                        self.addPlanCost(newJoin, (pages, totalCost))
                        curr[newJoin] = (pages, totalCost)

            prev = curr
            curr = {}

        return self.getPlanCost(operator)[0]
コード例 #11
0
  def selectPushDown(self, plan):
    """Push selection predicates of `plan` down towards the leaves.

    Walks the operator tree with a work list of
    (current op, parent op, accumulated select predicates). Select operators
    are absorbed: their expression is split into CNF conjuncts that travel
    downwards. The conjuncts are handed to `self.placeSelect` at the point
    where they can go no further:
      - Project: predicates pass through unchanged.
      - Join: a predicate goes to every side whose fields cover it; the rest
        is placed above the join.
      - GroupBy: every accumulated predicate is placed above it.
      - any other operator with inputs (handled as a union): predicates
        covered by the union schema go to both inputs, the rest is placed
        above it.
      - leaf: every accumulated predicate is placed above it.

    `placeSelect` is assumed to return the new Select operator, or a falsy
    value when nothing was placed.

    Returns a Plan rooted at the operator of the first recorded result.
    """
    root = plan.root
    selectResult = []

    # Work list of (current op, parent op, accumulated select predicates),
    # used as a stack.
    queue = deque([(root, None, None)])

    while queue:
      (curr, parent, accuSelect) = queue.popleft()
      children = curr.inputs()

      if children:
        # A Select is absorbed into the accumulated predicates.
        if isinstance(curr, Select):
          if not accuSelect:
            accuSelect = []
          accuSelect.extend(ExpressionInfo(curr.selectExpr).decomposeCNF())

          queue.appendleft((children[0], curr, accuSelect))

        # Projects are not pushed down here; accumulated predicates can
        # always pass a project.
        elif isinstance(curr, Project):
          selectResult.append((curr, parent))
          queue.appendleft((children[0], curr, accuSelect))

        # Split the predicates three ways: left input, right input, and the
        # remainder that has to stay above the join.
        elif isinstance(curr, Join):
          leftSelect = []
          rightSelect = []
          newSelect = None
          leftFields = set(curr.lhsSchema.fields)
          rightFields = set(curr.rhsSchema.fields)
          put = []
          if accuSelect:
            for a in accuSelect:
              f = set(ExpressionInfo(a).getAttributes())
              flag = False
              if f.issubset(leftFields):
                leftSelect.append(a)
                flag = True
              if f.issubset(rightFields):
                rightSelect.append(a)
                flag = True
              if not flag:
                put.append(a)
            if put:
              newSelect = self.placeSelect(put, curr, parent, selectResult)

          if newSelect:
            selectResult.append((curr, newSelect))
          else:
            selectResult.append((curr, parent))

          queue.appendleft((curr.lhsPlan, curr, leftSelect))
          queue.appendleft((curr.rhsPlan, curr, rightSelect))

        # Predicates cannot pass a group-by; place them all here.
        elif isinstance(curr, GroupBy):
          newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)

          if newSelect:
            selectResult.append((curr, newSelect))
          else:
            selectResult.append((curr, parent))

          queue.appendleft((children[0], curr, None))

        # Union, handled like a join: a predicate over union fields goes to
        # both inputs.
        else:
          leftSelect = []
          rightSelect = []
          newSelect = None
          attrs = set(curr.unionSchema.fields)
          put = []

          if accuSelect:
            for a in accuSelect:
              f = set(ExpressionInfo(a).getAttributes())
              if f.issubset(attrs):
                leftSelect.append(a)
                rightSelect.append(a)
              else:
                put.append(a)

            # Only the predicates that could not be pushed stay above the
            # union; the others are already on their way into both inputs.
            if put:
              newSelect = self.placeSelect(put, curr, parent, selectResult)

          if newSelect:
            selectResult.append((curr, newSelect))
          else:
            selectResult.append((curr, parent))

          queue.appendleft((curr.lhsPlan, curr, leftSelect))
          queue.appendleft((curr.rhsPlan, curr, rightSelect))

      # Leaf (table scan): place everything that was accumulated.
      else:
        newSelect = self.placeSelect(accuSelect, curr, parent, selectResult)

        if newSelect:
          selectResult.append((curr, newSelect))
        else:
          selectResult.append((curr, parent))

    newRoot = selectResult[0][0]
    return Plan(root=newRoot)
コード例 #12
0
  def projectPushDown(self, plan):
    """Push projections of `plan` down past selections where possible.

    Walks the operator tree with a work list of
    (current op, parent op, accumulated projection expressions). Project
    operators are absorbed into the accumulated expressions, which are
    re-materialised as a new Project
      - above a Select that uses attributes the projection does not provide
        (otherwise the projection passes the Select),
      - above any Join, GroupBy or other operator with inputs,
      - above a leaf.
    Projections are not decomposed across the inputs of binary operators.

    `result` collects (operator, parent) pairs in visiting order. Returns a
    Plan rooted at the operator of the first pair.
    """
    root = plan.root
    result = []

    # Work list of (current op, parent op, accumulated projection), used as
    # a stack.
    queue = deque([(root, None, None)])

    while queue:
      (curr, parent, accuProject) = queue.popleft()
      children = curr.inputs()

      if children:
        # Absorb the project into the accumulated projection.
        if isinstance(curr, Project):
          if not accuProject:
            # Copy, so that later merges never modify the operator's own
            # expressions.
            accuProject = dict(curr.projectExprs)
          else:
            accuProject.update(curr.projectExprs)

          queue.appendleft((children[0], curr, accuProject))

        elif isinstance(curr, Select):
          if accuProject:
            selectAttrs = ExpressionInfo(curr.selectExpr).getAttributes()
            projectAttrs = self.getProjectAttrs(accuProject)
            if set(selectAttrs).issubset(set(projectAttrs)):
              # The projection keeps everything the select needs, so it can
              # pass. (Relative selectivity of the two operators is not
              # taken into account.)
              result.append((curr, parent))
              queue.appendleft((children[0], curr, accuProject))
            else:
              # The select needs attributes the projection would drop, so
              # the projection has to stop here.
              newProject = Project(curr, accuProject)
              result.append((newProject, parent))
              result.append((curr, newProject))
              queue.appendleft((children[0], curr, None))

          else:
            result.append((curr, parent))
            queue.appendleft((children[0], curr, accuProject))

        elif isinstance(curr, Join):
          # The projection is not split between the join inputs; it is
          # placed directly above the join.
          if accuProject:
            newProject = Project(curr, accuProject)
            result.append((newProject, parent))
            result.append((curr, newProject))
          else:
            result.append((curr, parent))
          queue.appendleft((curr.lhsPlan, curr, None))
          queue.appendleft((curr.rhsPlan, curr, None))

        elif isinstance(curr, GroupBy):
          if accuProject:
            newProject = Project(curr, accuProject)
            result.append((newProject, parent))
            result.append((curr, newProject))
          else:
            result.append((curr, parent))

          queue.appendleft((children[0], curr, None))

        else:
          # Remaining operators with inputs are treated as binary; as for
          # joins, the projection is not split between the inputs.
          if accuProject:
            newProject = Project(curr, accuProject)
            result.append((newProject, parent))
            result.append((curr, newProject))
          else:
            result.append((curr, parent))
          queue.appendleft((curr.lhsPlan, curr, None))
          queue.appendleft((curr.rhsPlan, curr, None))

      else:
        # Leaf: materialise whatever projection reached it.
        if accuProject:
          newProject = Project(curr, accuProject)
          result.append((newProject, parent))
          result.append((curr, newProject))
        else:
          result.append((curr, parent))

    newRoot = result[0][0]
    return Plan(root=newRoot)
コード例 #13
0
    def joinsOptimizer(self, operator, aPaths):
        """Enumerate hash joins over the access paths and record their costs.

        Every access path is first registered through ``addPlanCost`` with
        its estimated page count and a total cost of 0.  Then, for each
        combination ``S`` of 2..n access paths and each ``O`` yielded by
        ``self.powerSet(S)``, the cached plans for ``O`` and for the rest of
        ``S`` are fetched from ``self.statsCache``.  When ``self.joinable``
        reports a pair of join fields, a hash join of the two cached plans
        is built, costed and registered (subject to the ``isRightDeep``
        filter for joins of more than two access paths).

        Args:
            operator: Operator whose join expressions are decoded and whose
                cached entry is returned at the end.
            aPaths: Access-path operators to combine.

        Returns:
            The first element of ``self.getPlanCost(operator)``
            (presumably the best plan recorded for ``operator`` -- confirm
            against ``getPlanCost``).
        """
        defaultScaleFactor = 10
        defaultPartiNumber = 5
        # Join constraints decoded from the original operator.
        joinExprs = self.decodeJoinExprs(operator)
        n = len(aPaths)

        # Seed the cost table with the access paths; cost is measured in pages.
        for aPath in aPaths:
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Only join reordering is considered here, so an access path
            # carries a total cost of 0.
            self.addPlanCost(aPath, (numPages, 0))

        for i in range(1, n):
            for S in comb(aPaths, i + 1):
                for O in self.powerSet(S):
                    # Cache keys are the sorted operator ids of each side.
                    keyL = tuple(sorted(x.id() for x in O))
                    keyR = tuple(sorted(x.id() for x in S if x not in O))

                    # A sub-plan may be missing from self.statsCache because
                    # 1) it was filtered out as right-deep, or
                    # 2) no join constraint was associated with it.
                    if keyL not in self.statsCache or keyR not in self.statsCache:
                        continue

                    planForO, costL = self.statsCache[keyL]
                    remindPl, costR = self.statsCache[keyR]

                    # Only build a join when a constraint links both sides.
                    fields = self.joinable(joinExprs, [planForO, remindPl])
                    if fields is None:
                        continue

                    lKeySchema = DBSchema(
                        'left', [(f, t)
                                 for (f, t) in planForO.schema().schema()
                                 if f == fields[0]])
                    rKeySchema = DBSchema(
                        'right', [(f, t)
                                  for (f, t) in remindPl.schema().schema()
                                  if f == fields[1]])
                    lHashFn = 'hash(' + fields[0] + ') % ' + str(
                        defaultPartiNumber)
                    rHashFn = 'hash(' + fields[1] + ') % ' + str(
                        defaultPartiNumber)
                    newJoin = Join(planForO, remindPl, method='hash',
                                   lhsHashFn=lHashFn, lhsKeySchema=lKeySchema,
                                   rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                    # Two-way joins are always kept; larger joins are kept
                    # only when isRightDeep does not flag them.
                    if (i == 1) or (not self.isRightDeep(newJoin, aPaths)):
                        newJoin.prepare(self.db)
                        # Estimated output size in pages.
                        cards = Plan(root=newJoin).sample(defaultScaleFactor)
                        pageSize, _, _ = self.db.storage.relationStats(
                            newJoin.relationId())
                        pages = cards / (pageSize / newJoin.schema().size)
                        # Accumulated cost of both inputs plus three times
                        # their combined page counts.
                        totalCost = (costL[1] + costR[1]
                                     + 3 * (costL[0] + costR[0]))
                        # Register the new join in self.statsCache.
                        self.addPlanCost(newJoin, (pages, totalCost))

        return self.getPlanCost(operator)[0]
コード例 #14
0
    def pickJoinOrder(self, plan):
        """Pick a join order by repeatedly merging the cheapest pair of plans.

        A worklist is seeded with one plan per base relation (a table scan,
        wrapped in a ``Select`` when single-relation predicates exist).
        While more than one plan remains, every pair of worklist plans is
        joined in both orders with both the block-nested-loops and the
        nested-loops methods; the cheapest candidate (by ``cost(True)``
        after sampling) replaces its two source plans in the worklist.

        If the original root was a ``GroupBy`` it is re-applied on top of the
        result, and a ``Project`` is added when the resulting schema differs
        from the original plan's output schema.

        Args:
            plan: The plan whose joins are to be reordered.

        Returns:
            The rebuilt ``Plan``.

        Side effects:
            Resets and updates ``self.reportPlanCount`` and calls
            ``self.clearSampleFiles()`` after each evaluated pair.
        """
        relations = plan.relations()
        fieldDict = self.obtainFieldDict(plan)
        # Both dicts map a tuple of relations to the expressions involving
        # them: joining opt(A,B) with C uses the join expressions on (A,C)
        # and (B,C), topped by the select expressions on those same pairs.
        (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

        isGroupBy = plan.root.operatorType() == "GroupBy"
        outputSchema = plan.schema()
        self.reportPlanCount = 0

        # Seed the worklist with one access plan per base relation.
        worklist = []
        for r in relations:
            accessOp = TableScan(r, self.db.relationSchema(r))
            accessOp.prepare(self.db)
            if (r, ) in selectTablesDict:
                selectString = self.combineSelects(selectTablesDict[(r, )])
                accessOp = Select(accessOp, selectString)
                accessOp.prepare(self.db)
            worklist.append(Plan(root=accessOp))

        joinMethods = ("block-nested-loops", "nested-loops")
        while len(worklist) > 1:
            bestJoin = None
            sourcePair = None

            for pair in itertools.combinations(worklist, 2):
                op1 = pair[0].root
                op2 = pair[1].root

                selectExpr = self.createExpression(pair[0].relations(),
                                                   pair[1].relations(),
                                                   selectTablesDict)
                joinExpr = self.createExpression(pair[0].relations(),
                                                 pair[1].relations(),
                                                 joinTablesDict)

                # Candidate order matters for tie-breaking (first one wins):
                # BNLJ(op1, op2), BNLJ(op2, op1), NLJ(op1, op2), NLJ(op2, op1).
                candidates = []
                for method in joinMethods:
                    for lhs, rhs in ((op1, op2), (op2, op1)):
                        candidate = Join(lhs, rhs, expr=joinExpr,
                                         method=method)
                        # "True" means there is no extra predicate to apply
                        # on top of the join.
                        if selectExpr != "True":
                            candidate = Select(candidate, selectExpr)
                        candidates.append(candidate)

                for candidate in candidates:
                    joinplan = Plan(root=candidate)
                    joinplan.prepare(self.db)
                    joinplan.sample(100)

                    if (bestJoin is None
                            or joinplan.cost(True) < bestJoin.cost(True)):
                        bestJoin = joinplan
                        sourcePair = pair

                self.reportPlanCount += 4
                self.clearSampleFiles()

            # Replace the two merged plans with their join.
            worklist.remove(sourcePair[0])
            worklist.remove(sourcePair[1])
            worklist.append(bestJoin)

        # The single remaining plan covers every relation.
        newPlan = worklist[0]

        if isGroupBy:
            # Re-apply the original aggregation on top of the reordered joins.
            newGroupBy = GroupBy(newPlan.root,
                                 groupSchema=plan.root.groupSchema,
                                 aggSchema=plan.root.aggSchema,
                                 groupExpr=plan.root.groupExpr,
                                 aggExprs=plan.root.aggExprs,
                                 groupHashFn=plan.root.groupHashFn)
            newGroupBy.prepare(self.db)
            newPlan = Plan(root=newGroupBy)

        if set(outputSchema.schema()) != set(newPlan.schema().schema()):
            # Project back onto the fields of the original output schema.
            projectDict = {f: (f, t) for f, t in outputSchema.schema()}
            project = Project(newPlan.root, projectDict)
            project.prepare(self.db)
            newPlan = Plan(root=project)

        return newPlan
コード例 #15
0
    def pickJoinOrder(self, plan):
        """Pick a join order with a bottom-up dynamic-programming table.

        Pass 1 stores one access plan per base relation (a table scan,
        wrapped in a ``Select`` when single-relation predicates exist) under
        the key ``(relation,)``.  Each later pass ``k`` visits every
        combination of ``k`` relations; for each split returned by
        ``self.getCombos`` / ``self.getComplement`` it joins the stored plans
        of both sides with the block-nested-loops and nested-loops methods
        and keeps the cheapest plan (by ``cost(True)`` after sampling) under
        the sorted tuple of the combination.

        If the original root was a ``GroupBy`` it is re-applied on top of the
        result, and a ``Project`` is added when the resulting schema differs
        from the original plan's output schema.

        Args:
            plan: The plan whose joins are to be reordered.

        Returns:
            The rebuilt ``Plan``.

        Side effects:
            Resets and updates ``self.reportPlanCount`` and calls
            ``self.clearSampleFiles()`` after each evaluated split.
        """
        relations = plan.relations()
        fieldDict = self.obtainFieldDict(plan)

        # Both dicts map a tuple of relations to the expressions involving
        # them: joining opt(A,B) with C uses the join expressions on (A,C)
        # and (B,C), topped by the select expressions on those same pairs.
        (joinTablesDict, selectTablesDict) = self.getExprDicts(plan, fieldDict)

        isGroupBy = plan.root.operatorType() == "GroupBy"
        outputSchema = plan.schema()
        optDict = {}
        self.reportPlanCount = 0

        joinMethods = ("block-nested-loops", "nested-loops")
        for npass in range(1, len(relations) + 1):
            if npass == 1:
                # Base case: one access plan per relation.
                for r in relations:
                    accessOp = TableScan(r, self.db.relationSchema(r))
                    if (r, ) in selectTablesDict:
                        selectString = self.combineSelects(
                            selectTablesDict[(r, )])
                        accessOp = Select(accessOp, selectString)
                    optDict[(r, )] = Plan(root=accessOp)
                    self.reportPlanCount += 1
                continue

            for c in itertools.combinations(relations, npass):
                fullList = sorted(c)
                bestJoin = None
                for subcombo in self.getCombos(fullList):
                    complement = self.getComplement(fullList, subcombo)

                    leftOps = optDict[tuple(complement)].root
                    rightOps = optDict[tuple(subcombo)].root

                    selectExpr = self.createExpression(
                        complement, subcombo, selectTablesDict)
                    joinExpr = self.createExpression(
                        complement, subcombo, joinTablesDict)

                    # Build, prepare and sample one candidate per join
                    # method, block-nested-loops first.
                    candidates = []
                    for method in joinMethods:
                        joinOp = Join(leftOps,
                                      rightOps,
                                      expr=joinExpr,
                                      method=method)
                        # "True" means there is no extra predicate, so the
                        # Select wrapper is only built when it is needed.
                        if selectExpr == "True":
                            candidate = Plan(root=joinOp)
                        else:
                            candidate = Plan(root=Select(joinOp, selectExpr))
                        candidate.prepare(self.db)
                        candidate.sample(100)
                        candidates.append(candidate)

                    joinBnlj, joinNlj = candidates
                    # On a cost tie the nested-loops plan is preferred.
                    if joinBnlj.cost(True) < joinNlj.cost(True):
                        cheaper = joinBnlj
                    else:
                        cheaper = joinNlj

                    if (bestJoin is None
                            or cheaper.cost(True) < bestJoin.cost(True)):
                        bestJoin = cheaper

                    self.reportPlanCount += 2
                    self.clearSampleFiles()

                optDict[tuple(fullList)] = bestJoin

        # The entry covering every relation is the chosen join order.
        newPlan = optDict[tuple(sorted(relations))]

        if isGroupBy:
            # Re-apply the original aggregation on top of the reordered joins.
            newGroupBy = GroupBy(newPlan.root,
                                 groupSchema=plan.root.groupSchema,
                                 aggSchema=plan.root.aggSchema,
                                 groupExpr=plan.root.groupExpr,
                                 aggExprs=plan.root.aggExprs,
                                 groupHashFn=plan.root.groupHashFn)
            newGroupBy.prepare(self.db)
            newPlan = Plan(root=newGroupBy)

        if set(outputSchema.schema()) != set(newPlan.schema().schema()):
            # Project back onto the fields of the original output schema.
            projectDict = {f: (f, t) for f, t in outputSchema.schema()}
            project = Project(newPlan.root, projectDict)
            project.prepare(self.db)
            newPlan = Plan(root=project)

        return newPlan
コード例 #16
0
 def pushdownOperators(self, plan):
   """Rebuild `plan` from the root operator returned by `pushdownHelper`.

   When `plan` is truthy, its root operator is passed to
   `self.pushdownHelper` and the result is wrapped in a new `Plan`.
   NOTE(review): no explicit return for a falsy `plan` is visible in this
   excerpt -- confirm the intended behaviour for that case.
   """
   #raise NotImplementedError
   if plan:
     # The helper's return value becomes the root of the new plan.
     planRoot = self.pushdownHelper(plan.root)
     return Plan(root=planRoot)