Example #1
    def processAllPages(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        outputSchema = DBSchema(schema, fields)

        relIds = []
        for (pageId, page) in iter(self.subPlan):
            for tpl in page:
                group = self.groupExpr(self.subSchema.unpack(tpl))
                key = self.groupHashFn((group, None))
                relId = str(self.id()) + "u" + str(key)
                # Create each partition relation once, on first use.
                if relId not in relIds:
                    self.storage.createRelation(relId, self.subSchema)
                    relIds.append(relId)
                self.storage.insertTuple(relId, tpl)

        for rid in relIds:
            groupDict = {}
            for (pageId, page) in self.storage.pages(rid):
                for tpl in page:
                    groupKey = self.groupExpr(self.subSchema.unpack(tpl))
                    if groupKey not in groupDict:
                        groupDict[groupKey] = []
                        for trio in self.aggExprs:
                            groupDict[groupKey].append(trio[0])

                    for i in range(len(self.aggExprs)):
                        groupDict[groupKey][i] = self.aggExprs[i][1](
                            groupDict[groupKey][i], self.subSchema.unpack(tpl))

            for key in groupDict:
                for i in range(len(self.aggExprs)):
                    groupDict[key][i] = self.aggExprs[i][2](groupDict[key][i])

            for key in groupDict:
                outTuple = outputSchema.instantiate(key, *groupDict[key])
                self.emitOutputTuple(outputSchema.pack(outTuple))
        return self.storage.pages(self.relationId())
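
Note: the loops above index each aggregate specification as trio[0], trio[1], and trio[2]. A minimal sketch of the assumed shape, one (initial value, accumulate step, finalize step) triple per aggregate; the 'salary' field is hypothetical and not part of the example above:

# Hypothetical aggExprs for COUNT(*) and AVG(salary).
countExpr = (0, lambda acc, t: acc + 1, lambda acc: acc)
avgExpr = ((0, 0),
           lambda acc, t: (acc[0] + t.salary, acc[1] + 1),
           lambda acc: acc[0] / acc[1] if acc[1] else None)
aggExprs = [countExpr, avgExpr]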
Example #2
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan = lhsPlan
        self.rhsPlan = rhsPlan
        self.joinExpr = kwargs.get("expr", None)
        self.joinMethod = kwargs.get("method", None)
        self.lhsSchema = kwargs.get(
            "lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema = kwargs.get(
            "rhsSchema", None if rhsPlan is None else rhsPlan.schema())

        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn = kwargs.get("lhsHashFn", None)
        self.rhsHashFn = kwargs.get("rhsHashFn", None)

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in [
                "nested-loops", "block-nested-loops", "indexed", "hash"
        ]:
            raise ValueError("Invalid join method in join operator")

        # Check all fields are valid.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]

        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]

        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                            self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan, \
                           self.joinMethod, \
                           self.lhsSchema, self.rhsSchema] \
                          + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete join specification, missing join operator parameter"
            )

        # For now, we assume that the LHS and RHS schema have
        # disjoint attribute names, enforcing this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError(
                    "Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schema.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type
    def operatorType(self):
        readableJoinTypes = {
            'nested-loops': 'NL',
            'block-nested-loops': 'BNL',
            'indexed': 'Index',
            'hash': 'Hash'
        }
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for join operator.
    def __iter__(self):
        self.initializeOutput()
        self.outputIterator = self.processAllPages()

        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()

        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()

        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()

        elif self.joinMethod == "hash":
            return self.hashJoin()

        else:
            raise ValueError("Invalid join method in join operator")

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the lhs once per inner loop.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, rhsPage) in iter(self.rhsPlan):
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Block nested loops implementation
    #
    # This attempts to use all the free pages in the buffer pool
    # for its block of the outer relation.

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        blockIds = []
        while bufPool.numFreePages() > 0:
            try:
                pId, page = next(pageIterator)
                bufPool.getPage(pId, pinned=True)
                blockIds.append(pId)
            except StopIteration:
                pageIterator = None
                break

        return (blockIds, pageIterator)

    def blockNestedLoops(self):
        self._blockNestedLoops(iter(self.lhsPlan), iter(self.rhsPlan))
        return self.storage.pages(self.relationId())

    def _blockNestedLoops(self, lPageIter, rPageIter):
        # Materialize the RHS pages so they can be rescanned for every LHS
        # tuple; a one-shot iterator would be exhausted after the first pass.
        rhsPages = list(rPageIter)
        while lPageIter is not None:

            blockIds, lPageIter = self.accessPageBlock(self.storage.bufferPool,
                                                       lPageIter)

            for lPageId in blockIds:
                lPage = self.storage.bufferPool.getPage(lPageId)
                for lTuple in lPage:
                    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                    for (rPageId, rPage) in rhsPages:
                        for rTuple in rPage:
                            joinExprEnv.update(
                                self.loadSchema(self.rhsSchema, rTuple))

                            if self.joinExpr:
                                isValid = eval(self.joinExpr, globals(),
                                               joinExprEnv)
                            else:
                                # For some reason using this comparison causes the test to fail.
                                #
                                # lKey = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
                                # rKey = self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema)
                                # isValid = lKey == rKey
                                isValid = True

                            if isValid:
                                outputTuple = self.joinSchema.instantiate(*[
                                    joinExprEnv[f]
                                    for f in self.joinSchema.fields
                                ])
                                self.emitOutputTuple(
                                    self.joinSchema.pack(outputTuple))

                    if self.outputPages:
                        self.outputPages = [self.outputPages[-1]]

                self.storage.bufferPool.unpinPage(lPageId)

    ##################################
    #
    # Indexed nested loops implementation
    #
    # TODO: test
    def indexedNestedLoops(self):
        raise NotImplementedError

    ##################################
    #
    # Hash join implementation.
    #
    def hashJoin(self):
        lRelHashMap = self.hashPartition(self.lhsPlan, self.lhsHashFn,
                                         self.lhsSchema, "_lhs")
        rRelHashMap = self.hashPartition(self.rhsPlan, self.rhsHashFn,
                                         self.rhsSchema, "_rhs")

        for hashVal in lRelHashMap.keys():
            # A partition only joins with the matching partition on the other
            # side; skip hash values that never occurred on the RHS.
            if hashVal not in rRelHashMap:
                self.storage.removeRelation(lRelHashMap[hashVal])
                continue

            lPageIter = self.storage.pages(lRelHashMap[hashVal])
            rPageIter = self.storage.pages(rRelHashMap[hashVal])

            self._blockNestedLoops(lPageIter, rPageIter)

            self.storage.removeRelation(lRelHashMap[hashVal])
            self.storage.removeRelation(rRelHashMap[hashVal])

        return self.storage.pages(self.relationId())

    def hashPartition(self, plan, hashFn, schema, side):
        relHashMap = {}
        for (pageId, page) in iter(plan):
            for tup in page:

                hashVal = str(
                    eval(hashFn, globals(), self.loadSchema(schema, tup)))

                if hashVal not in relHashMap.keys():
                    relId = hashVal + side
                    self.storage.createRelation(relId, schema)
                    relHashMap[hashVal] = relId

                self.storage.insertTuple(relHashMap[hashVal], tup)

        return relHashMap

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"

        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"

        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + [
                    "lhsKeySchema=" + self.lhsKeySchema.toString(),
                    "rhsKeySchema=" + self.rhsKeySchema.toString(),
                    "lhsHashFn='" + self.lhsHashFn + "'",
                    "rhsHashFn='" + self.rhsHashFn + "'"
                ]))) + ")"

        return super().explain() + exprs
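
Note: a hedged sketch of constructing this operator for a hash join; DBSchema and the child plan operators come from the surrounding course codebase, and every field name below is hypothetical:

# lhsPlan / rhsPlan: assumed child operators producing 'id' and 'eid' fields.
lhsKey = DBSchema('lhsKey', [('id', 'int')])
rhsKey = DBSchema('rhsKey', [('eid', 'int')])
join = Join(lhsPlan, rhsPlan,
            method='hash',
            lhsHashFn='id % 8', lhsKeySchema=lhsKey,
            rhsHashFn='eid % 8', rhsKeySchema=rhsKey,
            expr='id == eid')
for (pageId, page) in join:
    for tup in page:
        print(join.schema().unpack(tup))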
Example #3
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    self.initializeOutput()
    self.inputFinished = False
    if not self.pipelined:
      self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    if self.pipelined:
      while not(self.inputFinished or self.isOutputPageReady()):
        try:
          pageId, page = next(self.inputIterator)
          self.processInputPage(pageId, page)
        except StopIteration:
          self.inputFinished = True

      return self.outputPage()

    else:
      return next(self.outputIterator)


  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    pinnedPages = []

    M = bufPool.numPages()
    count = 0

    try:
      while count < (M-2):
        (pageId,pageObj) = next(pageIterator) 
        bufPool.pinPage(pageId)
        pinnedPages.append((pageId, pageObj))
        count += 1
    except StopIteration:
      pass

    return pinnedPages        

  def blockNestedLoops(self):
    lIter = iter(self.lhsPlan)
    pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter)
    while (len(pinnedPages) > 0):
      for (lPageId, lhsPage) in iter(pinnedPages):
        for lTuple in lhsPage:
          # Load the lhs once per inner loop.
          joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

          for (rPageId, rhsPage) in iter(self.rhsPlan):
            for rTuple in rhsPage:
              # Load the RHS tuple fields.
              joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

              # Evaluate the join predicate, and output if we have a match.
              if eval(self.joinExpr, globals(), joinExprEnv):
                outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
                self.emitOutputTuple(self.joinSchema.pack(outputTuple))

          # No need to track anything but the last output page when in batch mode.
          if self.outputPages:
            self.outputPages = [self.outputPages[-1]]
      for (pageId, pageObj) in pinnedPages:
        self.storage.bufferPool.unpinPage(pageId)
      pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter)

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Indexed nested loops implementation
  #
  # TODO: test
  def indexedNestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
        joinKey = self.lhsKeySchema.pack(self.lhsSchema.project(self.lhsSchema.unpack(lTuple), self.lhsKeySchema))

        #matches is an iterator over tuple IDs
        matches = self.storage.fileMgr.lookupByIndex(self.rhsPlan.relationId(), self.indexId, joinKey)

        if not matches:
          continue

        for rTupleID in matches:
          rFile      = self.storage.fileMgr.relationFile(self.rhsPlan.relationId())[1]
          pId        = rTupleID.pageId
          rpage      = rFile.bufferPool.getPage(pId)
          rtupleData = rpage.getTuple(rTupleID)
          #unpack rtupleData?
          joinExprEnv.update(self.loadSchema(self.rhsSchema, rtupleData))
          if eval(self.joinExpr, globals(), joinExprEnv):
            outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
            self.emitOutputTuple(self.joinSchema.pack(outputTuple))
    return self.storage.pages(self.relationId())
 

  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    lRelIds = []
    rRelIds = []
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        hashExprEnv = self.loadSchema(self.lhsSchema, lTuple)
        tupleHash = eval(self.lhsHashFn, globals(), hashExprEnv)
        
        relId = str(self.id()) + "l" + str(tupleHash)
        self.storage.createRelation(relId, self.lhsSchema)
        self.storage.insertTuple(relId, lTuple)
        
        if str(tupleHash) not in lRelIds:
          lRelIds.append(str(tupleHash))

    for (rPageId, rhsPage) in iter(self.rhsPlan):
      for rTuple in rhsPage:
        hashExprEnv = self.loadSchema(self.rhsSchema, rTuple)
        tupleHash = eval(self.rhsHashFn, globals(), hashExprEnv)
        
        relId = str(self.id()) + "r" + str(tupleHash)
        self.storage.createRelation(relId, self.rhsSchema)
        self.storage.insertTuple(relId, rTuple)

        if str(tupleHash) not in rRelIds:
          rRelIds.append(str(tupleHash))

    # Build the bucket predicate in a local so repeated runs do not keep
    # appending key-equality conjuncts to self.joinExpr.
    joinExpr = self.joinExpr if self.joinExpr else "True"
    for k in range(len(self.lhsKeySchema.fields)):
      joinExpr += " and " + self.lhsKeySchema.fields[k] + " == " + self.rhsKeySchema.fields[k]
    for lId in lRelIds:
      if lId in rRelIds:
        ######DO BNLJ#######
    
        lIter = iter(self.storage.pages(str(self.id()) + "l" + lId))
        pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter)
        while (len(pinnedPages) > 0):
          for (lPageId, lhsPage) in iter(pinnedPages):
            for lTuple in lhsPage:
              # Load the lhs once per inner loop.
              joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

              for (rPageId, rhsPage) in iter(self.storage.pages(str(self.id()) + "r" + lId)):
                for rTuple in rhsPage:
                  # Load the RHS tuple fields.
                  joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
 
                  # Evaluate the join predicate, and output if we have a match.
                  if eval(joinExpr, globals(), joinExprEnv):
                    outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
                    self.emitOutputTuple(self.joinSchema.pack(outputTuple))

              # No need to track anything but the last output page when in batch mode.
              if self.outputPages:
                self.outputPages = [self.outputPages[-1]]
          for (pageId, pageObj) in pinnedPages:
            self.storage.bufferPool.unpinPage(pageId)
          pinnedPages = self.accessPageBlock(self.storage.bufferPool, lIter)

       ######END BNLJ######

    return self.storage.pages(self.relationId())


  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
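
Note: the predicate assembled before the bucket-wise BNLJ above is plain Python source that eval() checks per tuple pair. A small standalone illustration with hypothetical key fields ['id'] and ['eid']:

joinExpr = "True"
for lhs, rhs in zip(['id'], ['eid']):
    joinExpr += " and " + lhs + " == " + rhs
assert joinExpr == "True and id == eid"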
Example #4
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    self.initializeOutput()
    # Pipelined join operator is not supported according to constructor
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")

  # Return an iterator to the output relation
  def outputRelationIterator(self):
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    # Pass the rhs plan itself rather than a one-shot iterator, so the inner
    # loop can rescan it for every lhs tuple.
    self.runNestedLoops(iter(self.lhsPlan), self.rhsPlan, False, False, False)
    # Return an iterator to the output relation
    return self.outputRelationIterator()

  # Common function used by all types of joins
  def runNestedLoops(self, lhsPageIter, rhsPageIter, isBlock, isIndex, isHash):
    for (lPageId, lhsPage) in lhsPageIter:
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        if isIndex:
          keyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
          idxManager = self.storage.fileMgr.indexManager
          rhsPageIter = idxManager.lookupByIndex(self.indexId, keyData)

        for rhsItem in rhsPageIter:
          rhsTupleIter = None

          if isIndex:
            # Retrieve index-matched tuple from corresponding page
            page = self.storage.bufferPool.getPage(rhsItem.pageId) # rhsItem = rhsTupId
            rhsTupleIter = [page.getTuple(rhsItem)]
          else:
            # Need to scan all tuples
            rhsTupleIter = rhsItem[1] # rhsItem = (rPageId, rhsPage)

          for rTuple in rhsTupleIter:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            validJoin = False

            if isIndex:
              validJoin = True
            else:
              if isHash:
                lhsKeyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
                rhsKeyData = self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema)
                validJoin = lhsKeyData == rhsKeyData
              else:
                validJoin = True

            if self.joinExpr:
              validJoin = validJoin and eval(self.joinExpr, globals(), joinExprEnv)

            if validJoin:
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]
      
      if isBlock:
        self.storage.bufferPool.unpinPage(lPageId)

  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    pinnedPages = list()
    try:
      while bufPool.numFreePages() > 0:
        (lPageId, lhsPage) = next(pageIterator)
        bufPool.pinPage(lPageId)
        pinnedPages.append((lPageId, lhsPage))
    except StopIteration:
      pass
    return pinnedPages

  def pinPages(self, pageIterator):
    return self.accessPageBlock(self.storage.bufferPool, pageIterator)

  def blockNestedLoops(self):
    self.runBlockNestedLoops(iter(self.lhsPlan), self.rhsPlan, False)
    return self.outputRelationIterator()

  def runBlockNestedLoops(self, lhsPageIter, rhsPageIter, isHashJoin):
    pinnedPages = self.pinPages(lhsPageIter)
    # Keep running until ALL lhs pages have been processed.
    # Note: 'rhsPageIter' must be rescannable (e.g. a list or a plan), not a
    # one-shot iterator, since the inner loop runs once per lhs tuple.
    while len(pinnedPages) > 0:
      self.runNestedLoops(iter(pinnedPages), rhsPageIter, True, False, isHashJoin)
      pinnedPages = self.pinPages(lhsPageIter)

  ##################################
  #
  # Indexed nested loops implementation
  #
  def indexedNestedLoops(self):
    self.runNestedLoops(iter(self.lhsPlan), None, False, True, False)
    return self.outputRelationIterator()

  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    lhsRelIdMap = {}
    rhsRelIdMap = {}

    # Partition each relation using hash function
    self.partition(self.lhsPlan, self.lhsHashFn, self.lhsSchema, lhsRelIdMap, "lhs")
    self.partition(self.rhsPlan, self.rhsHashFn, self.rhsSchema, rhsRelIdMap, "rhs")

    # Perform block nested loop join for each bucket
    for hashValue, relId in lhsRelIdMap.items():
      lhsPageIter = self.storage.pages(relId)
      rhsPageIter = self.storage.pages(rhsRelIdMap[hashValue])

      self.runBlockNestedLoops(lhsPageIter, list(rhsPageIter), True)

    # Remove partitions
    partitionIter = itertools.chain(lhsRelIdMap.items(), rhsRelIdMap.items())
    for _, relId in partitionIter:
      self.storage.removeRelation(relId)

    return self.outputRelationIterator()
        
  # Partitions a given relation based on some hash function 
  def partition(self, plan, hashFn, schema, relIdMap, relPrefix):
    for (pageId, page) in iter(plan):
      for tup in page:
        # Compute the hash value for every tuple.
        fieldBindings = self.loadSchema(schema, tup)
        hashValue = eval(hashFn, globals(), fieldBindings)

        # Store in temporary buckets (files).
        if hashValue not in relIdMap:
          relId = str(self.id()) + "_" + relPrefix + "_" + str(hashValue)
          self.storage.createRelation(relId, schema)
          relIdMap[hashValue] = relId

        self.storage.insertTuple(relIdMap[hashValue], tup)

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
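
Note: partition() evaluates the hash function string against each tuple's unpacked field bindings. A standalone sketch of the bucket naming, with a hypothetical 'age' field and operator id 7:

hashFn = 'age % 4'
fieldBindings = {'age': 35}
hashValue = eval(hashFn, globals(), fieldBindings)   # 3
relId = str(7) + "_" + "lhs" + "_" + str(hashValue)  # "7_lhs_3"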
Example #5
class GroupBy(Operator):
  def __init__(self, subPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined group-by-aggregate operator not supported")

    self.subPlan     = subPlan
    self.subSchema   = subPlan.schema()
    self.groupSchema = kwargs.get("groupSchema", None)
    self.aggSchema   = kwargs.get("aggSchema", None)
    self.groupExpr   = kwargs.get("groupExpr", None)
    self.aggExprs    = kwargs.get("aggExprs", None)
    self.groupHashFn = kwargs.get("groupHashFn", None)

    self.validateGroupBy()
    self.initializeSchema()

  # Perform some basic checking on the group-by operator's parameters.
  def validateGroupBy(self):
    requireAllValid = [self.subPlan, \
                       self.groupSchema, self.aggSchema, \
                       self.groupExpr, self.aggExprs, self.groupHashFn ]

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete group-by specification, missing a required parameter")

    if not self.aggExprs:
      raise ValueError("Group-by needs at least one aggregate expression")

    if len(self.aggExprs) != len(self.aggSchema.fields):
      raise ValueError("Invalid aggregate fields: schema mismatch")

  # Initializes the group-by's schema as a concatenation of the group-by
  # fields and all aggregate fields.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    self.outputSchema = DBSchema(schema, fields)

  # Returns the output schema of this operator
  def schema(self):
    return self.outputSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.subPlan.schema()]

  # Returns a string describing the operator type
  def operatorType(self):
    return "GroupBy"

  # Returns child operators if present
  def inputs(self):
    return [self.subPlan]

  # Iterator abstraction for the group-by operator.
  def __iter__(self):
    self.initializeOutput()
    self.inputIterator = iter(self.subPlan)
    self.outputIterator = self.processAllPages()

    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    self.partitionFiles = {}
    for (pageId, page) in self.inputIterator:
      for tup in page:
        # Note the trailing comma: the group value is wrapped in a 1-tuple
        # before hashing.
        groupVal = self.groupExpr(self.subSchema.unpack(tup)),
        groupId = self.groupHashFn(groupVal)
        partitionRelId = "GBpartition_" + str(groupId)

        if not self.storage.hasRelation(partitionRelId):
          self.storage.createRelation(partitionRelId, self.subSchema)
          self.partitionFiles[groupId] = partitionRelId

        partFile = self.storage.fileMgr.relationFile(partitionRelId)[1]
        if partFile:
          partFile.insertTuple(tup)

    for partitionRelId in self.partitionFiles.values():
      partFile = self.storage.fileMgr.relationFile(partitionRelId)[1]

      groupDict = {}
      for (pageId, page) in partFile.pages():
        for tup in page:
          currInput = self.subSchema.unpack(tup)
          key = self.groupExpr(currInput),  # trailing comma: 1-tuple key

          if key not in groupDict:
            groupDict[key] = self.aggSchema.instantiate(*[e[0] for e in self.aggExprs])

          groupDict[key] = self.aggSchema.instantiate(\
                            *[self.aggExprs[i][1](groupDict[key][i], currInput)\
                              for i in range(len(self.aggExprs))])

      for (groupVal, aggVals) in groupDict.items():
        finalVal = self.aggSchema.instantiate(\
                              *[self.aggExprs[i][2](aggVals[i]) for i in range(len(self.aggExprs))])
        outputTuple = self.outputSchema.instantiate(*(list(groupVal) + list(finalVal)))
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    self.removePartitionFiles()

    return self.storage.pages(self.relationId())

  def removePartitionFiles(self):
    for partitionRelId in self.partitionFiles.values():
      self.storage.removeRelation(partitionRelId)
    self.partitionFiles = {}


  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                             + ", aggSchema=" + self.aggSchema.toString() + ")"
Example #6
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan = lhsPlan
        self.rhsPlan = rhsPlan
        self.joinExpr = kwargs.get("expr", None)
        self.joinMethod = kwargs.get("method", None)
        self.lhsSchema = kwargs.get(
            "lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema = kwargs.get(
            "rhsSchema", None if rhsPlan is None else rhsPlan.schema())

        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn = kwargs.get("lhsHashFn", None)
        self.rhsHashFn = kwargs.get("rhsHashFn", None)

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

        self.pidsInBlock = list()

        self.tempFileHashR = dict()
        self.outputPageHashR = dict()

        self.tempFileHashL = dict()
        self.outputPageHashL = dict()

        self.tempFile = None

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in [
                "nested-loops", "block-nested-loops", "indexed", "hash"
        ]:
            raise ValueError("Invalid join method in join operator")

        # Check all fields are valid.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]

        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]

        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                    self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan, \
                  self.joinMethod, \
                  self.lhsSchema, self.rhsSchema ] \
                  + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete join specification, missing join operator parameter"
            )

        # For now, we assume that the LHS and RHS schema have
        # disjoint attribute names, enforcing this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError(
                    "Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schema.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type
    def operatorType(self):
        readableJoinTypes = {
            'nested-loops': 'NL',
            'block-nested-loops': 'BNL',
            'indexed': 'Index',
            'hash': 'Hash'
        }
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for join operator.
    def __iter__(self):
        self.initializeOutput()
        self.inputIteratorL = iter(self.lhsPlan)
        self.inputFinished = False

        if not self.pipelined:
            if self.joinMethod == 'hash':
                self.outputIterator = self.hashJoin()
            elif self.joinMethod == 'block-nested-loops':
                self.outputIterator = self.blockNestedLoops()
            elif self.joinMethod == 'nested-loops':
                self.outputIterator = self.nestedLoops()

        return self

    def __next__(self):
        if self.pipelined:
            while not (self.inputFinished or self.isOutputPageReady()):
                try:
                    lPageId, lhsPage = next(self.inputIteratorL)
                    for lTuple in lhsPage:
                        self.compare(lTuple)
                        if self.outputPages:
                            self.outputPages = [self.outputPages[-1]]
                except StopIteration:
                    self.inputFinished = True
            return self.outputPage()

        else:
            return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()

        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()

        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()

        elif self.joinMethod == "hash":
            return self.hashJoin()

        else:
            raise ValueError("Invalid join method in join operator")

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the lhs once per inner loop.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, rhsPage) in iter(self.rhsPlan):
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                #compare(lTuple)
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    def compare(self, lTuple):
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
            for rTuple in rhsPage:
                # Load the RHS tuple fields.
                joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

                # Evaluate the join predicate, and output if we have a match.
                if eval(self.joinExpr, globals(), joinExprEnv):
                    outputTuple = self.joinSchema.instantiate(
                        *[joinExprEnv[f] for f in self.joinSchema.fields])
                    self.emitOutputTuple(self.joinSchema.pack(outputTuple))

    ##################################
    #
    # Block nested loops implementation
    #
    # This attempts to use all the free pages in the buffer pool
    # for its block of the outer relation.

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        for pid in self.pidsInBlock:
            bufPool.unpinPage(pid)
        self.pidsInBlock = list()
        M = bufPool.freeSpace()
        for i in range(0, M - 2):
            try:
                (pid, page) = next(pageIterator)
            except StopIteration:
                break
            self.pidsInBlock.append(pid)
            # getPage with pinned=True already pins the page; pinning again
            # would leave a dangling pin after the single unpin above.
            bufPool.getPage(pid, pinned=True)

    def blockNestedLoops(self):
        riter = iter(self.rhsPlan)
        buf = self.storage.bufferPool
        # Python iterators have no hasNext(); fetch the next block and stop
        # once accessPageBlock pins no pages.
        while True:
            self.accessPageBlock(buf, riter)
            if not self.pidsInBlock:
                break
            for (lPageId, lhsPage) in iter(self.lhsPlan):
                for lTuple in lhsPage:
                    # Load the lhs once per inner loop.
                    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                    for pid in self.pidsInBlock:
                        # Block pages were pinned by accessPageBlock already.
                        rhsPage = buf.getPage(pid)
                        for rTuple in rhsPage:
                            # Load the RHS tuple fields.
                            joinExprEnv.update(
                                self.loadSchema(self.rhsSchema, rTuple))

                            # Evaluate the join predicate, and output if we have a match.
                            if eval(self.joinExpr, globals(), joinExprEnv):
                                outputTuple = self.joinSchema.instantiate(*[
                                    joinExprEnv[f]
                                    for f in self.joinSchema.fields
                                ])
                                self.emitOutputTuple(
                                    self.joinSchema.pack(outputTuple))

                    # No need to track anything but the last output page when in batch mode.
                    if self.outputPages:
                        self.outputPages = [self.outputPages[-1]]
        self.accessPageBlock(buf, riter)
        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Indexed nested loops implementation
    #
    # TODO: test
    def indexedNestedLoops(self):
        raise NotImplementedError

    ##################################
    #
    # Hash join implementation.
    #
    def hashJoin(self):
        # Avoid shadowing the built-ins 'tuple' and 'hash' while partitioning.
        for (rPageId, rhsPage) in iter(self.rhsPlan):
            for tup in rhsPage:
                val = self.loadSchema(self.rhsSchema, tup)
                hashVal = eval(self.rhsHashFn, globals(), val)
                self.emitOutputTupleHash(tup, hashVal, False)

        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for tup in lhsPage:
                val = self.loadSchema(self.lhsSchema, tup)
                hashVal = eval(self.lhsHashFn, globals(), val)
                self.emitOutputTupleHash(tup, hashVal, True)

        # Build a conjunctive key-equality predicate over the key schemas.
        # (The original separator logic dropped ' and ' between conjuncts.)
        evalStr = ' and '.join(
            str(lt[0]) + ' == ' + str(rt[0])
            for (lt, rt) in zip(self.lhsKeySchema.schema(),
                                self.rhsKeySchema.schema()))
        if self.joinExpr is not None:
            evalStr += ' and ' + self.joinExpr
        for lk in self.outputPageHashL.keys():
            for rk in self.outputPageHashR.keys():
                # Only matching buckets can contain join pairs, given hash
                # functions that bucket join keys identically.
                if lk != rk:
                    continue

                riter = iter(self.outputPageHashR[rk])
                buf = self.storage.bufferPool
                M = buf.freeSpace() - 2
                size = len(self.outputPageHashR[rk])
                while size > 0:
                    self.accessPageBlock(buf, riter)
                    size -= M
                    for (lPageId, lhsPage) in iter(self.outputPageHashL[lk]):
                        for lTuple in lhsPage:
                            # Load the lhs once per inner loop.
                            joinExprEnv = self.loadSchema(
                                self.lhsSchema, lTuple)

                            for pid in self.pidsInBlock:
                                rhsPage = buf.getPage(pid)
                                for rTuple in rhsPage:
                                    # Load the RHS tuple fields.
                                    joinExprEnv.update(
                                        self.loadSchema(
                                            self.rhsSchema, rTuple))

                                    # Evaluate the join predicate, and output if we have a match.
                                    if eval(evalStr, globals(), joinExprEnv):
                                        outputTuple = self.joinSchema.instantiate(
                                            *[
                                                joinExprEnv[f]
                                                for f in self.joinSchema.fields
                                            ])
                                        self.emitOutputTuple(
                                            self.joinSchema.pack(outputTuple))

                            # No need to track anything but the last output page when in batch mode.
                            if self.outputPages:
                                self.outputPages = [self.outputPages[-1]]
                self.accessPageBlock(buf, riter)
        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())
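    # Illustrative note (not part of the original example): with lhsKeySchema
    # fields (a, b) and rhsKeySchema fields (c, d), the loop in hashJoin()
    # above builds the key predicate string "a==c and b==d", optionally
    # followed by " and <joinExpr>", and eval's it once per candidate tuple
    # pair.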

    def getRelId(self, hashVal, isLeft):
        tempstr = 'temp'
        if isLeft:
            tempstr = 'templ'
        return self.relationId() + tempstr + str(hashVal)

    def initializeOutputHash(self, hashVal, isLeft):
        relId = self.getRelId(hashVal, isLeft)

        if self.storage.hasRelation(relId):
            self.storage.removeRelation(relId)

        if isLeft:
            self.storage.createRelation(relId, self.lhsSchema)
            self.tempFileHashL[hashVal] = self.storage.fileMgr.relationFile(
                relId)[1]
            self.outputPageHashL[hashVal] = []
        else:
            self.storage.createRelation(relId, self.rhsSchema)
            self.tempFileHashR[hashVal] = self.storage.fileMgr.relationFile(
                relId)[1]
            self.outputPageHashR[hashVal] = []

    def emitOutputTupleHash(self, tupleData, hashVal, isLeft):
        if isLeft:
            if hashVal not in self.tempFileHashL:
                self.initializeOutputHash(hashVal, isLeft)
            self.currFile = self.tempFileHashL[hashVal]
            self.currOutputPages = self.outputPageHashL[hashVal]
        else:
            if hashVal not in self.tempFileHashR:
                self.initializeOutputHash(hashVal, isLeft)
            self.currFile = self.tempFileHashR[hashVal]
            self.currOutputPages = self.outputPageHashR[hashVal]

        allocatePage = not (self.currOutputPages and
                            self.currOutputPages[-1][1].header.hasFreeTuple())
        if allocatePage:
            # Flush the most recently updated output page, which updates the storage file's
            # free page list to ensure correct new page allocation.
            if self.currOutputPages:
                self.storage.bufferPool.flushPage(self.currOutputPages[-1][0])
            outputPageId = self.currFile.availablePage()
            outputPage = self.storage.bufferPool.getPage(outputPageId)
            self.currOutputPages.append((outputPageId, outputPage))
        else:
            outputPage = self.currOutputPages[-1][1]

        outputPage.insertTuple(tupleData)

        if self.sampled:
            self.estimatedCardinality += 1
        else:
            self.actualCardinality += 1

    def printerr(self, string):
        with open('err.txt', 'a') as f:
            f.write(str(string) + '\n')

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"

        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"

        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + [
                    "lhsKeySchema=" + self.lhsKeySchema.toString(),
                    "rhsKeySchema=" + self.rhsKeySchema.toString(),
                    "lhsHashFn='" + self.lhsHashFn + "'",
                    "rhsHashFn='" + self.rhsHashFn + "'"
                ]))) + ")"

        return super().explain() + exprs
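# A minimal construction sketch for the hash-join path above. This is an
# illustrative assumption, not part of the original example: the plan operands
# and any kwargs required by Operator come from the surrounding engine.
# lhsHashFn/rhsHashFn are Python expression strings eval'd over the unpacked
# tuple fields, exactly as hashJoin() does:
#
#   join = Join(lhsPlan, rhsPlan, method='hash',
#               lhsKeySchema=DBSchema('lKey', [('e_projectid', 'int')]),
#               rhsKeySchema=DBSchema('rKey', [('p_id', 'int')]),
#               lhsHashFn='e_projectid % 8', rhsHashFn='p_id % 8')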
Beispiel #7
0
class GroupBy(Operator):
  def __init__(self, subPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined group-by-aggregate operator not supported")

    self.subPlan     = subPlan
    self.subSchema   = subPlan.schema()
    self.groupSchema = kwargs.get("groupSchema", None)
    self.aggSchema   = kwargs.get("aggSchema", None)
    self.groupExpr   = kwargs.get("groupExpr", None)
    self.aggExprs    = kwargs.get("aggExprs", None)
    self.groupHashFn = kwargs.get("groupHashFn", None)

    self.validateGroupBy()
    self.initializeSchema()

  # Perform some basic checking on the group-by operator's parameters.
  def validateGroupBy(self):
    requireAllValid = [self.subPlan, \
                       self.groupSchema, self.aggSchema, \
                       self.groupExpr, self.aggExprs, self.groupHashFn ]

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete group-by specification, missing a required parameter")

    if not self.aggExprs:
      raise ValueError("Group-by needs at least one aggregate expression")

    if len(self.aggExprs) != len(self.aggSchema.fields):
      raise ValueError("Invalid aggregate fields: schema mismatch")

  # Initializes the group-by's schema as a concatenation of the group-by
  # fields and all aggregate fields.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    self.outputSchema = DBSchema(schema, fields)

  # Returns the output schema of this operator
  def schema(self):
    return self.outputSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.subPlan.schema()]

  # Returns a string describing the operator type
  def operatorType(self):
    return "GroupBy"

  # Returns child operators if present
  def inputs(self):
    return [self.subPlan]

  # Iterator abstraction for selection operator.
  def __iter__(self):
    self.initializeOutput()
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    relIdMap = {}

    # Perform partition using hash function
    self.partition(relIdMap)

    # Perform group-by operation
    for hashValue, relId in relIdMap.items():
      pageIter = self.storage.pages(relId)

      aggregationResults = {} # Stores intermediate aggregation results

      for _, page in pageIter:
        for tupleP in page:
          tupleU = self.subSchema.unpack(tupleP)
          gbVal = self.getGroupByValue(tupleU)

          # Get intermediate results for this group-by value
          intermediateResults = aggregationResults.get(gbVal, None)

          if intermediateResults is None:
            intermediateResults = list()
            aggregationResults[gbVal] = intermediateResults
            for aggExpr in self.aggExprs:
              # Form a list of initial values
              intermediateResults.append(aggExpr[0])
          
          idx = 0
          for aggExpr in self.aggExprs:
            # Perform aggregation by applying the lambda function (aggExpr[1])
            intermediateResult = intermediateResults[idx]
            intermediateResults[idx] = aggExpr[1](intermediateResult, tupleU)
            idx = idx + 1

      for gbVal, intermediateResults in aggregationResults.items():
        idx = 0
        for aggExpr in self.aggExprs:
          # Perform final step by applying the lambda function (aggExpr[2])
          intermediateResult = intermediateResults[idx]
          intermediateResults[idx] = aggExpr[2](intermediateResult)
          idx = idx + 1

        outputList = itertools.chain(gbVal, intermediateResults)
        outputTuple = self.outputSchema.instantiate(*outputList)
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Remove partitions
    for _, relId in relIdMap.items():
      self.storage.removeRelation(relId)

    return self.storage.pages(self.relationId())

  # Partitions a given relation based on some hash function 
  def partition(self, relIdMap):
    for (pageId, page) in iter(self.subPlan):
      for tupleP in page:
        # Compute hash value for every tuple
        tupleU = self.subSchema.unpack(tupleP)
        hashVal = self.groupHashFn(self.getGroupByValue(tupleU))

        # Store in temporary buckets (files)
        if not hashVal in relIdMap:
          relId = str(self.id()) + "_grp_" + str(hashVal)
          self.storage.createRelation(relId, self.subSchema)
          relIdMap[hashVal] = relId

        self.storage.insertTuple(relIdMap[hashVal], tupleP)

  def getGroupByValue(self, unpackedTuple):
    gbVal = self.groupExpr(unpackedTuple)
    # Normalize scalars to 1-tuples so hashing and output instantiation are uniform.
    return gbVal if isinstance(gbVal, tuple) else (gbVal,)

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                             + ", aggSchema=" + self.aggSchema.toString() + ")"
Beispiel #8
0
bp.setFileManager(fm)

fm.createRelation(schema.name, schema)

(fId, f) = fm.relationFile(schema.name)

f.numPages() == 0

pId  = PageId(fId, 0)
pId1 = PageId(fId, 1)

p    = Page(pageId=pId,  buffer=bytes(f.pageSize()), schema=schema)
p1   = Page(pageId=pId1, buffer=bytes(f.pageSize()), schema=schema)

for tup in [schema.pack(schema.instantiate(i, 2*i+20)) for i in range(10)]:
  _ = p.insertTuple(tup)

for tup in [schema.pack(schema.instantiate(i, i+20)) for i in range(10, 20)]:
  _ = p1.insertTuple(tup)

f.writePage(p)
f.writePage(p1)
print(p.header.usedSpace())
h1 = f.readPageHeader( pId );
print(h1)
print(h1.tupleSize)
print(h1.freeSpaceOffset)
print(h1.pageCapacity)
print(h1.usedSpace())
print(f.numPages() == 2)
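# Illustrative addition, not part of the original test: the pages built above
# are still in memory, so their tuples can be read back directly. unpack
# mirrors the pack calls used to fill them.
for tup in p:
  print(schema.unpack(tup))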
Beispiel #9
0
class GroupBy(Operator):
  def __init__(self, subPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined group-by-aggregate operator not supported")

    self.subPlan     = subPlan
    self.subSchema   = subPlan.schema()
    self.groupSchema = kwargs.get("groupSchema", None)
    self.aggSchema   = kwargs.get("aggSchema", None)
    self.groupExpr   = kwargs.get("groupExpr", None)
    self.aggExprs    = kwargs.get("aggExprs", None)
    self.groupHashFn = kwargs.get("groupHashFn", None)

    self.validateGroupBy()
    self.initializeSchema()

   
  def localCost(self, estimated):
    tupleSize = self.subPlan.schema().size
    numTuples = self.subPlan.cardinality(estimated)
    pageSize  = self.storage.bufferPool.pageSize
    numPages  = (tupleSize * numTuples) // pageSize

    # Two passes over the input: one to partition, one to aggregate.
    return 2 * numTuples * self.tupleCost
    # Alternative page-based estimate: 2 * numPages, derived from
    # http://www4.comp.polyu.edu.hk/~csmlyiu/conf/CIKM09_skygroup.pdf
    # under the assumption that G=1, so the log factor is close to 1.


  # Perform some basic checking on the group-by operator's parameters.
  def validateGroupBy(self):
    requireAllValid = [self.subPlan, \
                       self.groupSchema, self.aggSchema, \
                       self.groupExpr, self.aggExprs, self.groupHashFn ]

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete group-by specification, missing a required parameter")

    if not self.aggExprs:
      raise ValueError("Group-by needs at least one aggregate expression")

    if len(self.aggExprs) != len(self.aggSchema.fields):
      raise ValueError("Invalid aggregate fields: schema mismatch")

  # Initializes the group-by's schema as a concatenation of the group-by
  # fields and all aggregate fields.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    self.outputSchema = DBSchema(schema, fields)

  # Returns the output schema of this operator
  def schema(self):
    return self.outputSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.subPlan.schema()]

  # Returns a string describing the operator type
  def operatorType(self):
    return "GroupBy"

  # Returns child operators if present
  def inputs(self):
    return [self.subPlan]

  # Iterator abstraction for selection operator.
  def __iter__(self):
    self.initializeOutput()
    self.partitionFiles = {}
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)


  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Processing helpers
  def ensureTuple(self, x):
    if not isinstance(x, tuple):
      return (x,)
    else:
      return x

  def initialExprs(self):
    return [i[0] for i in self.aggExprs]

  def incrExprs(self):
    return [i[1] for i in self.aggExprs]

  def finalizeExprs(self):
    return [i[2] for i in self.aggExprs]

  # Set-at-a-time operator processing
  def processAllPages(self):
    # Create partitions of the input records by hashing the group-by values
    for (pageId, page) in self.subPlan:
      for tup in page:
        groupVal = self.ensureTuple(self.groupExpr(self.subSchema.unpack(tup)))
        groupId = self.groupHashFn(groupVal)
        self.emitPartitionTuple(groupId, tup)

    # We assume that the partitions fit in main memory.
    for partRelId in self.partitionFiles.values():
      partFile = self.storage.fileMgr.relationFile(partRelId)[1]

      # Use an in-memory Python dict to accumulate the aggregates.
      aggregates = {}
      for (pageId, page) in partFile.pages():
        for tup in page:
          # Evaluate group-by value.
          namedTup = self.subSchema.unpack(tup)
          groupVal = self.ensureTuple(self.groupExpr(namedTup))

          # Look up the aggregate for the group.
          if groupVal not in aggregates:
            aggregates[groupVal] = self.initialExprs()

          # Increment the aggregate.
          aggregates[groupVal] = \
            list(map( \
              lambda x: x[0](x[1], namedTup), \
              zip(self.incrExprs(), aggregates[groupVal])))

      # Finalize the aggregate value for each group.
      for (groupVal, aggVals) in aggregates.items():
        finalVals = list(map(lambda x: x[0](x[1]), zip(self.finalizeExprs(), aggVals)))
        outputTuple = self.outputSchema.instantiate(*(list(groupVal) + finalVals))
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Clean up partitions.
    self.removePartitionFiles()

    # Return an iterator for the output file.
    return self.storage.pages(self.relationId())

  # Bucket construction helpers.
  def partitionRelationId(self, partitionId):
    return self.operatorType() + str(self.id()) + "_" \
            + "part_" + str(partitionId)

  def emitPartitionTuple(self, partitionId, partitionTuple):
    partRelId  = self.partitionRelationId(partitionId)

    # Create a partition file as needed.
    if not self.storage.hasRelation(partRelId):
      self.storage.createRelation(partRelId, self.subSchema)
      self.partitionFiles[partitionId] = partRelId

    partFile = self.storage.fileMgr.relationFile(partRelId)[1]
    if partFile:
      partFile.insertTuple(partitionTuple)

  # Delete all existing partition files.
  def removePartitionFiles(self):
    for partRelId in self.partitionFiles.values():
      self.storage.removeRelation(partRelId)
    self.partitionFiles = {}


  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                             + ", aggSchema=" + self.aggSchema.toString() + ")"
Beispiel #10
0
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError(
                "Pipelined group-by-aggregate operator not supported")

        self.subPlan = subPlan
        self.subSchema = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema = kwargs.get("aggSchema", None)
        self.groupExpr = kwargs.get("groupExpr", None)
        self.aggExprs = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)

        self.validateGroupBy()
        self.initializeSchema()

        self.tempFileHash = dict()
        self.outputPageHash = dict()

        self.tempFile = None

        #self.outputSchema = self.aggSchema

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan, \
                  self.groupSchema, self.aggSchema, \
                  self.groupExpr, self.aggExprs, self.groupHashFn ]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete group-by specification, missing a required parameter"
            )

        if not self.aggExprs:
            raise ValueError(
                "Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for selection operator.
    def __iter__(self):
        self.iterator = iter(self.subPlan)
        self.acc = dict()
        self.outputIterator = self.processAllPages()
        return self

    def __next__(self):
        # Pipelined mode is rejected in __init__, so batch output is the only path.
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        self.initializeSchema()
        self.acc = dict()
        for (PageId, Page) in iter(self.subPlan):
            for Tuple in Page:
                # Load the lhs once per inner loop.
                val = self.loadSchema(self.subSchema, Tuple)

                ntup = self.subSchema.instantiate(
                    *[val[f] for f in self.subSchema.fields])

                expr = self.groupExpr(ntup)
                #self.printerr(expr)
                hash = self.groupHashFn((expr, 0))

                self.emitOutputTupleHash(Tuple, hash)
        for k in self.outputPageHash.keys():
            acc = dict()
            for i, outSchema in enumerate(self.aggSchema.schema()):
                acc[outSchema[0]] = self.aggExprs[i][0]
            for pinfo in self.outputPageHash[k]:
                page = self.storage.bufferPool.getPage(pinfo[0])
                for tup in page:
                    val = self.loadSchema(self.subSchema, tup)

                    temp = namedtuple('temp', val.keys())
                    ntup = temp._make([val[field] for field in val.keys()])
                    for outSchema in self.groupSchema.schema():
                        acc[outSchema[0]] = self.groupExpr(ntup)
                    # Accumulate with the incremental lambda only; the
                    # finalization lambda is applied once per group below.
                    for i, outSchema in enumerate(self.aggSchema.schema()):
                        acc[outSchema[0]] = self.aggExprs[i][1](
                            acc[outSchema[0]], ntup)

            for i, outSchema in enumerate(self.aggSchema.schema()):
                acc[outSchema[0]] = self.aggExprs[i][2](acc[outSchema[0]])

            outputTuple = self.outputSchema.instantiate(
                *[acc[f] for f in self.outputSchema.fields])
            self.emitOutputTuple(self.outputSchema.pack(outputTuple))
            # if self.outputPages:
            #     self.outputPages = [self.outputPages[-1]]

        return self.storage.pages(self.relationId())

    def getRelId(self, hashVal):
        return self.relationId() + 'temp' + str(hashVal)

    def initializeOutputHash(self, hashVal):
        relId = self.getRelId(hashVal)

        if self.storage.hasRelation(relId):
            self.storage.removeRelation(relId)

        self.storage.createRelation(relId, self.subSchema)
        self.tempFileHash[hashVal] = self.storage.fileMgr.relationFile(
            relId)[1]
        self.outputPageHash[hashVal] = []

    def emitOutputTupleHash(self, tupleData, hashVal):
        if hashVal not in self.tempFileHash.keys():
            self.initializeOutputHash(hashVal)

        self.currFile = self.tempFileHash[hashVal]
        self.currOutputPages = self.outputPageHash[hashVal]

        allocatePage = not (self.currOutputPages and
                            self.currOutputPages[-1][1].header.hasFreeTuple())
        if allocatePage:
            # Flush the most recently updated output page, which updates the storage file's
            # free page list to ensure correct new page allocation.
            if self.currOutputPages:
                self.storage.bufferPool.flushPage(self.currOutputPages[-1][0])
            outputPageId = self.currFile.availablePage()
            outputPage = self.storage.bufferPool.getPage(outputPageId)
            self.currOutputPages.append((outputPageId, outputPage))
        else:
            outputPage = self.currOutputPages[-1][1]

        outputPage.insertTuple(tupleData)

        # if self.sampled:
        #     self.estimatedCardinality += 1
        # else:
        #     self.actualCardinality += 1

    def printerr(self, string):
        with open('err.txt', 'a') as f:
            f.write(str(string) + '\n')

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                     + ", aggSchema=" + self.aggSchema.toString() + ")"
Beispiel #11
0
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())
    
    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]
    
    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema] 
    
    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]
    
    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join. 
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None \
          or self.storage.getIndex(self.indexId) is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL' 
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    self.initializeOutput();
    return self.processAllPages();

  def __next__(self):
    return next(self.storage.pages(self.relationId()));

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()
    
    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()
    
    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())



  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  def blockNestedLoops(self):
    bufPool = self.storage.bufferPool;
    lSchema = self.inputSchemas()[0];
    rSchema = self.inputSchemas()[1];
    
    # Assumes joinExpr is a single equality predicate, e.g. "lhsKey == rhsKey".
    lhsKey = self.joinExpr.split('==')[0].strip();
    rhsKey = self.joinExpr.split('==')[1].strip();

    self.cleanBufferPool(bufPool);
    
    self.logger("starting...")
    for pageBlock in self.accessPageBlock(bufPool, iter(self.lhsPlan)):
      self.logger("one new pageBlock...");
      hasher = dict();
    
      for lPageId in pageBlock:
        lhsPage = bufPool.getPage(lPageId);
        for lTuple in iter(lhsPage):
          tupleObj = lSchema.unpack(lTuple);
          key = getattr(tupleObj, lhsKey);
          if key in hasher:
            hasher[key].append(lTuple);
          else:
            hasher[key] = [lTuple];
      
      for (rPageId, rhsPage) in iter(self.rhsPlan):
        for rTuple in iter(rhsPage):
          tupleObj = rSchema.unpack(rTuple);
          key = getattr(tupleObj, rhsKey);
          if key in hasher:
            joinExprEnv = self.loadSchema(rSchema, rTuple);
            for lTuple in hasher[key]:
              joinExprEnv.update(self.loadSchema(lSchema, lTuple));
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]);
              outputTupleP = self.joinSchema.pack(outputTuple);
              self.storage.fileMgr.relationFile(self.relationId())[1].insertTuple(outputTupleP); 
        
      for lPageId in pageBlock:
        self.storage.bufferPool.unpinPage(lPageId);
        self.storage.bufferPool.discardPage(lPageId);
      
      self.cleanBufferPool(bufPool);
      del hasher;
      
      self.logger("ending...");
      
    return self.storage.pages(self.relationId());

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    self.cleanBufferPool( bufPool );
    pageBlock = [];
    self.inputFinished = False;
    while not(self.inputFinished):
      try:
        (pageId, page) = next(pageIterator);
        if (bufPool.numFreePages() <= 2):
          # The pool is nearly full: hand the current block to the caller
          # before pinning any more pages, then start a new block so the
          # page just fetched is not lost.
          yield pageBlock;
          pageBlock = [];
        _ = bufPool.getPage(pageId);
        bufPool.pinPage(pageId);
        pageBlock.append(pageId);
      except StopIteration:
        self.inputFinished = True;
        yield pageBlock;
      
  ##################################
  #
  # Indexed nested loops implementation
  #
  def indexedNestedLoops(self):
    raise NotImplementedError

  ##################################
  #
  # Some helper function
  #
  # clean buffer pool before use
  def cleanBufferPool(self, bufPool):

    # evict out clean pages and flush dirty pages
    # Copy the page map items, since flush/discard mutate it during iteration.
    for (pageId, (_, page, pinCount)) in list(bufPool.pageMap.items()):
      if not(pinCount == 0):
        raise RuntimeError("Unable to clean bufferpool. Memory leaks?");
      else:
        if (page.isDirty()):
          # evict with flush
          bufPool.flushPage( pageId );
        # evict without flush
        bufPool.discardPage( pageId );
  
  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    if self.joinExpr is None:
      self.joinExpr = self.lhsKeySchema.fields[0] + "==" + self.rhsKeySchema.fields[0];
    
    self.tmpFilesL = list();
    self.tmpFilesR = list();
    bufPool        = self.storage.bufferPool;
    
    self.logger("start...");
    self.cleanBufferPool(bufPool);
    
    tmpFilesL = dict();
    tmpFilesR = dict();
    
    self.logger("building L partition");
    for (PageId, Page) in iter(self.lhsPlan):
      self.buildPartitionL(PageId, Page, tmpFilesL);
    
    self.logger("building R partition");
    for (PageId, Page) in iter(self.rhsPlan):
      self.buildPartitionR(PageId, Page, tmpFilesR);
      
    # Schema prep
    lSchema = self.inputSchemas()[0];
    rSchema = self.inputSchemas()[1];
      
    for relIdLKey in tmpFilesL.keys():
       
      # Clean up before running.  
      if relIdLKey in tmpFilesR:
          (_, relIdTmpR) = tmpFilesR[ relIdLKey ];
          (_, relIdTmpL) = tmpFilesL[ relIdLKey ];
      else:
          continue;
      
      self.cleanBufferPool( bufPool );
      
      lhsPlan = TableScan(relIdTmpL, self.inputSchemas()[0]);
      rhsPlan = TableScan(relIdTmpR, self.inputSchemas()[1]);
        
      lhsPlan.storage = self.storage;
      rhsPlan.storage = self.storage;
      
      self.lhsPlan = lhsPlan;
      self.rhsPlan = rhsPlan;
      
      # Join this partition pair with a block-nested-loops pass over the
      # temporary TableScans set up above.
      _ = self.blockNestedLoops();

      self.storage.removeRelation(relIdTmpL);
      self.storage.removeRelation(relIdTmpR);

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId());
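  # buildPartitionL/buildPartitionR are invoked by hashJoin() above but not
  # shown in this example. A minimal sketch of buildPartitionL, under the
  # assumption that partitions are temp relations keyed by the lhs hash value
  # (the relation naming here is illustrative, not from the original);
  # buildPartitionR would be symmetric, using rhsHashFn and the rhs schema.
  def buildPartitionL(self, pageId, page, tmpFilesL):
    lSchema = self.inputSchemas()[0];
    for lTuple in page:
      env = self.loadSchema(lSchema, lTuple);
      key = eval(self.lhsHashFn, globals(), env);
      if key not in tmpFilesL:
        relIdTmp = self.relationId() + "_tmpL_" + str(key);
        self.storage.createRelation(relIdTmp, lSchema);
        tmpFilesL[key] = (key, relIdTmp);
      self.storage.insertTuple(tmpFilesL[key][1], lTuple);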
Beispiel #12
0
db = Database.Database()
deptSchema = DBSchema('department', [('d_id', 'int'), ('d_name', 'char(30)')]);
emplSchema = DBSchema('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')])
projSchema = DBSchema('project', [('p_id','int'), ('p_name', 'char(30)')])
gratSchema = DBSchema('grant', [('g_id','int'), ('g_projectid', 'int'), ('g_source', 'char(30)')])
synSchema1 = DBSchema('syn1', [('a','int'), ('b', 'char(30)')])
synSchema2 = DBSchema('syn2', [('c','int'), ('d', 'char(30)'), ('e','int')])

db.createRelation('department', [('d_id', 'int'), ('d_name', 'char(30)')])
db.createRelation('employee', [('e_id', 'int'), ('e_name', 'char(30)'), ('e_projectid', 'int')])
db.createRelation('project', [('p_id','int'), ('p_name', 'char(30)')])
db.createRelation('grant', [('g_id','int'), ('g_projectid', 'int'), ('g_source', 'char(30)')])
db.createRelation('syn1', [('a','int'), ('b', 'char(30)')]);
db.createRelation('syn2', [('c','int'), ('d', 'char(30)'), ('e','int')]);

for tup in [deptSchema.pack(deptSchema.instantiate(i, "Nature"+str(i))) for i in range(4000)]:
  _ = db.insertTuple('department', tup);
for tup in [deptSchema.pack(deptSchema.instantiate(i, "Science"+str(i))) for i in range(4000, 8000)]:
  _ = db.insertTuple('department', tup);
ename = ["John", "Mike", "Davis", "Alex"];
for tup in [emplSchema.pack(emplSchema.instantiate(i, ename[i%4], i%10)) for i in range(8000)]:
  _ = db.insertTuple('employee', tup);
projectName = ["CS","EE","Biophysics","Biostats","NeuroScience", "Cell Biology"];
for tup in [projSchema.pack(projSchema.instantiate(i, projectName[i%6])) for i in range(8000)]:
  _ = db.insertTuple('project', tup);
sourceName = ["NIH","NSF","Apple","Microsoft","Google"];
for tup in [gratSchema.pack(gratSchema.instantiate(i, i%2000, sourceName[i%5])) for i in range(8000)]:
  _ = db.insertTuple('grant', tup);
for tup in [synSchema1.pack(synSchema1.instantiate(i, sourceName[i%3])) for i in range(8000)]:
  _ = db.insertTuple('syn1', tup);
for tup in [synSchema2.pack(synSchema2.instantiate(i, sourceName[i%5], i%500)) for i in range(8000)]:
  _ = db.insertTuple('syn2', tup);
Beispiel #13
0
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    raise NotImplementedError

  def __next__(self):
    raise NotImplementedError

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    raise NotImplementedError

  def blockNestedLoops(self):
    raise NotImplementedError


  ##################################
  #
  # Indexed nested loops implementation
  #
  # TODO: test
  def indexedNestedLoops(self):
    raise NotImplementedError

  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    raise NotImplementedError

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
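# Minimal illustration (not from the original) of how the joinExpr strings
# consumed by eval in nestedLoops() are evaluated: loadSchema builds a
# field-name -> value environment, and the expression is an ordinary Python
# predicate over those names.
joinExprEnv = {'e_projectid': 3, 'p_id': 3}
assert eval('e_projectid == p_id', globals(), joinExprEnv)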
Beispiel #14
0
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan = lhsPlan
        self.rhsPlan = rhsPlan
        self.joinExpr = kwargs.get("expr", None)
        self.joinMethod = kwargs.get("method", None)
        self.lhsSchema = kwargs.get(
            "lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema = kwargs.get(
            "rhsSchema", None if rhsPlan is None else rhsPlan.schema())

        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn = kwargs.get("lhsHashFn", None)
        self.rhsHashFn = kwargs.get("rhsHashFn", None)

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in [
                "nested-loops", "block-nested-loops", "indexed", "hash"
        ]:
            raise ValueError("Invalid join method in join operator")

        # Check all fields are valid.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]

        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]

        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                            self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan, \
                           self.joinMethod, \
                           self.lhsSchema, self.rhsSchema ] \
                           + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete join specification, missing join operator parameter"
            )

        # For now, we assume that the LHS and RHS schema have
        # disjoint attribute names, enforcing this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError(
                    "Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schema.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None \
                or self.storage.getIndex(self.indexId) is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type
    def operatorType(self):
        readableJoinTypes = {
            'nested-loops': 'NL',
            'block-nested-loops': 'BNL',
            'indexed': 'Index',
            'hash': 'Hash'
        }
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for join operator.
    def __iter__(self):

        relId = self.relationId()

        if self.storage.hasRelation(relId):
            return self.storage.pages(relId)

        self.initializeOutput()
        self.partitionFiles = {0: {}, 1: {}}
        self.outputIterator = self.processAllPages()
        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()

        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()

        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()

        elif self.joinMethod == "hash":
            return self.hashJoin()

        else:
            raise ValueError("Invalid join method in join operator")

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the lhs once per inner loop.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, rhsPage) in iter(self.rhsPlan):
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Block nested loops implementation
    #
    # This attempts to use all the free pages in the buffer pool
    # for its block of the outer relation.

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        pageBlock = []
        try:
            while True:
                (pageId, page) = next(pageIterator)
                pageBlock.append((pageId, page))
                bufPool.pinPage(pageId)
                if bufPool.numFreePages() == 0:
                    break
        except StopIteration:
            pass

        return pageBlock

    def blockNestedLoops(self):
        # Access the outer relation's block, pinning pages in the buffer pool.
        bufPool = self.storage.bufferPool
        lhsIter = iter(self.lhsPlan)
        lPageBlock = self.accessPageBlock(bufPool, lhsIter)

        while lPageBlock:
            for (lPageId, lhsPage) in lPageBlock:
                for lTuple in lhsPage:
                    # Load the lhs once per inner loop.
                    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                    for (rPageId, rhsPage) in iter(self.rhsPlan):
                        for rTuple in rhsPage:
                            # Load the RHS tuple fields.
                            joinExprEnv.update(
                                self.loadSchema(self.rhsSchema, rTuple))

                            # Evaluate the join predicate, and output if we have a match.
                            if eval(self.joinExpr, globals(), joinExprEnv):
                                outputTuple = self.joinSchema.instantiate(*[
                                    joinExprEnv[f]
                                    for f in self.joinSchema.fields
                                ])
                                self.emitOutputTuple(
                                    self.joinSchema.pack(outputTuple))

                    # No need to track anything but the last output page when in batch mode.
                    if self.outputPages:
                        self.outputPages = [self.outputPages[-1]]

                # Unpin the page after joining with the RHS relation.
                # Thus future accesses can evict the page while reading the next block.
                bufPool.unpinPage(lPageId)

            # Move to the next page block after processing it.
            lPageBlock = self.accessPageBlock(bufPool, lhsIter)

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Indexed nested loops implementation
    #
    # TODO: test
    def indexedNestedLoops(self):
        if self.indexId:
            bufPool = self.storage.bufferPool
            for (lPageId, lhsPage) in iter(self.lhsPlan):
                for lTuple in lhsPage:
                    # Load the lhs once per inner loop.
                    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                    # Match against RHS tuples using the index.
                    joinKey = self.lhsSchema.projectBinary(
                        lTuple, self.lhsKeySchema)
                    matches = self.storage.lookupByIndex(self.indexId, joinKey)

                    for rhsTupId in matches:
                        rhsPage = bufPool.getPage(rhsTupId.pageId)
                        rTuple = rhsPage.getTuple(rhsTupId)

                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate any remaining join predicate, and output if we have a match.
                        fullMatch = eval(
                            self.joinExpr, globals(),
                            joinExprEnv) if self.joinExpr else True
                        if fullMatch:
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                    # No need to track anything but the last output page when in batch mode.
                    if self.outputPages:
                        self.outputPages = [self.outputPages[-1]]

            # Return an iterator to the output relation
            return self.storage.pages(self.relationId())

        else:
            raise ValueError(
                "No index found while using an indexed nested loops join")

    ##################################
    #
    # Hash join implementation.
    #
    def hashJoin(self):
        # Partition the LHS and RHS inputs, creating a temporary file for each partition.
        # We assume one level of partitioning is sufficient and skip recursive repartitioning.
        for (lPageId, lPage) in iter(self.lhsPlan):
            for lTuple in lPage:
                lPartEnv = self.loadSchema(self.lhsSchema, lTuple)
                lPartKey = eval(self.lhsHashFn, globals(), lPartEnv)
                self.emitPartitionTuple(lPartKey, lTuple, left=True)

        for (rPageId, rPage) in iter(self.rhsPlan):
            for rTuple in rPage:
                rPartEnv = self.loadSchema(self.rhsSchema, rTuple)
                rPartKey = eval(self.rhsHashFn, globals(), rPartEnv)
                self.emitPartitionTuple(rPartKey, rTuple, left=False)

        # Iterate over partition pairs and output matches
        # evaluating the join expression as necessary.
        for ((lPageId, lPage), (rPageId, rPage)) in self.partitionPairs():
            for lTuple in lPage:
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
                for rTuple in rPage:
                    joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
                    output = \
                      ( self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) \
                          == self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) ) \
                      and ( eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True )

                    if output:
                        outputTuple = self.joinSchema.instantiate(
                            *[joinExprEnv[f] for f in self.joinSchema.fields])
                        self.emitOutputTuple(self.joinSchema.pack(outputTuple))

            # No need to track anything but the last output page when in batch mode.
            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        # Clean up partitions.
        self.removePartitionFiles()
        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    # Hash join helpers.
    def partitionRelationId(self, left, partitionId):
        return self.operatorType() + str(self.id()) + "_" \
                + ("l" if left else "r") + "part_" + str(partitionId) + str(self.opMarker)

    def emitPartitionTuple(self, partitionId, partitionTuple, left=False):
        partRelId = self.partitionRelationId(left, partitionId)
        partSchema = self.lhsSchema if left else self.rhsSchema

        # Create a partition file as needed.
        if not self.storage.hasRelation(partRelId):
            self.storage.createRelation(partRelId, partSchema)
            self.partitionFiles[int(left)][partitionId] = partRelId

        partFile = self.storage.fileMgr.relationFile(partRelId)[1]
        if partFile:
            partFile.insertTuple(partitionTuple)

    # Return pairs of pages from matching partitions.
    def partitionPairs(self):
        lKeys = self.partitionFiles[1].keys()
        rKeys = self.partitionFiles[0].keys()
        matches = [(self.partitionFiles[1][partId], self.partitionFiles[0][partId]) \
                    for partId in lKeys if partId in rKeys]
        return PartitionIterator(matches, self.storage)

    # Delete all existing partition files.
    def removePartitionFiles(self):
        for lPartRelId in self.partitionFiles[0].values():
            self.storage.removeRelation(lPartRelId)

        for rPartRelId in self.partitionFiles[1].values():
            self.storage.removeRelation(rPartRelId)

        self.partitionFiles = {0: {}, 1: {}}

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"

        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"

        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + [
                    "lhsKeySchema=" + self.lhsKeySchema.toString(),
                    "rhsKeySchema=" + self.rhsKeySchema.toString(),
                    "lhsHashFn='" + self.lhsHashFn + "'",
                    "rhsHashFn='" + self.rhsHashFn + "'"
                ]))) + ")"

        return super().explain() + exprs

    # We override the cost model here.
    # This cost model is not compatible with the generic operators'
    # cost model, so it should not be used during join optimization.
    def localCost(self, estimated):
        # Page counts default to zero when statistics are unavailable
        # or when actual (non-estimated) costs are requested.
        l_inputPages = 0
        r_inputPages = 0
        pageBlockNum = 0
        if estimated:
            try:
                _, l_inputPages, _ = self.storage.relationStats(
                    self.lhsPlan.relationId())
                _, r_inputPages, _ = self.storage.relationStats(
                    self.rhsPlan.relationId())
                pageBlockNum = math.ceil(l_inputPages /
                                         self.storage.bufferPool.numPages())
            except Exception:
                pass

            l_inputPages *= self.sampleFactor
            r_inputPages *= self.sampleFactor

        if (self.joinMethod == "nested-loops"):
            local_cost = l_inputPages + self.lhsPlan.cardinality * r_inputPages
        elif (self.joinMethod == "block-nested-loops"):
            local_cost = l_inputPages + pageBlockNum * r_inputPages
        # We don't support indexed joins in this cost model:
        # elif (self.joinMethod == "indexed"):
        #     index_pages = self.storage.fileMgr.getIndex(self.indexId).numPages()  # Not verified with BDB index file
        #     rmatch_pages = ?
        #     local_cost = l_inputPages + self.lhsPlan.cardinality * (index_pages + rmatch_pages)
        elif (self.joinMethod == "hash"):
            local_cost = 3 * (l_inputPages + r_inputPages)
        else:
            raise ValueError("Unsupported join method in join cost model")
        return local_cost
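
To make the formulas above concrete, here is a standalone sketch (not part of the operator; all page counts, cardinalities, and the buffer size below are invented) comparing the I/O estimates of the three supported methods:

import math

# Hypothetical statistics for illustration only.
l_inputPages, r_inputPages = 100, 400
lhsCardinality = 10000
bufferPages = 50  # stands in for self.storage.bufferPool.numPages()

nl_cost = l_inputPages + lhsCardinality * r_inputPages        # 4000100
bnl_cost = l_inputPages + \
    math.ceil(l_inputPages / bufferPages) * r_inputPages      # 900
hash_cost = 3 * (l_inputPages + r_inputPages)                 # 1500: partition write, re-read, probe

print(nl_cost, bnl_cost, hash_cost)
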
Beispiel #15
0
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan = lhsPlan
        self.rhsPlan = rhsPlan
        self.joinExpr = kwargs.get("expr", None)
        self.joinMethod = kwargs.get("method", None)
        self.lhsSchema = kwargs.get(
            "lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema = kwargs.get(
            "rhsSchema", None if rhsPlan is None else rhsPlan.schema())

        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn = kwargs.get("lhsHashFn", None)
        self.rhsHashFn = kwargs.get("rhsHashFn", None)

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in [
                "nested-loops", "block-nested-loops", "indexed", "hash"
        ]:
            raise ValueError("Invalid join method in join operator")

        # Check all fields are valid.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]

        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]

        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                            self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan, \
                           self.joinMethod, \
                           self.lhsSchema, self.rhsSchema ] \
                           + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete join specification, missing join operator parameter"
            )

        # For now, we assume that the LHS and RHS schema have
        # disjoint attribute names, enforcing this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError(
                    "Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schema.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type
    def operatorType(self):
        readableJoinTypes = {
            'nested-loops': 'NL',
            'block-nested-loops': 'BNL',
            'indexed': 'Index',
            'hash': 'Hash'
        }
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for join operator.
    def __iter__(self):
        self.initializeOutput()
        # Pipelined join operator is not supported
        self.outputIterator = self.processAllPages()

        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()

        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()

        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()

        elif self.joinMethod == "hash":
            return self.hashJoin()

        else:
            raise ValueError("Invalid join method in join operator")

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the lhs once per inner loop.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, rhsPage) in iter(self.rhsPlan):
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Block nested loops implementation
    #
    # This attempts to use all the free pages in the buffer pool
    # for its block of the outer relation.

    # Flushes dirty unpinned pages and discards clean unpinned pages,
    # freeing buffer pool frames before the next outer block is pinned.
    def cleanBufferPool(self, bufPool):
        items = list(bufPool.pageMap.items())
        for (pageId, (offset, page, pinned)) in items:
            if pinned > 0:
                continue
            elif page.isDirty():
                bufPool.flushPage(pageId)
            else:
                bufPool.discardPage(pageId)

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        block_pageList = []
        self.cleanBufferPool(bufPool)
        inputNotFinished = True
        try:
            while inputNotFinished:
                (pageId, page) = next(pageIterator)
                bufPool.getPage(pageId)
                block_pageList.append(pageId)
                # Stop growing the block once only a couple of free
                # frames remain for the inner relation's pages.
                if bufPool.numFreePages() <= 2:
                    inputNotFinished = False
        except StopIteration:
            pass
        return block_pageList

    def blockNestedLoops(self):
        pageIterator = iter(self.lhsPlan)
        bufPool = self.storage.bufferPool
        block_pageList = self.accessPageBlock(bufPool, pageIterator)
        while block_pageList:
            for pageId in block_pageList:
                lhsPage = bufPool.getPage(pageId)
                for lTuple in lhsPage:
                    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
                    for (rPageId, rhsPage) in iter(self.rhsPlan):
                        for rTuple in rhsPage:
                            joinExprEnv.update(
                                self.loadSchema(self.rhsSchema, rTuple))
                            if eval(self.joinExpr, globals(), joinExprEnv):
                                outputTuple = self.joinSchema.instantiate(*[
                                    joinExprEnv[f]
                                    for f in self.joinSchema.fields
                                ])
                                self.emitOutputTuple(
                                    self.joinSchema.pack(outputTuple))
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]
                bufPool.unpinPage(pageId)
            block_pageList = self.accessPageBlock(bufPool, pageIterator)
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Indexed nested loops implementation
    #
    # TODO: test
    def indexedNestedLoops(self):
        raise NotImplementedError

    ##################################
    #
    # Hash join implementation.
    #
    def hashJoin(self):
        tmpFileL = dict()
        tmpFileR = dict()
        bufPool = self.storage.bufferPool

        self.partition(self.lhsPlan, self.lhsSchema, self.lhsHashFn, tmpFileL,
                       "lhs")
        self.partition(self.rhsPlan, self.rhsSchema, self.rhsHashFn, tmpFileR,
                       "rhs")
        if not self.joinExpr:
            # Derive an equality predicate from the hash function strings,
            # assuming they have the form "hash(<attr>) % N".
            left_id = self.lhsHashFn.split('(')[1].split(')')[0].strip()
            right_id = self.rhsHashFn.split('(')[1].split(')')[0].strip()
            self.joinExpr = left_id + ' == ' + right_id

        for key in tmpFileL.keys():
            if key in tmpFileR:
                relTmpL = tmpFileL[key]
                relTmpR = tmpFileR[key]
                pageIterator_lhs = self.storage.pages(relTmpL)
                for (lPageId, lhsPage) in pageIterator_lhs:
                    for lTuple in lhsPage:
                        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
                        pageIterator_rhs = self.storage.pages(relTmpR)
                        for (rPageId, rhsPage) in pageIterator_rhs:
                            for rTuple in rhsPage:
                                joinExprEnv.update(
                                    self.loadSchema(self.rhsSchema, rTuple))

                                if eval(self.joinExpr, globals(), joinExprEnv):

                                    outputTuple = self.joinSchema.instantiate(
                                        *[
                                            joinExprEnv[f]
                                            for f in self.joinSchema.fields
                                        ])
                                    self.emitOutputTuple(
                                        self.joinSchema.pack(outputTuple))
                                if self.outputPages:
                                    self.outputPages = [self.outputPages[-1]]
            else:
                continue

        self.cleanBufferPool(bufPool)

        for relTmp in tmpFileL.values():
            self.storage.removeRelation(relTmp)

        for relTmp in tmpFileR.values():
            self.storage.removeRelation(relTmp)

        return self.storage.pages(self.relationId())

    def partition(self, plan, schema, hashFn, tmpFile, relPrefix):
        for (pageId, page) in iter(plan):
            for tup in page:

                fieldBindings = self.loadSchema(schema, tup)
                hashValue = eval(hashFn, globals(), fieldBindings)

                # Store in temporary buckets (files).
                if hashValue not in tmpFile:
                    relId = str(
                        self.id()) + "_" + relPrefix + "_" + str(hashValue)
                    self.storage.createRelation(relId, schema)
                    tmpFile[hashValue] = relId

                self.storage.insertTuple(tmpFile[hashValue], tup)

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"

        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"

        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + [
                    "lhsKeySchema=" + self.lhsKeySchema.toString(),
                    "rhsKeySchema=" + self.rhsKeySchema.toString(),
                    "lhsHashFn='" + self.lhsHashFn + "'",
                    "rhsHashFn='" + self.rhsHashFn + "'"
                ]))) + ")"

        return super().explain() + exprs
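
The eval-based partitioning used by hashJoin above can be shown standalone; the hash function string and the field names below are invented for the sketch (CPython hashes small ints to themselves, so the bucket values are predictable):

lhsHashFn = "hash(a) % 4"                # illustrative hash function string
tuples = [(0, 'x'), (1, 'y'), (5, 'z')]  # invented (a, b) tuples

partitions = {}
for a, b in tuples:
    env = {'a': a, 'b': b}               # stands in for loadSchema(...)
    key = eval(lhsHashFn, globals(), env)
    partitions.setdefault(key, []).append((a, b))

print(partitions)  # {0: [(0, 'x')], 1: [(1, 'y'), (5, 'z')]}
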
Beispiel #16
0
gratSchema = DBSchema('grant', [('g_id', 'int'), ('g_projectid', 'int'),
                                ('g_source', 'char(30)')])
synSchema1 = DBSchema('syn1', [('a', 'int'), ('b', 'char(30)')])
synSchema2 = DBSchema('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')])

db.createRelation('department', [('d_id', 'int'), ('d_name', 'char(30)')])
db.createRelation('employee', [('e_id', 'int'), ('e_name', 'char(30)'),
                               ('e_projectid', 'int')])
db.createRelation('project', [('p_id', 'int'), ('p_name', 'char(30)')])
db.createRelation('grant', [('g_id', 'int'), ('g_projectid', 'int'),
                            ('g_source', 'char(30)')])
db.createRelation('syn1', [('a', 'int'), ('b', 'char(30)')])
db.createRelation('syn2', [('c', 'int'), ('d', 'char(30)'), ('e', 'int')])

for tup in [
        deptSchema.pack(deptSchema.instantiate(i, "Nature" + str(i)))
        for i in range(4000)
]:
    _ = db.insertTuple('department', tup)
for tup in [
        deptSchema.pack(deptSchema.instantiate(i, "Science" + str(i)))
        for i in range(4000, 8000)
]:
    _ = db.insertTuple('department', tup)
ename = ["John", "Mike", "Davis", "Alex"]
for tup in [
        emplSchema.pack(emplSchema.instantiate(i, ename[i % 4], i % 10))
        for i in range(8000)
]:
    _ = db.insertTuple('employee', tup)
projectName = [
Beispiel #17
0
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError(
                "Pipelined group-by-aggregate operator not supported")

        self.subPlan = subPlan
        self.subSchema = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema = kwargs.get("aggSchema", None)
        self.groupExpr = kwargs.get("groupExpr", None)
        self.aggExprs = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)

        self.validateGroupBy()
        self.initializeSchema()

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan, \
                           self.groupSchema, self.aggSchema, \
                           self.groupExpr, self.aggExprs, self.groupHashFn]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete group-by specification, missing a required parameter"
            )

        if not self.aggExprs:
            raise ValueError(
                "Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for the group-by operator.
    def __iter__(self):
        self.initializeOutput()
        self.outputIterator = self.processAllPages()

        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError(
            "Page-at-a-time processing not supported for group-by")

    # Set-at-a-time operator processing
    def processAllPages(self):

        relations = []

        for (pageId, page) in iter(self.subPlan):
            for tup in page:

                unpackedTup = self.subSchema.unpack(tup)
                groupByVal = tuple([self.groupExpr(unpackedTup)])
                hashVal = str(self.groupHashFn(groupByVal))

                if hashVal not in relations:
                    self.storage.createRelation(hashVal, self.subSchema)
                    relations.append(hashVal)

                self.storage.insertTuple(hashVal, tup)

        for rel in relations:
            # Accumulate aggregates across all pages of the partition,
            # not per page, so groups spanning pages are merged correctly.
            groups = {}
            for (pageId, page) in self.storage.pages(rel):
                for tup in page:

                    unpackedTup = self.subSchema.unpack(tup)
                    groupByVal = tuple([self.groupExpr(unpackedTup)])

                    if groupByVal not in groups:
                        groups[groupByVal] = [
                            aggExpr[0] for aggExpr in self.aggExprs
                        ]

                    for i, aggExpr in enumerate(self.aggExprs):
                        groups[groupByVal][i] = aggExpr[1](
                            groups[groupByVal][i], unpackedTup)

            for groupByVal in groups:
                for i, aggExpr in enumerate(self.aggExprs):
                    groups[groupByVal][i] = aggExpr[2](
                        groups[groupByVal][i])

                outputTuple = self.outputSchema.instantiate(
                    *it.chain(list(groupByVal), groups[groupByVal]))
                self.emitOutputTuple(self.outputSchema.pack(outputTuple))

            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        for rel in relations:
            self.storage.removeRelation(rel)

        return self.storage.pages(self.relationId())

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
               + ", aggSchema=" + self.aggSchema.toString() + ")"
Beispiel #18
0
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError(
                "Pipelined group-by-aggregate operator not supported")

        self.subPlan = subPlan
        self.subSchema = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema = kwargs.get("aggSchema", None)
        self.groupExpr = kwargs.get("groupExpr", None)
        self.aggExprs = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)
        self.validateGroupBy()
        self.initializeSchema()

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan, \
                           self.groupSchema, self.aggSchema, \
                           self.groupExpr, self.aggExprs, self.groupHashFn ]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete group-by specification, missing a required parameter"
            )

        if not self.aggExprs:
            raise ValueError(
                "Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for the group-by operator.
    def __iter__(self):
        self.initializeOutput()
        # Pipelined group-by operator is not supported
        self.outputIterator = self.processAllPages()

        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError(
            "Page-at-a-time processing not supported for group-by")

    # Set-at-a-time operator processing
    def processAllPages(self):
        relMap = dict()
        # Partition the input into temporary files keyed by the hash
        # of the group-by attributes.
        self.partition(relMap)

        for _, title in relMap.items():
            # Generate a pageIterator in the file
            pageIterator = self.storage.pages(title)

            # Dictionary of intermediate aggregation results, keyed by group hash.
            aggregator = {}
            # Process each tuple in each page.
            for _, page in pageIterator:
                for tup in page:
                    tuple_Unpacked = self.subSchema.unpack(tup)
                    key = self.groupExpr(tuple_Unpacked)
                    # Normalize scalar group keys into 1-tuples.
                    if not isinstance(key, tuple):
                        key = (key, )

                    val = self.groupHashFn(key)
                    intermediate_results = aggregator.get(val, None)

                    # If no intermediate results exist for this group yet, initialize them.
                    if not intermediate_results:
                        intermediate_results = list()
                        aggregator[val] = intermediate_results
                        for aggExpr in self.aggExprs:
                            intermediate_results.append(aggExpr[0])
                    index = 0
                    # Perform the aggregation function
                    for aggExpr in self.aggExprs:
                        intermediate_result = intermediate_results[index]
                        intermediate_results[index] = aggExpr[1](
                            intermediate_result, tuple_Unpacked)
                        index += 1

            for val, intermediate_results in aggregator.items():
                index = 0
                for aggExpr in self.aggExprs:
                    intermediate_result = intermediate_results[index]
                    intermediate_results[index] = aggExpr[2](
                        intermediate_result)
                    index += 1

                outputList = itertools.chain([val], intermediate_results)
                outputTuple = self.outputSchema.instantiate(*outputList)
                self.emitOutputTuple(self.outputSchema.pack(outputTuple))

            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        # remove the temporary relation created
        for _, title in relMap.items():
            self.storage.removeRelation(title)

        return self.storage.pages(self.relationId())

    def partition(self, relMap):
        for (pageId, page) in iter(self.subPlan):
            for tup in page:
                tuple_Unpacked = self.subSchema.unpack(tup)
                key = self.groupExpr(tuple_Unpacked)
                # Normalize scalar group keys into 1-tuples.
                if not isinstance(key, tuple):
                    key = (key, )

                value = self.groupHashFn(key)
                # If this key is not yet in the relation map, create a
                # temporary file to hold the tuples that hash to it.
                if value not in relMap:
                    title = str(self.id()) + "_grp_" + str(value)
                    self.storage.createRelation(title, self.subSchema)
                    relMap[value] = title
                self.storage.insertTuple(relMap[value], tup)

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                                 + ", aggSchema=" + self.aggSchema.toString() + ")"
Beispiel #19
0
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError(
                "Pipelined group-by-aggregate operator not supported")

        self.subPlan = subPlan
        self.subSchema = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema = kwargs.get("aggSchema", None)
        self.groupExpr = kwargs.get("groupExpr", None)
        self.aggExprs = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)

        self.validateGroupBy()
        self.initializeSchema()

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan, \
                           self.groupSchema, self.aggSchema, \
                           self.groupExpr, self.aggExprs, self.groupHashFn ]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete group-by specification, missing a required parameter"
            )

        if not self.aggExprs:
            raise ValueError(
                "Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for the group-by operator.
    def __iter__(self):
        self.initializeOutput()
        self.partitionFiles = {}
        self.outputIterator = self.processAllPages()
        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError(
            "Page-at-a-time processing not supported for group-by")

    # Processing helpers
    def ensureTuple(self, x):
        if not isinstance(x, tuple):
            return (x, )
        else:
            return x

    def initialExprs(self):
        return [i[0] for i in self.aggExprs]

    def incrExprs(self):
        return [i[1] for i in self.aggExprs]

    def finalizeExprs(self):
        return [i[2] for i in self.aggExprs]

    # Set-at-a-time operator processing
    def processAllPages(self):
        # Create partitions of the input records by hashing the group-by values
        for (pageId, page) in self.subPlan:
            for tup in page:
                groupVal = self.ensureTuple(
                    self.groupExpr(self.subSchema.unpack(tup)))
                groupId = self.groupHashFn(groupVal)
                self.emitPartitionTuple(groupId, tup)

        # We assume that the partitions fit in main memory.
        for partRelId in self.partitionFiles.values():
            partFile = self.storage.fileMgr.relationFile(partRelId)[1]

            # Use an in-memory Python dict to accumulate the aggregates.
            aggregates = {}
            for (pageId, page) in partFile.pages():
                for tup in page:
                    # Evaluate group-by value.
                    namedTup = self.subSchema.unpack(tup)
                    groupVal = self.ensureTuple(self.groupExpr(namedTup))

                    # Look up the aggregate for the group.
                    if groupVal not in aggregates:
                        aggregates[groupVal] = self.initialExprs()

                    # Increment the aggregate.
                    aggregates[groupVal] = \
                      list(map( \
                        lambda x: x[0](x[1], namedTup), \
                        zip(self.incrExprs(), aggregates[groupVal])))

            # Finalize the aggregate value for each group.
            for (groupVal, aggVals) in aggregates.items():
                finalVals = list(
                    map(lambda x: x[0](x[1]), zip(self.finalizeExprs(),
                                                  aggVals)))
                outputTuple = self.outputSchema.instantiate(*(list(groupVal) +
                                                              finalVals))
                self.emitOutputTuple(self.outputSchema.pack(outputTuple))

            # No need to track anything but the last output page when in batch mode.
            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        # Clean up partitions.
        self.removePartitionFiles()

        # Return an iterator for the output file.
        return self.storage.pages(self.relationId())

    # Bucket construction helpers.
    def partitionRelationId(self, partitionId):
        return self.operatorType() + str(self.id()) + "_" \
                + "part_" + str(partitionId)

    def emitPartitionTuple(self, partitionId, partitionTuple):
        partRelId = self.partitionRelationId(partitionId)

        # Create a partition file as needed.
        if not self.storage.hasRelation(partRelId):
            self.storage.createRelation(partRelId, self.subSchema)
            self.partitionFiles[partitionId] = partRelId

        partFile = self.storage.fileMgr.relationFile(partRelId)[1]
        if partFile:
            partFile.insertTuple(partitionTuple)

    # Delete all existing partition files.
    def removePartitionFiles(self):
        for partRelId in self.partitionFiles.values():
            self.storage.removeRelation(partRelId)
        self.partitionFiles = {}

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                                 + ", aggSchema=" + self.aggSchema.toString() + ")"

    def localCost(self, estimated):
        t = self.subPlan.cardinality(estimated)
        p = t / (self.storage.bufferPool.pageSize / self.subPlan.schema().size)

        # One pass writes the partitions, a second pass re-reads them.
        return 2 * p
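
A quick sketch of the reasoning above with invented sizes: p pages are written while partitioning and p pages are re-read while aggregating, so the estimate is roughly 2p page I/Os.

pageSize, tupleSize, cardinality = 4096, 64, 100000  # invented
p = cardinality / (pageSize / tupleSize)             # pages per pass
print(p, 2 * p)                                      # 1562.5 3125.0
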
Beispiel #20
0
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  def localCost(self, estimated):
    tupleSizeLeft = self.lhsPlan.schema().size
    numTuplesLeft = self.lhsPlan.cardinality(estimated)
    tupleSizeRight = self.rhsPlan.schema().size
    numTuplesRight = self.rhsPlan.cardinality(estimated)
    pageSize = self.storage.bufferPool.pageSize

    numPagesLeft = (tupleSizeLeft * numTuplesLeft) // pageSize
    numPagesRight = (tupleSizeRight * numTuplesRight) // pageSize

    if self.joinMethod == "nested-loops":
      return (numTuplesLeft * self.tupleCost * numTuplesRight * self.tupleCost) + (numTuplesLeft * self.tupleCost)
      #return (numTuplesLeft * numPagesRight) + numPagesLeft
    elif self.joinMethod == "block-nested-loops":
      return (numTuplesLeft * self.tupleCost) + (((numTuplesLeft * self.tupleCost)// (self.storage.bufferPool.numPages() - 2)) * (numTuplesRight * self.tupleCost))
      #return numPagesLeft + ((numPagesLeft // (self.storage.bufferPool.numPages() - 2)) * numPagesRight)
    elif self.joinMethod == "indexed":
      raise NotImplementedError
    elif self.joinMethod == "hash":
      return 3 * ((numTuplesLeft * self.tupleCost) + (numTuplesRight * self.tupleCost))
    else:
      return None
  
  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    self.initializeOutput()
    self.partitionFiles = {0:{}, 1:{}}
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in self.lhsPlan:
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in self.rhsPlan:
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    pageBlock = []
    try:
      while True:
        (pageId, page) = next(pageIterator)
        pageBlock.append((pageId, page))
        bufPool.pinPage(pageId)
        if bufPool.numFreePages() == 0:
          break
    except StopIteration:
      pass

    return pageBlock

  def blockNestedLoops(self):
    # Access the outer relation's block, pinning pages in the buffer pool.
    bufPool    = self.storage.bufferPool
    lhsIter    = iter(self.lhsPlan)
    lPageBlock = self.accessPageBlock(bufPool, lhsIter)

    while lPageBlock:
      for (lPageId, lhsPage) in lPageBlock:
        for lTuple in lhsPage:
          # Load the lhs once per inner loop.
          joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

          for (rPageId, rhsPage) in self.rhsPlan:
            for rTuple in rhsPage:
              # Load the RHS tuple fields.
              joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

              # Evaluate the join predicate, and output if we have a match.
              if eval(self.joinExpr, globals(), joinExprEnv):
                outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
                self.emitOutputTuple(self.joinSchema.pack(outputTuple))

          # No need to track anything but the last output page when in batch mode.
          if self.outputPages:
            self.outputPages = [self.outputPages[-1]]

        # Unpin the page after joining with the RHS relation.
        # Thus future accesses can evict the page while reading the next block.
        bufPool.unpinPage(lPageId)

      # Move to the next page block after processing it.
      lPageBlock = self.accessPageBlock(bufPool, lhsIter)

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Indexed nested loops implementation
  #
  # TODO: test
  def indexedNestedLoops(self):
    if self.indexId:
      # Validate the index before probing it.
      if self.storage.getIndex(self.indexId) is None:
        raise ValueError("Missing index in storage manager: %s" % self.indexId)
      bufPool = self.storage.bufferPool
      for (lPageId, lhsPage) in self.lhsPlan:
        for lTuple in lhsPage:
          # Load the lhs once per inner loop.
          joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

          # Match against RHS tuples using the index.
          joinKey = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
          matches = self.storage.fileMgr.lookupByIndex(self.rhsPlan.relationId(), self.indexId, joinKey)

          for rhsTupId in matches:
            rhsPage = bufPool.getPage(rhsTupId.pageId)
            rTuple  = rhsPage.getTuple(rhsTupId)

            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate any remaining join predicate, and output if we have a match.
            fullMatch = eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True
            if fullMatch:
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

          # No need to track anything but the last output page when in batch mode.
          if self.outputPages:
            self.outputPages = [self.outputPages[-1]]

      # Return an iterator to the output relation
      return self.storage.pages(self.relationId())

    else:
      raise ValueError("No index found while using an indexed nested loops join")


  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    # Partition the LHS and RHS inputs, creating a temporary file for each partition.
    # We assume one level of partitioning is sufficient and skip recursive repartitioning.
    for (lPageId, lPage) in self.lhsPlan:
      for lTuple in lPage:
        lPartEnv = self.loadSchema(self.lhsSchema, lTuple)
        lPartKey = eval(self.lhsHashFn, globals(), lPartEnv)
        self.emitPartitionTuple(lPartKey, lTuple, left=True)

    for (rPageId, rPage) in self.rhsPlan:
      for rTuple in rPage:
        rPartEnv = self.loadSchema(self.rhsSchema, rTuple)
        rPartKey = eval(self.rhsHashFn, globals(), rPartEnv)
        self.emitPartitionTuple(rPartKey, rTuple, left=False)

    # Iterate over partition pairs and output matches
    # evaluating the join expression as necessary.
    for ((lPageId, lPage), (rPageId, rPage)) in self.partitionPairs():
      for lTuple in lPage:
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
        for rTuple in rPage:
          joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
          output = \
            ( self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) \
                == self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) ) \
            and ( eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True )

          if output:
            outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
            self.emitOutputTuple(self.joinSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Clean up partitions.
    self.removePartitionFiles()

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())

  # Hash join helpers.
  def partitionRelationId(self, left, partitionId):
    return self.operatorType() + str(self.id()) + "_" \
            + ("l" if left else "r") + "part_" + str(partitionId)

  def emitPartitionTuple(self, partitionId, partitionTuple, left=False):
    partRelId  = self.partitionRelationId(left, partitionId)
    partSchema = self.lhsSchema if left else self.rhsSchema

    # Create a partition file as needed.
    if not self.storage.hasRelation(partRelId):
      self.storage.createRelation(partRelId, partSchema)
      self.partitionFiles[int(left)][partitionId] = partRelId

    partFile = self.storage.fileMgr.relationFile(partRelId)[1]
    if partFile:
      partFile.insertTuple(partitionTuple)

  # Return pairs of pages from matching partitions.
  def partitionPairs(self):
    # emitPartitionTuple stores LHS partitions under int(True) == 1 and
    # RHS partitions under 0, so pair them accordingly.
    lKeys = self.partitionFiles[1].keys()
    rKeys = self.partitionFiles[0].keys()
    matches = [(self.partitionFiles[1][partId], self.partitionFiles[0][partId]) \
                for partId in lKeys if partId in rKeys]
    return PartitionIterator(matches, self.storage)

  # Delete all existing partition files.
  def removePartitionFiles(self):
    for lPartRelId in self.partitionFiles[0].values():
      self.storage.removeRelation(lPartRelId)

    for rPartRelId in self.partitionFiles[1].values():
      self.storage.removeRelation(rPartRelId)

    self.partitionFiles = {0:{}, 1:{}}


  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
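
hashJoin above still compares the projected join keys inside each partition pair because distinct keys can collide into the same partition. A standalone sketch with an invented modulo hash:

lhs = [(1, 'l1'), (5, 'l5')]      # invented (key, payload) tuples
rhs = [(5, 'r5'), (9, 'r9')]
hashFn = lambda k: k % 4          # 1, 5 and 9 all land in partition 1

matches = [(l, r) for l in lhs for r in rhs
           if hashFn(l[0]) == hashFn(r[0])  # same partition
           and l[0] == r[0]]                # true key equality check
print(matches)                    # [((5, 'l5'), (5, 'r5'))]
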
Beispiel #21
0
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan = lhsPlan
        self.rhsPlan = rhsPlan
        self.joinExpr = kwargs.get("expr", None)
        self.joinMethod = kwargs.get("method", None)
        self.lhsSchema = kwargs.get(
            "lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema = kwargs.get(
            "rhsSchema", None if rhsPlan is None else rhsPlan.schema())

        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn = kwargs.get("lhsHashFn", None)
        self.rhsHashFn = kwargs.get("rhsHashFn", None)
        self.blockIds = []

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in [
                "nested-loops", "block-nested-loops", "indexed", "hash"
        ]:
            raise ValueError("Invalid join method in join operator")

        # Check all fields are valid.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]

        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]

        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                            self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan, \
                           self.joinMethod, \
                           self.lhsSchema, self.rhsSchema ] \
                           + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete join specification, missing join operator parameter"
            )

        # For now, we assume that the LHS and RHS schema have
        # disjoint attribute names, enforcing this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError(
                    "Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schema.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type
    def operatorType(self):
        readableJoinTypes = {
            'nested-loops': 'NL',
            'block-nested-loops': 'BNL',
            'indexed': 'Index',
            'hash': 'Hash'
        }
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for join operator.
    def __iter__(self):
        self.initializeOutput()
        return iter(self.processAllPages())

    # Tuple-at-a-time iteration is not supported; this operator is batch-only.
    def __next__(self):
        raise NotImplementedError

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()

        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()

        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()

        elif self.joinMethod == "hash":
            return self.hashJoin()

        else:
            raise ValueError("Invalid join method in join operator")

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the LHS tuple fields once per outer tuple,
                # before scanning the entire RHS.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, rhsPage) in iter(self.rhsPlan):
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Block nested loops implementation
    #
    # This uses all but one of the free pages in the buffer pool for its
    # block of the outer relation, reserving one frame for the RHS page.

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        raise NotImplementedError
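
    # A hedged sketch of what accessPageBlock could look like, using only
    # buffer-pool calls already used elsewhere in this operator
    # (getPage(..., pinned=True), numFreePages(), unpinPage()). Note that
    # blockNestedLoops below inlines this logic rather than calling it.
    #
    # def accessPageBlock(self, bufPool, pageIterator):
    #     blockSize = bufPool.numFreePages() - 1  # reserve a frame for the RHS
    #     pageIds = []
    #     for (pageId, _) in pageIterator:
    #         bufPool.getPage(pageId, pinned=True)
    #         pageIds.append(pageId)
    #         if len(pageIds) >= blockSize:
    #             break
    #     return pageIds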

    def blockJoin(self):
        for lhsPageId in self.blockIds:
            # Pages in the block were already pinned by blockNestedLoops,
            # so fetch them here without pinning them a second time.
            lhsPage = self.storage.bufferPool.getPage(lhsPageId)
            for lTuple in lhsPage:
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, _) in self.rhsPlan:
                    rhsPage = self.storage.bufferPool.getPage(rPageId,
                                                              pinned=True)

                    for rTuple in rhsPage:
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                    self.storage.bufferPool.unpinPage(rPageId)

                    if self.outputPages:
                        self.outputPages = [self.outputPages[-1]]

    def blockNestedLoops(self):
        bp = self.storage.bufferPool
        # Reserve one free frame for the rhs page we read in while joining.
        blockSize = bp.numFreePages() - 1
        # Do we need to worry about the number of output pages that we create?
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            bp.getPage(lPageId, pinned=True)
            self.blockIds.append(lPageId)

            # The block has filled every frame we can spare; join it now.
            if len(self.blockIds) == blockSize:
                self.blockJoin()
                for pId in self.blockIds:
                    bp.unpinPage(pId)
                self.blockIds = []

        # Join the final, partially filled block (if any pages remain).
        self.blockJoin()
        for pId in self.blockIds:
            bp.unpinPage(pId)
        self.blockIds = []

        return self.storage.pages(self.relationId())
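
    # Standalone illustration (hedged; not part of this operator): the same
    # block-at-a-time access pattern over plain Python lists, where blockSize
    # plays the role of the reserved buffer-pool frames.
    #
    # def blockNestedLoopsDemo(lhs, rhs, predicate, blockSize):
    #     out = []
    #     for start in range(0, len(lhs), blockSize):
    #         block = lhs[start:start + blockSize]  # "pin" a block of outer tuples
    #         for r in rhs:  # scan the inner input once per block
    #             out.extend(l + r for l in block if predicate(l, r))
    #     return out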

    ##################################
    #
    # Indexed nested loops implementation
    #
    # TODO: implement (a hedged sketch follows below).
    def indexedNestedLoops(self):
        raise NotImplementedError
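
    # A hedged sketch of an indexed nested-loops join. The index probe
    # (fileMgr.lookupByIndex) and tuple fetch (rFile.getTuple) are assumed
    # helpers, not defined in the code shown here, and rhsPlan.relationId()
    # assumes the rhs input is a base-table scan.
    #
    # def indexedNestedLoops(self):
    #     rFile = self.storage.fileMgr.relationFile(
    #         self.rhsPlan.relationId())[1]
    #     for (lPageId, lhsPage) in iter(self.lhsPlan):
    #         for lTuple in lhsPage:
    #             joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
    #             key = self.lhsSchema.project(
    #                 self.lhsSchema.unpack(lTuple), self.lhsKeySchema)
    #             # Probe the rhs index for matching tuple ids (assumed API).
    #             for rTupleId in self.storage.fileMgr.lookupByIndex(
    #                     self.rhsPlan.relationId(), self.indexId, key):
    #                 rTuple = rFile.getTuple(rTupleId)
    #                 joinExprEnv.update(
    #                     self.loadSchema(self.rhsSchema, rTuple))
    #                 outputTuple = self.joinSchema.instantiate(
    #                     *[joinExprEnv[f] for f in self.joinSchema.fields])
    #                 self.emitOutputTuple(self.joinSchema.pack(outputTuple))
    #     return self.storage.pages(self.relationId())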

    ##################################
    #
    # Hash join implementation (grace hash join):
    #   1. Partition both the lhs and rhs into bucket files by join-key hash.
    #   2. Read back one pair of matching bucket files at a time.
    #   3. Join the tuples within each bucket pair.
    #
    def hashJoin(self):
        bp = self.storage.bufferPool

        lPartitions = self.partitionPlan('L', self.lhsPlan, self.lhsHashFn,
                                         self.lhsSchema, self.lhsKeySchema)
        rPartitions = self.partitionPlan('R', self.rhsPlan, self.rhsHashFn,
                                         self.rhsSchema, self.rhsKeySchema)

        for lKey in lPartitions:  # Iterate over every bucket of the lhs partition
            lRelId = lPartitions[lKey]
            lFile = self.storage.fileMgr.relationFile(lRelId)[1]

            # Only the matching rhs bucket can contain join partners, and
            # within a bucket we compare keys directly instead of evaluating
            # a join expression.
            if lKey in rPartitions:
                rRelId = rPartitions[lKey]
                # The file holding all rhs tuples that hashed to this bucket.
                rFile = self.storage.fileMgr.relationFile(rRelId)[1]
                lPages = lFile.pages(pinned=True)

                for (lPageId, lPage) in lPages:
                    for lTuple in lPage:
                        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
                        # The lhs key depends only on lTuple, so compute it
                        # once here rather than inside the rhs loops.
                        lKeyCheck = self.lhsSchema.project(
                            self.lhsSchema.unpack(lTuple), self.lhsKeySchema)

                        rPages = rFile.pages(pinned=True)
                        for (rPageId, rPage) in rPages:
                            for rTuple in rPage:
                                rKeyCheck = self.rhsSchema.project(
                                    self.rhsSchema.unpack(rTuple),
                                    self.rhsKeySchema)

                                if lKeyCheck == rKeyCheck:
                                    joinExprEnv.update(
                                        self.loadSchema(
                                            self.rhsSchema, rTuple))
                                    outputTuple = self.joinSchema.instantiate(
                                        *[
                                            joinExprEnv[f]
                                            for f in self.joinSchema.fields
                                        ])
                                    self.emitOutputTuple(
                                        self.joinSchema.pack(outputTuple))

                            bp.unpinPage(rPageId)

                    bp.unpinPage(lPageId)

        for key in lPartitions:
            self.storage.removeRelation(lPartitions[key])
        for key in rPartitions:
            self.storage.removeRelation(rPartitions[key])

        if self.outputPages:
            self.outputPages = [self.outputPages[-1]]

        return self.storage.pages(self.relationId())

    def partitionPlan(self, planSide, plan, hashFn, planSchema, keySchema):
        partitionFiles = {}

        for (pageId, page) in plan:
            for tup in page:
                joinExprEnv = self.loadSchema(planSchema, tup)
                bucket = eval(hashFn, globals(), joinExprEnv)

                # Create the partition relation on first use of this bucket.
                if bucket not in partitionFiles:
                    relId = self.relationId() + '_' + planSide + '_' + str(
                        bucket)
                    if self.storage.hasRelation(relId):
                        self.storage.removeRelation(relId)
                    self.storage.createRelation(relId, planSchema)
                    partitionFiles[bucket] = relId

                file = self.storage.fileMgr.relationFile(
                    partitionFiles[bucket])[1]
                file.insertTuple(tup)

        return partitionFiles
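
    # Hedged example of the hash-function arguments this scheme expects:
    # each hashFn is a Python expression string, evaluated with eval() against
    # the tuple's unpacked fields. The field names here are illustrative only.
    #
    #   lhsHashFn = "hash(id) % 4"       # 'id' must be a field of lhsSchema
    #   rhsHashFn = "hash(deptId) % 4"   # 'deptId' must be a field of rhsSchema
    #
    # Both sides must use the same bucket count, so that tuples with equal
    # join keys land in matching partitions.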

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"

        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"

        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + [
                    "lhsKeySchema=" + self.lhsKeySchema.toString(),
                    "rhsKeySchema=" + self.rhsKeySchema.toString(),
                    "lhsHashFn='" + self.lhsHashFn + "'",
                    "rhsHashFn='" + self.rhsHashFn + "'"
                ]))) + ")"

        return super().explain() + exprs
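

# Standalone, runnable sketch (hedged; independent of the storage engine
# above) of the partition-then-join scheme that hashJoin and partitionPlan
# implement. Tuples are plain Python tuples; the key extractors and field
# positions are illustrative assumptions.
def graceHashJoinDemo(lhs, rhs, lhsKey, rhsKey, numBuckets=2):
    # Phase 1: partition both inputs by the hash of their join key.
    lParts, rParts = {}, {}
    for t in lhs:
        lParts.setdefault(hash(lhsKey(t)) % numBuckets, []).append(t)
    for t in rhs:
        rParts.setdefault(hash(rhsKey(t)) % numBuckets, []).append(t)

    # Phase 2: join within matching buckets only, comparing keys directly.
    out = []
    for bucket, lTuples in lParts.items():
        for l in lTuples:
            for r in rParts.get(bucket, []):
                if lhsKey(l) == rhsKey(r):
                    out.append(l + r)
    return out


# Example: joining employees to departments on the first field.
# graceHashJoinDemo([(1, 'alice'), (2, 'bob')], [(2, 'sales')],
#                   lhsKey=lambda t: t[0], rhsKey=lambda t: t[0])
# -> [(2, 'bob', 2, 'sales')]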