Example 1
def query2Hash_test(db, printOutput):
    query2 = db.query().fromTable('part') \
        .join(db.query().fromTable('lineitem').where('L_RETURNFLAG == \'R\''),
              rhsSchema=db.relationSchema('lineitem'),
              method='hash',
              lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]),
              rhsHashFn='hash(L_PARTKEY) % 111', rhsKeySchema=DBSchema('L_PARTKEY', [('L_PARTKEY', 'int')])) \
            .groupBy(groupSchema=DBSchema('P_NAME', [('P_NAME', 'char(55)')]),
                    aggSchema=DBSchema('COUNT', [('COUNT', 'int')]),
                    groupExpr=(lambda e: e.P_NAME),
                    aggExprs=[(0, lambda acc, e: acc + 1, lambda x: x)],
                    groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .finalize()

    start = time.time()
    processedQuery = [
        query2.schema().unpack(tup) for page in db.processQuery(query2)
        for tup in page[1]
    ]
    end = time.time()
    duration = end - start
    print('Query: Query 2, Hash')
    if printOutput:
        print('Results: ', processedQuery)
    print('Time', duration)
    return
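The timing and unpacking boilerplate above recurs in every *_test function in this collection. A minimal helper capturing the pattern, assuming (as all the examples do) that db.processQuery yields (pageId, page) pairs whose second element iterates packed tuples:

import time

def runTimedQuery(db, plan, name, printOutput=False):
    # Hedged sketch: factors out the result-collection pattern shared by the
    # *_test functions in these examples.
    start = time.time()
    results = [plan.schema().unpack(tup)
               for page in db.processQuery(plan)
               for tup in page[1]]
    print('Query:', name)
    if printOutput:
        print('Results: ', results)
    print('Time', time.time() - start)
    return results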
Example 2
def query1(db):
    query = db.query().fromTable('lineitem')\
        .where(
            "(L_SHIPDATE >= 19940101) and (L_SHIPDATE < 19950101) and (0.06-0.01 <= L_DISCOUNT <= 0.06 + 0.01) and (L_QUANTITY < 24")\
        .groupBy(
            groupSchema=DBSchema('groupKey', [('ONE', 'int')]),
            groupExpr=(lambda e: 1),
            aggSchema=DBSchema('groupBy', [('revenue', 'float')]),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * e.L_DISCOUNT), lambda x: x)], \
            groupHashFn=(lambda gbVal: hash(gbVal) % 1))\
        .select(
            {'revenue': ('revenue', 'float')})\
        .finalize()
    return query
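Executing the returned plan follows the same pattern as Example 1; for instance, with the runTimedQuery helper sketched there (a local convenience, not part of the library):

results = runTimedQuery(db, query1(db), 'TPC-H Q6')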
Example 3
 def testHashJoin(self):
     schema = self.db.relationSchema('employee')
     e2schema = schema.rename('employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'})
     keySchema = DBSchema('employeeKey', [('id', 'int')])
     keySchema2 = DBSchema('employeeKey2', [('id2', 'int')])
     hashJoin = self.db.query().fromTable('employee').join( \
         self.db.query().fromTable('employee'), \
         rhsSchema=e2schema, \
         method='hash', \
         lhsHashFn='hash(id) % 4', lhsKeySchema=keySchema, \
         rhsHashFn='hash(id2) % 4', rhsKeySchema=keySchema2, \
         ).finalize()
     results = self.getResults(hashJoin)
     self.assertEqual(len(results), self.numEmployees)
Example 4
 def validateSchema(self):
     if self.lhsPlan.schema().match(self.rhsPlan.schema()):
         schemaName = self.operatorType() + str(self.id())
         schemaFields = self.lhsPlan.schema().schema()
         self.unionSchema = DBSchema(schemaName, schemaFields)
     else:
         raise ValueError("Union operator type error, mismatched input schemas")
Example 5
 def unpack(cls, buffer):
     brepr = cls.binrepr(buffer)
     values = brepr.unpack_from(buffer)
     if len(values) == 7:
         pageClass = pickle.loads(values[5])
         schema = DBSchema.unpackSchema(values[6])
         return FileHeader(numTuples=values[1], pageSize=values[2], pageClass=pageClass, schema=schema)
Example 6
 def unpack(cls, buffer):
   brepr  = cls.binrepr(buffer)
   values = brepr.unpack_from(buffer)
   if len(values) == 6:
     pageClass = pickle.loads(values[4])
     schema    = DBSchema.unpackSchema(values[5])
     return FileHeader(pageSize=values[1], pageClass=pageClass, schema=schema)
Example 7
 def createRelation(self, relationName, relationFields):
     if relationName not in self.relationMap:
         schema = DBSchema(relationName, relationFields)
         self.relationMap[relationName] = schema
         self.storage.createRelation(relationName, schema)
         self.checkpoint()
     else:
         raise ValueError("Relation '" + relationName + "' already exists")
Example 8
def query2(db):
    query = db.query().fromTable('part')\
        .join(
            db.query().fromTable('lineitem').where("(L_SHIPDATE >= 19950901) and (L_SHIPDATE < 19951001)"),
            method='hash',
            lhsHashFn='hash(P_PARTKEY) % 7', lhsKeySchema=DBSchema('partkey2',[('P_PARTKEY', 'int')]),
            rhsHashFn='hash(L_PARTKEY) % 7', rhsKeySchema=DBSchema('partkey1', [('L_PARTKEY', 'int')])) \
        .groupBy(
            groupSchema=DBSchema('groupKey', [('ONE', 'int')]),
            aggSchema=DBSchema('groupBy', [('promo_revenue', 'float')]),
            groupExpr=(lambda e: 1),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)), lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 1))\
        .select(
            {'promo_revenue' : ('promo_revenue', 'float')}).finalize()

    return query
Example 9
def query3bBNL_test(db, printOutput):
    query3 = db.query().fromTable('part') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops',
              expr='L_PARTKEY == P_PARTKEY') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops',
              expr='L_ORDERKEY == O_ORDERKEY') \
        .join(db.query().fromTable('customer'),
              rhsSchema=db.relationSchema('customer'),
              method='block-nested-loops',
              expr='O_CUSTKEY == C_CUSTKEY') \
        .join(db.query().fromTable('nation'),
              rhsSchema=db.relationSchema('nation'),
              method='block-nested-loops',
              expr='C_NATIONKEY == N_NATIONKEY') \
        .groupBy(groupSchema=DBSchema('FIRST', [('N_NAME', 'char(25)'), ('P_NAME', 'char(55)')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (e.N_NAME, e.P_NAME)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_QUANTITY, lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .groupBy(groupSchema=DBSchema('SECOND', [('N_NAME', 'char(25)')]),
                 aggSchema=DBSchema('MAX', [('MAX', 'double')]),
                 groupExpr=(lambda e: e.N_NAME),
                 aggExprs=[(0, lambda acc, e: max(acc, e.SUM), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'NATION': ('N_NAME', 'char(25)'), 'MAX': ('MAX', 'double')}) \
        .finalize()

    start = time.time()
    processedQuery = [
        query3.schema().unpack(tup) for page in db.processQuery(query3)
        for tup in page[1]
    ]
    end = time.time()
    duration = end - start
    print('Query: Query 3b (Order Reversed), BNL')
    if printOutput:
        print('Results: ', processedQuery)
    print('Time', duration)
    return
Example 10
 def testGroupBy(self):
      # SELECT id % 2 AS id, min(age), max(age) FROM employee GROUP BY id % 2
     aggMinMaxSchema = DBSchema('minmax', [('minAge', 'int'), ('maxAge', 'int')])
     keySchema = DBSchema('employeeKey', [('id', 'int')])
     groupBy = self.db.query().fromTable('employee').groupBy( \
         groupSchema=keySchema, \
         aggSchema=aggMinMaxSchema, \
         groupExpr=(lambda e: e.id % 2), \
         aggExprs=[(sys.maxsize, lambda acc, e: min(acc, e.age), lambda x: x), \
                   (0, lambda acc, e: max(acc, e.age), lambda x: x)], \
         groupHashFn=(lambda gbVal: hash(gbVal[0]) % 2) \
         ).finalize()
     results = self.getResults(groupBy)
     self.assertEqual(len(results), 2)
     for result in results:
         if result.id == 0:
             self.assertEqual(result.minAge, 20)
             self.assertEqual(result.maxAge, 56)
         else:
             self.assertEqual(result.minAge, 22)
             self.assertEqual(result.maxAge, 58)
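Each aggExprs entry is an (initial accumulator, per-tuple step, finalizer) triple; min and max above never need the finalizer, but it is the hook for derived results. A hedged sketch computing the average age per group with a (sum, count) accumulator, reusing the employee table and keySchema from this test:

avgSchema = DBSchema('avg', [('avgAge', 'float')])
avgQuery = self.db.query().fromTable('employee').groupBy(
    groupSchema=keySchema,
    aggSchema=avgSchema,
    groupExpr=(lambda e: e.id % 2),
    # Accumulate (sum, count); the finalizer turns the pair into an average.
    aggExprs=[((0, 0),
               lambda acc, e: (acc[0] + e.age, acc[1] + 1),
               lambda acc: acc[0] / acc[1])],
    groupHashFn=(lambda gbVal: hash(gbVal[0]) % 2)).finalize()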
Example 11
def query3(db):
    query = db.query().fromTable('customer')\
        .join(
            db.query().fromTable('orders'),
            method='hash',
            lhsHashFn='hash(C_CUSTKEY) % 5', lhsKeySchema=DBSchema('customerKey1', [('C_CUSTKEY', 'int')]),
            rhsHashFn='hash(O_CUSTKEY) % 5', rhsKeySchema=DBSchema('customerKey2', [('O_CUSTKEY', 'int')]))\
        .join(
            db.query().fromTable('lineitem'),
            method='hash',
            lhsHashFn='hash(O_ORDERKEY) % 5', lhsKeySchema=DBSchema('orderKey1', [('O_ORDERKEY', 'int')]),
            rhsHashFn='hash(L_ORDERKEY) % 5', rhsKeySchema=DBSchema('orderkey2', [('L_ORDERKEY', 'int')]))\
        .where(
            "C_MKTSEGMENT == 'BUILDING' and O_ORDERDATE < 19950315 and L_SHIPDATE > 19950315")\
        .groupBy(
            groupSchema=DBSchema('groupKey', [('L_ORDERKEY', 'int'), ('O_ORDERDATE', 'int'), ('O_SHIPPRIORITY', 'int')]),
            aggSchema=DBSchema('groupAgg', [('revenue', 'float')]),
            groupExpr=(lambda e: (e.L_ORDERKEY, e.O_ORDERDATE, e.O_SHIPPRIORITY)),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)), lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 10))\
        .select(
            {'l_orderkey': ('L_ORDERKEY', 'int'),
            'revenue': ('revenue', 'float'),
            'o_orderdate': ('O_ORDERDATE', 'int'),
            'o_shippriority': ('O_SHIPPRIORITY', 'int')})\
        .finalize()
    return query
Example 12
    def processAllPages(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        outputSchema = DBSchema(schema, fields)

        relIds = []
        for (pageId, page) in iter(self.subPlan):
            for tpl in page:
                group = self.groupExpr(self.subSchema.unpack(tpl))
                key = self.groupHashFn((group, None))
                relId = str(self.id()) + "u" + str(key)
                # Create each partition relation only once; re-creating an
                # existing relation would raise (see createRelation above).
                if relId not in relIds:
                    self.storage.createRelation(relId, self.subSchema)
                    relIds.append(relId)
                self.storage.insertTuple(relId, tpl)

        for rid in relIds:
            groupDict = {}
            for (pageId, page) in self.storage.pages(rid):
                for tpl in page:
                    groupKey = self.groupExpr(self.subSchema.unpack(tpl))
                    if groupKey not in groupDict:
                        groupDict[groupKey] = []
                        for trio in self.aggExprs:
                            groupDict[groupKey].append(trio[0])

                    for i in range(len(self.aggExprs)):
                        groupDict[groupKey][i] = self.aggExprs[i][1](
                            groupDict[groupKey][i], self.subSchema.unpack(tpl))

            for key in groupDict:
                for i in range(len(self.aggExprs)):
                    groupDict[key][i] = self.aggExprs[i][2](groupDict[key][i])

            for key in groupDict:
                outTuple = outputSchema.instantiate(
                    key, *[f for f in groupDict[key]])
                self.emitOutputTuple(outputSchema.pack(outTuple))
        return self.storage.pages(self.relationId())
Example 13
 def testIndexJoin(self):
     schema = self.db.relationSchema('employee')
     e2schema = schema.rename('employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'})
     lhsKeySchema = DBSchema('employeeAge', [('age2', 'int')])
     join = self.db.query().fromTable('employee').join( \
         self.db.query().fromTable('employee'), \
         lhsSchema=e2schema, \
         lhsKeySchema=lhsKeySchema, \
         method='indexed', \
         indexId=self.idxId, \
         expr='True'
     ).finalize()
     results = self.getResults(join)
     self.assertEqual(len(results), self.numEmployees)
Example 14
    def setUp(self):
        warnings.simplefilter("ignore", ResourceWarning)

        # Start with a fresh 'employee' table
        self.db = Database.Database()
        self.db.createRelation('employee', [('id', 'int'), ('age', 'int'), ('dept_id', 'int')])

        # Populate employees with department 0 or 1
        self.numEmployees = 20
        empSchema = self.db.relationSchema('employee')

        # Create an index over employee age
        keySchema = DBSchema('employeeAge', [('age', 'int')])
        self.idxId = self.db.storageEngine().createIndex('employee', empSchema, keySchema, False)

        for tup in [empSchema.pack(empSchema.instantiate(i, 2 * i + 20, i % 2)) for i in range(self.numEmployees)]:
            self.db.insertTuple(empSchema.name, tup)
Example 15
def query1Hash_test(db, printOutput):
    query1 = db.query().fromTable('part') \
        .join(db.query().fromTable('partsupp').where('PS_AVAILQTY == 1'),
              rhsSchema=db.relationSchema('partsupp'),
              method='hash',
              lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]),
              rhsHashFn='hash(PS_PARTKEY) % 111', rhsKeySchema=DBSchema('PS_PARTKEY', [('PS_PARTKEY', 'int')])) \
        .join(db.query().fromTable('supplier'),
              rhsSchema=db.relationSchema('supplier'),
              method='hash',
              lhsHashFn='hash(PS_SUPPKEY) % 111', lhsKeySchema=DBSchema('PS_SUPPKEY', [('PS_SUPPKEY', 'int')]),
              rhsHashFn='hash(S_SUPPKEY) % 111', rhsKeySchema=DBSchema('S_SUPPKEY', [('S_SUPPKEY', 'int')])) \
        .union(
        db.query().fromTable('part')
            .join(db.query().fromTable('partsupp').where('PS_SUPPLYCOST < 5'),
                  rhsSchema=db.relationSchema('partsupp'),
                  method='hash',
                  lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]),
                  rhsHashFn='hash(PS_PARTKEY) % 111', rhsKeySchema=DBSchema('PS_PARTKEY', [('PS_PARTKEY', 'int')])) \
            .join(db.query().fromTable('supplier'),
                  rhsSchema=db.relationSchema('supplier'),
                  method='hash',
                  lhsHashFn='hash(PS_SUPPKEY) % 111', lhsKeySchema=DBSchema('PS_SUPPKEY', [('PS_SUPPKEY', 'int')]),
                  rhsHashFn='hash(S_SUPPKEY) % 111', rhsKeySchema=DBSchema('S_SUPPKEY', [('S_SUPPKEY', 'int')]))) \
        .select({'P_NAME': ('P_NAME', 'char(55)'), 'S_NAME': ('S_NAME', 'char(25)')}) \
        .finalize()

    start = time.time()
    processedQuery = [
        query1.schema().unpack(tup) for page in db.processQuery(query1)
        for tup in page[1]
    ]
    end = time.time()
    duration = end - start
    print('Query: Query 1, Hash')
    if printOutput:
        print('Results: ', processedQuery)
    print('Time', duration)
    return
Example 16
def query4(db):
    query = db.query().fromTable('nation')\
        .join(
            db.query().fromTable('customer'),
            method='hash',
            lhsHashFn='hash(N_NATIONKEY) % 11', lhsKeySchema=DBSchema('nationKey1', [('N_NATIONKEY', 'int')]),
            rhsHashFn='hash(C_NATIONKEY) % 11', rhsKeySchema=DBSchema('nationKey2', [('C_NATIONKEY', 'int')]))\
        .join(
            db.query().fromTable('orders'),
            method='hash',
            lhsHashFn='hash(C_CUSTKEY) % 11', lhsKeySchema=DBSchema('custKey1', [('C_CUSTKEY', 'int')]),
            rhsHashFn='hash(O_CUSTKEY) % 11', rhsKeySchema=DBSchema('custKey2', [('O_CUSTKEY', 'int')])) \
        .join(
            db.query().fromTable('lineitem'),
            method='hash',
            lhsHashFn='hash(O_ORDERKEY) % 11', lhsKeySchema=DBSchema('orderKey1', [('O_ORDERKEY', 'int')]),
            rhsHashFn='hash(L_ORDERKEY) % 11', rhsKeySchema=DBSchema('orderKey2', [('L_ORDERKEY', 'int')])) \
        .where(
            "O_ORDERDATE >= 19931001 and O_ORDERDATE < 19940101 and L_RETURNFLAG == 'R'")\
        .groupBy(
            groupSchema=DBSchema('groupKey', [('C_CUSTKEY','int'),('C_NAME','char(25)'),('C_ACCTBAL','float'),('C_PHONE','char(15)'),('N_NAME','char(25)'),('C_ADDRESS','char(40)'),('C_COMMENT','char(117)')]),
            aggSchema=DBSchema('groupAgg', [('revenue', 'float')]),
            groupExpr=(lambda e: (e.C_CUSTKEY, e.C_NAME, e.C_ACCTBAL, e.C_PHONE, e.N_NAME, e.C_ADDRESS, e.C_COMMENT)),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)), lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 11))\
        .select(
            {'c_custkey': ('C_CUSTKEY', 'int'),
             'c_name': ('C_NAME', 'char(25)'),
             'revenue': ('revenue', 'float'),
             'c_acctbal': ('C_ACCTBAL', 'float'),
             'n_name': ('N_NAME', 'char(25)'),
             'c_address': ('C_ADDRESS', 'char(40)'),
             'c_phone': ('C_PHONE', 'char(15)'),
             'c_comment': ('C_COMMENT', 'char(117)')})\
        .finalize()
    return query
Example 17
 def validate(self, operator, storage):
     if operator.operatorType() == "TableScan":
         return operator.schema()
     elif operator.operatorType() == "Select":
         operator.storage = storage
         return self.validate(operator.subPlan, storage)
     elif operator.operatorType() == "Project":
         operator.storage = storage
         self.validate(operator.subPlan, storage)
         return DBSchema( operator.relationId(), \
                             [(k, v[1]) for (k,v) in operator.projectExprs.items()])
     elif operator.operatorType() == "GroupBy":
         self.validate(operator.subPlan, storage)
         operator.storage = storage
         return operator.schema()
     elif operator.operatorType() == "UnionAll":
         operator.storage = storage
         return self.validate(operator.subPlan, storage)
     else:
         operator.lhsSchema = self.validate(operator.lhsPlan, storage)
         operator.rhsSchema = self.validate(operator.rhsPlan, storage)
         operator.initializeSchema()
         operator.storage = storage
         return operator.schema()
Example 18
File: Join.py Project: jhaughe1/hw2
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    raise NotImplementedError

  def __next__(self):
    raise NotImplementedError

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    raise NotImplementedError

  def blockNestedLoops(self):
    raise NotImplementedError


  ##################################
  #
  # Indexed nested loops implementation
  #
  # TODO: test
  def indexedNestedLoops(self):
    raise NotImplementedError

  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    raise NotImplementedError

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
Example 19
part = db.query().fromTable('part').select({
    'P_NAME': ('P_NAME', 'char(55)'),
    'P_PARTKEY': ('P_PARTKEY', 'int')
})
partsupp = db.query().fromTable('partsupp').where('PS_AVAILQTY == 1').select({
    'PS_PARTKEY': ('PS_PARTKEY', 'int'),
    'PS_SUPPKEY': ('PS_SUPPKEY', 'int')
})
supplier = db.query().fromTable('supplier').select({
    'S_NAME': ('S_NAME', 'char(25)'),
    'S_SUPPKEY': ('S_SUPPKEY', 'int')
})

join_ps_p = partsupp.join(\
              part, \
              rhsSchema = DBSchema('part', [('P_NAME','char(55)'), ('P_PARTKEY','int')]), \
              method = 'block-nested-loops', \
              expr = 'PS_PARTKEY == P_PARTKEY')

join_three = join_ps_p.join(\
               supplier, \
               rhsSchema = DBSchema('supplier', [('S_NAME','char(25)'), ('S_SUPPKEY', 'int')]), \
               method = 'block-nested-loops',
               expr = 'PS_SUPPKEY == S_SUPPKEY'
               ).select({'P_NAME':('P_NAME','char(55)'), 'S_NAME':('S_NAME','char(25)')})

partsupp2 = db.query().fromTable('partsupp').where('PS_SUPPLYCOST < 5').select(
    {
        'PS_PARTKEY': ('PS_PARTKEY', 'int'),
        'PS_SUPPKEY': ('PS_SUPPKEY', 'int')
    })
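partsupp2 begins the second branch of the plan; a hedged continuation mirroring Example 15's structure (it assumes the part and supplier plan objects can be reused across branches; otherwise rebuild them as Example 15 does):

join_three2 = partsupp2.join(\
                part, \
                rhsSchema = DBSchema('part2', [('P_NAME','char(55)'), ('P_PARTKEY','int')]), \
                method = 'block-nested-loops', \
                expr = 'PS_PARTKEY == P_PARTKEY' \
                ).join(\
                supplier, \
                rhsSchema = DBSchema('supplier2', [('S_NAME','char(25)'), ('S_SUPPKEY','int')]), \
                method = 'block-nested-loops', \
                expr = 'PS_SUPPKEY == S_SUPPKEY' \
                ).select({'P_NAME':('P_NAME','char(55)'), 'S_NAME':('S_NAME','char(25)')})

query = join_three.union(join_three2).finalize()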
Example 20
exp 1

TPC-H Query 6: a 4-chain filter and aggregate query

select
        sum(l_extendedprice * l_discount) as revenue
from
        lineitem
where
        l_shipdate >= 19940101
        and l_shipdate < 19950101
        and l_discount between 0.06 - 0.01 and 0.06 + 0.01
        and l_quantity < 24
'''

groupKeySchema = DBSchema('groupKey', [('ONE', 'int')])
groupAggSchema = DBSchema('groupBy', [('revenue', 'float')])

query1 = db.query().fromTable('lineitem').where( \
            "(L_SHIPDATE >= 19940101) and (L_SHIPDATE < 19950101) and \
            (0.06 - 0.01 <= L_DISCOUNT <= 0.06 + 0.01) and (L_QUANTITY < 24)").groupBy( \
            groupSchema=groupKeySchema, \
            aggSchema=groupAggSchema, \
            groupExpr=(lambda e: 1), \
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * e.L_DISCOUNT), lambda x: x)], \
            groupHashFn=(lambda gbVal: hash(gbVal) % 1)).select( \
            {'revenue' : ('revenue', 'float')}).finalize()
'''
exp 2

TPC-H Query 14: a 2-way join and aggregate query
Example 21
for tup in [
        schema1.pack(
            schema1.instantiate(i, random.randint(0, 50),
                                'This is a testing tuple.'))
        for i in range(0, 10000)
]:
    _ = db.insertTuple(schema1.name, tup)

for tup in [
        schema2.pack(
            schema2.instantiate(i, random.randint(0, 50),
                                'This is a testing tuple.'))
        for i in range(0, 10000)
]:
    _ = db.insertTuple(schema2.name, tup)

keySchema = DBSchema('employeeKey', [('id', 'int')])
keySchema2 = DBSchema('employeeKey2', [('id2', 'int')])

query5 = db.query().fromTable('employeeeee').join( \
          db.query().fromTable('employeeeee2'), \
          rhsSchema=schema2, \
          method='hash', \
          lhsHashFn='hash(id) % 4',  lhsKeySchema=keySchema, \
          rhsHashFn='hash(id2) % 4', rhsKeySchema=keySchema2, \
        ).finalize()

print(query5.explain())


def readResult():
    for page in db.processQuery(query5):
        for tup in page[1]:
            yield query5.schema().unpack(tup)
Example 22
 def initializeSchema(self):
   schema = self.operatorType() + str(self.id())
   fields = self.groupSchema.schema() + self.aggSchema.schema()
   self.outputSchema = DBSchema(schema, fields)
Example 23
    def checkJoins(self, joins, allAttributes, cachedLHS, cachedRHS):
        for join in joins:
            if join.joinMethod == 'hash':
                lhsCheck = join.lhsKeySchema.fields  # This is a list
                rhsCheck = join.rhsKeySchema.fields  # This is a list
                joinAttr = lhsCheck + rhsCheck

                contains = all(x in allAttributes for x in joinAttr) and any(
                    y in cachedRHS.schema().fields for y in joinAttr)

                # print(lhsCheck, rhsCheck, contains)
                if contains:
                    keySchema1 = DBSchema('keyschema1', [(lhsCheck[0], 'int')])
                    keySchema2 = DBSchema('keyschema2', [(rhsCheck[0], 'int')])
                    hashFn1 = 'hash(' + lhsCheck[0] + ') % 7'
                    hashFn2 = 'hash(' + rhsCheck[0] + ') % 7'
                    joinExpr = lhsCheck[0] + ' == ' + rhsCheck[0]

                    if lhsCheck[0] in cachedLHS.schema().fields:
                        return True, {
                            'joinMethod': 'hash',
                            'lhsHashFn': hashFn1,
                            'lhsKeySchema': keySchema1,
                            'rhsHashFn': hashFn2,
                            'rhsKeySchema': keySchema2,
                            'joinExpr': joinExpr
                        }
                    else:
                        return True, {
                            'joinMethod': 'hash',
                            'lhsHashFn': hashFn2,
                            'lhsKeySchema': keySchema2,
                            'rhsHashFn': hashFn1,
                            'rhsKeySchema': keySchema1,
                            'joinExpr': joinExpr
                        }

            elif join.joinExpr:
                joinAttr = ExpressionInfo(join.joinExpr).getAttributes()
                contains = all(x in allAttributes for x in joinAttr)

                if contains:
                    joinAttr1 = joinAttr.pop()
                    joinAttr2 = joinAttr.pop()

                    try:
                        joinAttr.pop()
                    except KeyError:
                        pass
                        # print('There are only two attributes in this, as expected')

                    keySchema1 = DBSchema('keySchema1', [(joinAttr1, 'int')])
                    keySchema2 = DBSchema('keySchema2', [(joinAttr2, 'int')])
                    hashFn1 = 'hash(' + joinAttr1 + ') % 7'
                    hashFn2 = 'hash(' + joinAttr2 + ') % 7'
                    joinExpr = join.joinExpr

                    if joinAttr1 in cachedLHS.schema().fields:
                        return True, {
                            'joinMethod': 'hash',
                            'lhsHashFn': hashFn1,
                            'lhsKeySchema': keySchema1,
                            'rhsHashFn': hashFn2,
                            'rhsKeySchema': keySchema2,
                            'joinExpr': joinExpr
                        }
                    else:
                        return True, {
                            'joinMethod': 'hash',
                            'lhsHashFn': hashFn2,
                            'lhsKeySchema': keySchema2,
                            'rhsHashFn': hashFn1,
                            'rhsKeySchema': keySchema1,
                            'joinExpr': joinExpr
                        }
        # No applicable join found.
        return False, None
Example 24
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  def localCost(self, estimated):
    tupleSizeLeft = self.lhsPlan.schema().size
    numTuplesLeft = self.lhsPlan.cardinality(estimated)
    tupleSizeRight = self.rhsPlan.schema().size
    numTuplesRight = self.rhsPlan.cardinality(estimated)
    pageSize = self.storage.bufferPool.pageSize

    numPagesLeft = (tupleSizeLeft * numTuplesLeft) // pageSize
    numPagesRight = (tupleSizeRight * numTuplesRight) // pageSize

    if self.joinMethod == "nested-loops":
      return (numTuplesLeft * self.tupleCost * numTuplesRight * self.tupleCost) + (numTuplesLeft * self.tupleCost)
      #return (numTuplesLeft * numPagesRight) + numPagesLeft
    elif self.joinMethod == "block-nested-loops":
      return (numTuplesLeft * self.tupleCost) + (((numTuplesLeft * self.tupleCost)// (self.storage.bufferPool.numPages() - 2)) * (numTuplesRight * self.tupleCost))
      #return numPagesLeft + ((numPagesLeft // (self.storage.bufferPool.numPages() - 2)) * numPagesRight)
    elif self.joinMethod == "indexed":
      raise NotImplementedError
    elif self.joinMethod == "hash":
      return 3 * ((numTuplesLeft * self.tupleCost) + (numTuplesRight * self.tupleCost))
    else:
      return None
  
  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    self.initializeOutput()
    self.partitionFiles = {0:{}, 1:{}}
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    for (lPageId, lhsPage) in self.lhsPlan:
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in self.rhsPlan:
          for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # We track the page ids in the block to unpin them after processing the block.
  def accessPageBlock(self, bufPool, pageIterator):
    pageBlock = []
    try:
      while True:
        (pageId, page) = next(pageIterator)
        pageBlock.append((pageId, page))
        bufPool.pinPage(pageId)
        if bufPool.numFreePages() == 0:
          break
    except StopIteration:
      pass

    return pageBlock

  def blockNestedLoops(self):
    # Access the outer relation's block, pinning pages in the buffer pool.
    bufPool    = self.storage.bufferPool
    lhsIter    = iter(self.lhsPlan)
    lPageBlock = self.accessPageBlock(bufPool, lhsIter)

    while lPageBlock:
      for (lPageId, lhsPage) in lPageBlock:
        for lTuple in lhsPage:
          # Load the lhs once per inner loop.
          joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

          for (rPageId, rhsPage) in self.rhsPlan:
            for rTuple in rhsPage:
              # Load the RHS tuple fields.
              joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

              # Evaluate the join predicate, and output if we have a match.
              if eval(self.joinExpr, globals(), joinExprEnv):
                outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
                self.emitOutputTuple(self.joinSchema.pack(outputTuple))

          # No need to track anything but the last output page when in batch mode.
          if self.outputPages:
            self.outputPages = [self.outputPages[-1]]

        # Unpin the page after joining with the RHS relation.
        # Thus future accesses can evict the page while reading the next block.
        bufPool.unpinPage(lPageId)

      # Move to the next page block after processing it.
      lPageBlock = self.accessPageBlock(bufPool, lhsIter)

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Indexed nested loops implementation
  #
  # TODO: test
  def indexedNestedLoops(self):
    if self.storage.getIndex(self.indexId) is None:
      raise ValueError("Missing index in storage manager: %s" % self.indexId)
    if self.indexId:
      bufPool = self.storage.bufferPool
      for (lPageId, lhsPage) in self.lhsPlan:
        for lTuple in lhsPage:
          # Load the lhs once per inner loop.
          joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

          # Match against RHS tuples using the index.
          joinKey = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
          matches = self.storage.fileMgr.lookupByIndex(self.rhsPlan.relationId(), self.indexId, joinKey)

          for rhsTupId in matches:
            rhsPage = bufPool.getPage(rhsTupId.pageId)
            rTuple  = rhsPage.getTuple(rhsTupId)

            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate any remaining join predicate, and output if we have a match.
            fullMatch = eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True
            if fullMatch:
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

          # No need to track anything but the last output page when in batch mode.
          if self.outputPages:
            self.outputPages = [self.outputPages[-1]]

      # Return an iterator to the output relation
      return self.storage.pages(self.relationId())

    else:
      raise ValueError("No index found while using an indexed nested loops join")


  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    # Partition the LHS and RHS inputs, creating a temporary file for each partition.
    # We assume one-level of partitioning is sufficient and skip recurring.
    for (lPageId, lPage) in self.lhsPlan:
      for lTuple in lPage:
        lPartEnv = self.loadSchema(self.lhsSchema, lTuple)
        lPartKey = eval(self.lhsHashFn, globals(), lPartEnv)
        self.emitPartitionTuple(lPartKey, lTuple, left=True)

    for (rPageId, rPage) in self.rhsPlan:
      for rTuple in rPage:
        rPartEnv = self.loadSchema(self.rhsSchema, rTuple)
        rPartKey = eval(self.rhsHashFn, globals(), rPartEnv)
        self.emitPartitionTuple(rPartKey, rTuple, left=False)

    # Iterate over partition pairs and output matches
    # evaluating the join expression as necessary.
    for ((lPageId, lPage), (rPageId, rPage)) in self.partitionPairs():
      for lTuple in lPage:
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
        for rTuple in rPage:
          joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
          output = \
            ( self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) \
                == self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) ) \
            and ( eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True )

          if output:
            outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
            self.emitOutputTuple(self.joinSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Clean up partitions.
    self.removePartitionFiles()

    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())

  # Hash join helpers.
  def partitionRelationId(self, left, partitionId):
    return self.operatorType() + str(self.id()) + "_" \
            + ("l" if left else "r") + "part_" + str(partitionId)

  def emitPartitionTuple(self, partitionId, partitionTuple, left=False):
    partRelId  = self.partitionRelationId(left, partitionId)
    partSchema = self.lhsSchema if left else self.rhsSchema

    # Create a partition file as needed.
    if not self.storage.hasRelation(partRelId):
      self.storage.createRelation(partRelId, partSchema)
      self.partitionFiles[int(left)][partitionId] = partRelId

    partFile = self.storage.fileMgr.relationFile(partRelId)[1]
    if partFile:
      partFile.insertTuple(partitionTuple)

  # Return pairs of pages from matching partitions.
  def partitionPairs(self):
    lKeys = self.partitionFiles[0].keys()
    rKeys = self.partitionFiles[1].keys()
    matches = [(self.partitionFiles[0][partId], self.partitionFiles[1][partId]) \
                for partId in lKeys if partId in rKeys]
    return PartitionIterator(matches, self.storage)

  # Delete all existing partition files.
  def removePartitionFiles(self):
    for lPartRelId in self.partitionFiles[0].values():
      self.storage.removeRelation(lPartRelId)

    for rPartRelId in self.partitionFiles[1].values():
      self.storage.removeRelation(rPartRelId)

    self.partitionFiles = {0:{}, 1:{}}


  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
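hashJoin above iterates over partitionPairs(), whose PartitionIterator helper is not included in this snippet. A minimal sketch, assuming storage.pages(relId) yields (pageId, page) pairs as elsewhere in these examples:

class PartitionIterator:
  # Pairs every LHS page with every RHS page of each matched partition.
  def __init__(self, matches, storage):
    self.matches = matches    # list of (lhsRelId, rhsRelId) pairs
    self.storage = storage

  def __iter__(self):
    for (lRelId, rRelId) in self.matches:
      for lPair in self.storage.pages(lRelId):
        for rPair in self.storage.pages(rRelId):
          yield (lPair, rPair)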
Example 25
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan = lhsPlan
        self.rhsPlan = rhsPlan
        self.joinExpr = kwargs.get("expr", None)
        self.joinMethod = kwargs.get("method", None)
        self.lhsSchema = kwargs.get(
            "lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema = kwargs.get(
            "rhsSchema", None if rhsPlan is None else rhsPlan.schema())

        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn = kwargs.get("lhsHashFn", None)
        self.rhsHashFn = kwargs.get("rhsHashFn", None)

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

        self.pidsInBlock = list()

        self.tempFileHashR = dict()
        self.outputPageHashR = dict()

        self.tempFileHashL = dict()
        self.outputPageHashL = dict()

        self.tempFile = None

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in [
                "nested-loops", "block-nested-loops", "indexed", "hash"
        ]:
            raise ValueError("Invalid join method in join operator")

        # Check all fields are valid.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]

        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]

        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                    self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan, \
                  self.joinMethod, \
                  self.lhsSchema, self.rhsSchema ] \
                  + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete join specification, missing join operator parameter"
            )

        # For now, we assume that the LHS and RHS schema have
        # disjoint attribute names, enforcing this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError(
                    "Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schema.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type
    def operatorType(self):
        readableJoinTypes = {
            'nested-loops': 'NL',
            'block-nested-loops': 'BNL',
            'indexed': 'Index',
            'hash': 'Hash'
        }
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for join operator.
    def __iter__(self):
        self.initializeOutput()
        self.inputIteratorL = iter(self.lhsPlan)
        self.inputFinished = False

        if not self.pipelined:
            if self.joinMethod == 'hash':
                self.outputIterator = self.hashJoin()
            elif self.joinMethod == 'block-nested-loops':
                self.outputIterator = self.blockNestedLoops()
            elif self.joinMethod == 'nested-loops':
                self.outputIterator = self.nestedLoops()

        return self

    def __next__(self):
        self.inputIteratorR = iter(self.rhsPlan)
        if self.pipelined:
            while not (self.inputFinished or self.isOutputPageReady()):
                try:
                    lPageId, lhsPage = next(self.inputIteratorL)
                    for lTuple in lhsPage:
                        self.compare(lTuple)
                        if self.outputPages:
                            self.outputPages = [self.outputPages[-1]]
                except StopIteration:
                    self.inputFinished = True
            return self.outputPage()

        else:
            return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()

        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()

        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()

        elif self.joinMethod == "hash":
            return self.hashJoin()

        else:
            raise ValueError("Invalid join method in join operator")

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the lhs once per inner loop.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                for (rPageId, rhsPage) in iter(self.rhsPlan):
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(
                            self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(*[
                                joinExprEnv[f] for f in self.joinSchema.fields
                            ])
                            self.emitOutputTuple(
                                self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                #compare(lTuple)
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    def compare(self, lTuple):
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        for (rPageId, rhsPage) in iter(self.rhsPlan):
            for rTuple in rhsPage:
                # Load the RHS tuple fields.
                joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

                # Evaluate the join predicate, and output if we have a match.
                if eval(self.joinExpr, globals(), joinExprEnv):
                    outputTuple = self.joinSchema.instantiate(
                        *[joinExprEnv[f] for f in self.joinSchema.fields])
                    self.emitOutputTuple(self.joinSchema.pack(outputTuple))

    ##################################
    #
    # Block nested loops implementation
    #
    # This attempts to use all the free pages in the buffer pool
    # for its block of the outer relation.

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        for pid in self.pidsInBlock:
            bufPool.unpinPage(pid)
        self.pidsInBlock = list()
        M = bufPool.freeSpace()
        for i in range(0, M - 2):
            try:
                (pid, page) = next(pageIterator)
            except StopIteration:
                break
            self.pidsInBlock.append(pid)
            # getPage(pinned=True) already pins the page; a second pinPage
            # call would double-pin it and leak a pin that the unpinPage
            # loop above never releases.
            bufPool.getPage(pid, pinned=True)

    def blockNestedLoops(self):
        riter = iter(self.rhsPlan)
        buf = self.storage.bufferPool
        # Python iterators have no hasNext(): pin the first block of RHS
        # pages, then loop until accessPageBlock comes back empty.
        self.accessPageBlock(buf, riter)
        while self.pidsInBlock:
            for (lPageId, lhsPage) in iter(self.lhsPlan):
                for lTuple in lhsPage:
                    # Load the lhs once per inner loop.
                    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                    for pid in self.pidsInBlock:
                        rhsPage = buf.getPage(pid, pinned=True)
                        for rTuple in rhsPage:
                            # Load the RHS tuple fields.
                            joinExprEnv.update(
                                self.loadSchema(self.rhsSchema, rTuple))

                            # Evaluate the join predicate, and output if we have a match.
                            if eval(self.joinExpr, globals(), joinExprEnv):
                                outputTuple = self.joinSchema.instantiate(*[
                                    joinExprEnv[f]
                                    for f in self.joinSchema.fields
                                ])
                                self.emitOutputTuple(
                                    self.joinSchema.pack(outputTuple))

                    # No need to track anything but the last output page when in batch mode.
                    if self.outputPages:
                        self.outputPages = [self.outputPages[-1]]
        self.accessPageBlock(buf, riter)
        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Indexed nested loops implementation
    #
    # TODO: implement and test; a possible sketch follows the stub below.
    def indexedNestedLoops(self):
        raise NotImplementedError
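    # A possible implementation sketch (untested; modeled on the indexed
    # nested loops join in a later example in this listing): probe the index
    # with each LHS key and join against the matching RHS tuples.
    #
    #   def indexedNestedLoops(self):
    #       for (lPageId, lhsPage) in iter(self.lhsPlan):
    #           for lTuple in lhsPage:
    #               joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
    #               keyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
    #               idxManager = self.storage.fileMgr.indexManager
    #               for rhsTupId in idxManager.lookupByIndex(self.indexId, keyData):
    #                   page = self.storage.bufferPool.getPage(rhsTupId.pageId)
    #                   rTuple = page.getTuple(rhsTupId)
    #                   joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
    #                   outputTuple = self.joinSchema.instantiate(
    #                       *[joinExprEnv[f] for f in self.joinSchema.fields])
    #                   self.emitOutputTuple(self.joinSchema.pack(outputTuple))
    #       return self.storage.pages(self.relationId())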

    ##################################
    #
    # Hash join implementation.
    #
    def hashJoin(self):
        # Partition both inputs into per-bucket temporary files, hashing each
        # tuple on its join key. (Renamed locals avoid shadowing the built-in
        # names 'tuple' and 'hash'.)
        for (rPageId, rhsPage) in iter(self.rhsPlan):
            for rTuple in rhsPage:
                fieldBindings = self.loadSchema(self.rhsSchema, rTuple)
                hashVal = eval(self.rhsHashFn, globals(), fieldBindings)
                self.emitOutputTupleHash(rTuple, hashVal, False)

        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                fieldBindings = self.loadSchema(self.lhsSchema, lTuple)
                hashVal = eval(self.lhsHashFn, globals(), fieldBindings)
                self.emitOutputTupleHash(lTuple, hashVal, True)

        # Build the key-equality predicate used to match tuples drawn from a
        # pair of buckets, joining one equality term per key field.
        keyTerms = []
        for lt, rt in zip(self.lhsKeySchema.schema(), self.rhsKeySchema.schema()):
            keyTerms.append(str(lt[0]) + ' == ' + str(rt[0]))
        evalStr = ' and '.join(keyTerms)
        if self.joinExpr is not None:
            evalStr += ' and ' + self.joinExpr
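        # For example, with lhsKeySchema ('C_CUSTKEY') and rhsKeySchema
        # ('O_CUSTKEY'), evalStr becomes "C_CUSTKEY == O_CUSTKEY".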
        for lk in self.outputPageHashL.keys():
            for rk in self.outputPageHashR.keys():
                # Only matching bucket pairs can contain join partners, since
                # equal keys hash to the same bucket value on both sides.
                if lk != rk:
                    continue

                riter = iter(self.outputPageHashR[rk])
                buf = self.storage.bufferPool
                M = buf.numFreePages() - 2
                size = len(self.outputPageHashR[rk])
                while size > 0:
                    self.accessPageBlock(buf, riter)
                    size -= M
                    for (lPageId, lhsPage) in iter(self.outputPageHashL[lk]):
                        for lTuple in lhsPage:
                            # Load the lhs once per inner loop.
                            joinExprEnv = self.loadSchema(
                                self.lhsSchema, lTuple)

                            for pid in self.pidsInBlock:
                                # Block pages are already pinned by accessPageBlock.
                                rhsPage = buf.getPage(pid)
                                for rTuple in rhsPage:
                                    # Load the RHS tuple fields.
                                    joinExprEnv.update(
                                        self.loadSchema(
                                            self.rhsSchema, rTuple))

                                    # Evaluate the join predicate, and output if we have a match.
                                    if eval(evalStr, globals(), joinExprEnv):
                                        outputTuple = self.joinSchema.instantiate(
                                            *[
                                                joinExprEnv[f]
                                                for f in self.joinSchema.fields
                                            ])
                                        self.emitOutputTuple(
                                            self.joinSchema.pack(outputTuple))

                            # No need to track anything but the last output page when in batch mode.
                            if self.outputPages:
                                self.outputPages = [self.outputPages[-1]]
                self.accessPageBlock(buf, riter)
        # Return an iterator to the output relation
        return self.storage.pages(self.relationId())

    def getRelId(self, hashVal, isLeft):
        tempstr = 'temp'
        if isLeft:
            tempstr = 'templ'
        return self.relationId() + tempstr + str(hashVal)

    def initializeOutputHash(self, hashVal, isLeft):
        relId = self.getRelId(hashVal, isLeft)

        if self.storage.hasRelation(relId):
            self.storage.removeRelation(relId)

        if isLeft:
            self.storage.createRelation(relId, self.lhsSchema)
            self.tempFileHashL[hashVal] = self.storage.fileMgr.relationFile(
                relId)[1]
            self.outputPageHashL[hashVal] = []
        else:
            self.storage.createRelation(relId, self.rhsSchema)
            self.tempFileHashR[hashVal] = self.storage.fileMgr.relationFile(
                relId)[1]
            self.outputPageHashR[hashVal] = []

    def emitOutputTupleHash(self, tupleData, hashVal, isLeft):
        if isLeft:
            if hashVal not in self.tempFileHashL.keys():
                self.initializeOutputHash(hashVal, isLeft)
        else:
            if hashVal not in self.tempFileHashR.keys():
                self.initializeOutputHash(hashVal, isLeft)

        if isLeft:
            self.currFile = self.tempFileHashL[hashVal]
            self.currOutputPages = self.outputPageHashL[hashVal]
        else:
            self.currFile = self.tempFileHashR[hashVal]
            self.currOutputPages = self.outputPageHashR[hashVal]

        allocatePage = not (
            self.currOutputPages and
            (self.currOutputPages)[-1][1].header.hasFreeTuple())
        if allocatePage:
            # Flush the most recently updated output page, which updates the storage file's
            # free page list to ensure correct new page allocation.
            if self.currOutputPages:
                self.storage.bufferPool.flushPage(
                    (self.currOutputPages)[-1][0])
            outputPageId = self.currFile.availablePage()
            outputPage = self.storage.bufferPool.getPage(outputPageId)
            self.currOutputPages.append((outputPageId, outputPage))
        else:
            outputPage = (self.currOutputPages)[-1][1]

        outputPage.insertTuple(tupleData)

        if self.sampled:
            self.estimatedCardinality += 1
        else:
            self.actualCardinality += 1

    def printerr(self, string):
        with open('err.txt', 'a') as f:
            f.write(str(string) + '\n')

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"

        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"

        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(
                filter(lambda x: x is not None, ([
                    "expr='" + str(self.joinExpr) +
                    "'" if self.joinExpr else None
                ] + [
                    "lhsKeySchema=" + self.lhsKeySchema.toString(),
                    "rhsKeySchema=" + self.rhsKeySchema.toString(),
                    "lhsHashFn='" + self.lhsHashFn + "'",
                    "rhsHashFn='" + self.rhsHashFn + "'"
                ]))) + ")"

        return super().explain() + exprs
Esempio n. 26
0
from Storage.BufferPool import BufferPool
from Catalog.Schema import DBSchema
from Storage.StorageEngine import StorageEngine

schemas = [
        DBSchema('part', 
           [('p_partkey', 'int'),
           ('p_name',    'char(55)'),
           ('p_mfgr',    'char(25)'),
           ('p_brand',   'char(10)'),
           ('p_type',    'char(25)'),
           ('p_size',    'int'),
           ('p_container', 'char(10)'),
           ('p_retailprice', 'float'),
           ('p_comment', 'char(23)')])
        ,
        DBSchema('supplier',
            [('s_suppkey', 'int'),
            ('s_name', 'char(25)'),
            ('s_address', 'char(40)'),
            ('s_nationkey', 'int'),
            ('s_phone', 'char(15)'),
            ('s_acctbal', 'float'),
            ('s_comment', 'char(101)')])
        ,
        DBSchema('partsupp',
            [('ps_partkey', 'int'),
            ('ps_suppkey', 'int'),
            ('ps_availqty', 'int'),
            ('ps_supplycost', 'float'),
            ('ps_comment', 'char(199)')])
]
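
# The snippet above only declares the schemas. A minimal usage sketch (an
# assumption, not part of the original example: it presumes StorageEngine can
# be constructed with its defaults and exposes the createRelation API used
# elsewhere in this listing):
#
#   storage = StorageEngine()
#   for schema in schemas:
#       # Register each TPC-H schema as a relation; DBSchema exposes its name.
#       storage.createRelation(schema.name, schema)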
Esempio n. 27
0
class GroupBy(Operator):
  def __init__(self, subPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined group-by-aggregate operator not supported")

    self.subPlan     = subPlan
    self.subSchema   = subPlan.schema()
    self.groupSchema = kwargs.get("groupSchema", None)
    self.aggSchema   = kwargs.get("aggSchema", None)
    self.groupExpr   = kwargs.get("groupExpr", None)
    self.aggExprs    = kwargs.get("aggExprs", None)
    self.groupHashFn = kwargs.get("groupHashFn", None)

    self.validateGroupBy()
    self.initializeSchema()

   
  def localCost(self, estimated):
    # Two passes over the input: one to partition, one to aggregate.
    numTuples = self.subPlan.cardinality(estimated)
    return 2 * numTuples * self.tupleCost
    # An alternative page-based cost is 2 * numPages, with numPages =
    # (tupleSize * numTuples) // pageSize; derived from
    # http://www4.comp.polyu.edu.hk/~csmlyiu/conf/CIKM09_skygroup.pdf under
    # the assumption that G=1, so the log factor is close to 1.


  # Perform some basic checking on the group-by operator's parameters.
  def validateGroupBy(self):
    requireAllValid = [self.subPlan, \
                       self.groupSchema, self.aggSchema, \
                       self.groupExpr, self.aggExprs, self.groupHashFn ]

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete group-by specification, missing a required parameter")

    if not self.aggExprs:
      raise ValueError("Group-by needs at least one aggregate expression")

    if len(self.aggExprs) != len(self.aggSchema.fields):
      raise ValueError("Invalid aggregate fields: schema mismatch")

  # Initializes the group-by's schema as a concatenation of the group-by
  # fields and all aggregate fields.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    self.outputSchema = DBSchema(schema, fields)

  # Returns the output schema of this operator
  def schema(self):
    return self.outputSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.subPlan.schema()]

  # Returns a string describing the operator type
  def operatorType(self):
    return "GroupBy"

  # Returns child operators if present
  def inputs(self):
    return [self.subPlan]

  # Iterator abstraction for the group-by operator.
  def __iter__(self):
    self.initializeOutput()
    self.partitionFiles = {}
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)


  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Processing helpers
  def ensureTuple(self, x):
    if not isinstance(x, tuple):
      return (x,)
    else:
      return x

  def initialExprs(self):
    return [i[0] for i in self.aggExprs]

  def incrExprs(self):
    return [i[1] for i in self.aggExprs]

  def finalizeExprs(self):
    return [i[2] for i in self.aggExprs]
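
  # For illustration (hypothetical 'age' field): each aggExprs entry is an
  # (initial value, increment lambda, finalize lambda) triple, e.g.
  #   (0, lambda acc, e: acc + 1, lambda x: x)                      # COUNT
  #   ((0, 0), lambda acc, e: (acc[0] + e.age, acc[1] + 1),
  #            lambda acc: acc[0] / acc[1])                         # AVG(age)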

  # Set-at-a-time operator processing
  def processAllPages(self):
    # Create partitions of the input records by hashing the group-by values
    for (pageId, page) in self.subPlan:
      for tup in page:
        groupVal = self.ensureTuple(self.groupExpr(self.subSchema.unpack(tup)))
        groupId = self.groupHashFn(groupVal)
        self.emitPartitionTuple(groupId, tup)

    # We assume that the partitions fit in main memory.
    for partRelId in self.partitionFiles.values():
      partFile = self.storage.fileMgr.relationFile(partRelId)[1]

      # Use an in-memory Python dict to accumulate the aggregates.
      aggregates = {}
      for (pageId, page) in partFile.pages():
        for tup in page:
          # Evaluate group-by value.
          namedTup = self.subSchema.unpack(tup)
          groupVal = self.ensureTuple(self.groupExpr(namedTup))

          # Look up the aggregate for the group.
          if groupVal not in aggregates:
            aggregates[groupVal] = self.initialExprs()

          # Increment the aggregate.
          aggregates[groupVal] = \
            list(map( \
              lambda x: x[0](x[1], namedTup), \
              zip(self.incrExprs(), aggregates[groupVal])))

      # Finalize the aggregate value for each group.
      for (groupVal, aggVals) in aggregates.items():
        finalVals = list(map(lambda x: x[0](x[1]), zip(self.finalizeExprs(), aggVals)))
        outputTuple = self.outputSchema.instantiate(*(list(groupVal) + finalVals))
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Clean up partitions.
    self.removePartitionFiles()

    # Return an iterator for the output file.
    return self.storage.pages(self.relationId())

  # Bucket construction helpers.
  def partitionRelationId(self, partitionId):
    return self.operatorType() + str(self.id()) + "_" \
            + "part_" + str(partitionId)

  def emitPartitionTuple(self, partitionId, partitionTuple):
    partRelId  = self.partitionRelationId(partitionId)

    # Create a partition file as needed.
    if not self.storage.hasRelation(partRelId):
      self.storage.createRelation(partRelId, self.subSchema)
      self.partitionFiles[partitionId] = partRelId

    partFile = self.storage.fileMgr.relationFile(partRelId)[1]
    if partFile:
      partFile.insertTuple(partitionTuple)

  # Delete all existing partition files.
  def removePartitionFiles(self):
    for partRelId in self.partitionFiles.values():
      self.storage.removeRelation(partRelId)
    self.partitionFiles = {}


  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                             + ", aggSchema=" + self.aggSchema.toString() + ")"
Esempio n. 28
0
    def initializeSchemas(self):
        tpchNamesAndFields = [
            ('part', [('P_PARTKEY', 'int'), ('P_NAME', 'char(55)'),
                      ('P_MFGR', 'char(25)'), ('P_BRAND', 'char(10)'),
                      ('P_TYPE', 'char(25)'), ('P_SIZE', 'int'),
                      ('P_CONTAINER', 'char(10)'), ('P_RETAILPRICE', 'double'),
                      ('P_COMMENT', 'char(23)')], "issssisds"),
            ('supplier', [('S_SUPPKEY', 'int'), ('S_NAME', 'char(25)'),
                          ('S_ADDRESS', 'char(40)'), ('S_NATIONKEY', 'int'),
                          ('S_PHONE', 'char(15)'), ('S_ACCTBAL', 'double'),
                          ('S_COMMENT', 'char(101)')], "issisds"),
            ('partsupp', [('PS_PARTKEY', 'int'), ('PS_SUPPKEY', 'int'),
                          ('PS_AVAILQTY', 'int'), ('PS_SUPPLYCOST', 'double'),
                          ('PS_COMMENT', 'char(199)')], "iiids"),
            ('customer', [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'),
                          ('C_ADDRESS', 'char(40)'), ('C_NATIONKEY', 'int'),
                          ('C_PHONE', 'char(15)'), ('C_ACCTBAL', 'double'),
                          ('C_MKTSEGMENT', 'char(10)'),
                          ('C_COMMENT', 'char(117)')], "issisdss"),
            (
                'orders',
                [
                    ('O_ORDERKEY', 'int'),
                    ('O_CUSTKEY', 'int'),
                    ('O_ORDERSTATUS', 'char(1)'),
                    ('O_TOTALPRICE', 'double'),
                    ('O_ORDERDATE', 'int'),  # date
                    ('O_ORDERPRIORITY', 'char(15)'),
                    ('O_CLERK', 'char(15)'),
                    ('O_SHIPPRIORITY', 'int'),
                    ('O_COMMENT', 'char(79)')
                ],
                "iisdtssis"),
            (
                'lineitem',
                [
                    ('L_ORDERKEY', 'int'),
                    ('L_PARTKEY', 'int'),
                    ('L_SUPPKEY', 'int'),
                    ('L_LINENUMBER', 'int'),
                    ('L_QUANTITY', 'double'),
                    ('L_EXTENDEDPRICE', 'double'),
                    ('L_DISCOUNT', 'double'),
                    ('L_TAX', 'double'),
                    ('L_RETURNFLAG', 'char(1)'),
                    ('L_LINESTATUS', 'char(1)'),
                    ('L_SHIPDATE', 'int'),  # date
                    ('L_COMMITDATE', 'int'),  # date
                    ('L_RECEIPTDATE', 'int'),  # date
                    ('L_SHIPINSTRUCT', 'char(25)'),
                    ('L_SHIPMODE', 'char(10)'),
                    ('L_COMMENT', 'char(44)')
                ],
                "iiiiddddsstttsss"),
            ('nation', [('N_NATIONKEY', 'int'), ('N_NAME', 'char(25)'),
                        ('N_REGIONKEY', 'int'),
                        ('N_COMMENT', 'char(152)')], "isis"),
            ('region', [('R_REGIONKEY', 'int'), ('R_NAME', 'char(25)'),
                        ('R_COMMENT', 'char(152)')], "iss")
        ]

        self.schemas = dict(
            map(lambda x: (x[0], DBSchema(x[0], x[1])), tpchNamesAndFields))
        self.parsers = dict(
            map(lambda x: (x[0], self.buildParser(x[2])), tpchNamesAndFields))
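
    # buildParser is referenced above but not shown in this excerpt. A
    # plausible sketch (hypothetical implementation, assuming '|'-delimited
    # TPC-H .tbl rows and the format characters used above: 'i' int,
    # 'd' double, 's' string, 't' date stored as an int):
    def buildParser(self, fmt):
        # Map each format character to a field converter.
        converters = {'i': int, 'd': float, 's': str, 't': int}
        def parse(line):
            fields = line.rstrip('\n').rstrip('|').split('|')
            return [converters[c](f) for c, f in zip(fmt, fields)]
        return parse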
Esempio n. 29
0
class Join(Operator):
  def __init__(self, lhsPlan, rhsPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined join operator not supported")

    self.lhsPlan    = lhsPlan
    self.rhsPlan    = rhsPlan
    self.joinExpr   = kwargs.get("expr", None)
    self.joinMethod = kwargs.get("method", None)
    self.lhsSchema  = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
    self.rhsSchema  = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())

    self.lhsKeySchema   = kwargs.get("lhsKeySchema", None)
    self.rhsKeySchema   = kwargs.get("rhsKeySchema", None)
    self.lhsHashFn      = kwargs.get("lhsHashFn", None)
    self.rhsHashFn      = kwargs.get("rhsHashFn", None)

    self.validateJoin()
    self.initializeSchema()
    self.initializeMethod(**kwargs)

  # Checks the join parameters.
  def validateJoin(self):
    # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
    if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
      raise ValueError("Invalid join method in join operator")

    # Check all fields are valid.
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      methodParams = [self.joinExpr]

    elif self.joinMethod == "indexed":
      methodParams = [self.lhsKeySchema]

    elif self.joinMethod == "hash":
      methodParams = [self.lhsHashFn, self.lhsKeySchema, \
                      self.rhsHashFn, self.rhsKeySchema]

    requireAllValid = [self.lhsPlan, self.rhsPlan, \
                       self.joinMethod, \
                       self.lhsSchema, self.rhsSchema ] \
                       + methodParams

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete join specification, missing join operator parameter")

    # For now, we assume that the LHS and RHS schema have
    # disjoint attribute names, enforcing this here.
    for lhsAttr in self.lhsSchema.fields:
      if lhsAttr in self.rhsSchema.fields:
        raise ValueError("Invalid join inputs, overlapping schema detected")


  # Initializes the output schema for this join.
  # This is a concatenation of all fields in the lhs and rhs schema.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.lhsSchema.schema() + self.rhsSchema.schema()
    self.joinSchema = DBSchema(schema, fields)

  # Initializes any additional operator parameters based on the join method.
  def initializeMethod(self, **kwargs):
    if self.joinMethod == "indexed":
      self.indexId = kwargs.get("indexId", None)
      if self.indexId is None or self.lhsKeySchema is None:
        raise ValueError("Invalid index for use in join operator")

  # Returns the output schema of this operator
  def schema(self):
    return self.joinSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.lhsSchema, self.rhsSchema]

  # Returns a string describing the operator type
  def operatorType(self):
    readableJoinTypes = { 'nested-loops'       : 'NL'
                        , 'block-nested-loops' : 'BNL'
                        , 'indexed'            : 'Index'
                        , 'hash'               : 'Hash' }
    return readableJoinTypes[self.joinMethod] + "Join"

  # Returns child operators if present
  def inputs(self):
    return [self.lhsPlan, self.rhsPlan]

  # Iterator abstraction for join operator.
  def __iter__(self):
    self.initializeOutput()
    # Pipelined join operator is not supported according to constructor
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    if self.joinMethod == "nested-loops":
      return self.nestedLoops()

    elif self.joinMethod == "block-nested-loops":
      return self.blockNestedLoops()

    elif self.joinMethod == "indexed":
      return self.indexedNestedLoops()

    elif self.joinMethod == "hash":
      return self.hashJoin()

    else:
      raise ValueError("Invalid join method in join operator")

  # Return an iterator to the output relation
  def outputRelationIterator(self):
    return self.storage.pages(self.relationId())


  ##################################
  #
  # Nested loops implementation
  #
  def nestedLoops(self):
    self.runNestedLoops(iter(self.lhsPlan), iter(self.rhsPlan), False, False, False)
    # Return an iterator to the output relation
    return self.outputRelationIterator()

  # Common function used by all types of joins
  def runNestedLoops(self, lhsPageIter, rhsPageIter, isBlock, isIndex, isHash):
    for (lPageId, lhsPage) in lhsPageIter:
      for lTuple in lhsPage:
        # Load the lhs once per inner loop.
        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

        if isIndex:
          keyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
          idxManager = self.storage.fileMgr.indexManager
          rhsPageIter = idxManager.lookupByIndex(self.indexId, keyData)

        for rhsItem in rhsPageIter:
          rhsTupleIter = None

          if isIndex:
            # Retrieve index-matched tuple from corresponding page
            page = self.storage.bufferPool.getPage(rhsItem.pageId) # rhsItem = rhsTupId
            rhsTupleIter = [page.getTuple(rhsItem)]
          else:
            # Need to scan all tuples
            rhsTupleIter = rhsItem[1] # rhsItem = (rPageId, rhsPage)

          for rTuple in rhsTupleIter:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

            # Evaluate the join predicate, and output if we have a match.
            validJoin = False

            if isIndex:
              validJoin = True
            else:
              if isHash:
                lhsKeyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
                rhsKeyData = self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema)
                validJoin = lhsKeyData == rhsKeyData
              else:
                validJoin = True

            if self.joinExpr:
              validJoin = validJoin and eval(self.joinExpr, globals(), joinExprEnv)

            if validJoin:
              outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields])
              self.emitOutputTuple(self.joinSchema.pack(outputTuple))

        # No need to track anything but the last output page when in batch mode.
        if self.outputPages:
          self.outputPages = [self.outputPages[-1]]
      
      if isBlock:
        self.storage.bufferPool.unpinPage(lPageId)

  ##################################
  #
  # Block nested loops implementation
  #
  # This attempts to use all the free pages in the buffer pool
  # for its block of the outer relation.

  # Accesses a block of pages from an iterator.
  # This method pins pages in the buffer pool during its access.
  # The pinned pages are unpinned by runNestedLoops once each page of the
  # block has been processed.
  def accessPageBlock(self, bufPool, pageIterator):
    pinnedPages = list()
    try:
      while bufPool.numFreePages() > 0:
        (lPageId, lhsPage) = next(pageIterator)
        bufPool.pinPage(lPageId)
        pinnedPages.append((lPageId, lhsPage))
    except StopIteration:
      pass
    return pinnedPages

  def pinPages(self, pageIterator):
    return self.accessPageBlock(self.storage.bufferPool, pageIterator)

  def blockNestedLoops(self):
    self.runBlockNestedLoops(iter(self.lhsPlan), self.rhsPlan, False)
    return self.outputRelationIterator()

  def runBlockNestedLoops(self, lhsPageIter, rhsPageIter, isHashJoin):
    pinnedPages = self.pinPages(lhsPageIter)
    # Keep running until ALL outer pages have been processed.
    # Note: 'rhsPageIter' must be a 'list', NOT an exhaustible iterator,
    # since it is rescanned for every block of the outer relation.
    while len(pinnedPages) > 0:
      self.runNestedLoops(iter(pinnedPages), rhsPageIter, True, False, isHashJoin)
      pinnedPages = self.pinPages(lhsPageIter)

  ##################################
  #
  # Indexed nested loops implementation
  #
  def indexedNestedLoops(self):
    self.runNestedLoops(iter(self.lhsPlan), None, False, True, False)
    return self.outputRelationIterator()

  ##################################
  #
  # Hash join implementation.
  #
  def hashJoin(self):
    lhsRelIdMap = {}
    rhsRelIdMap = {}

    # Partition each relation using hash function
    self.partition(self.lhsPlan, self.lhsHashFn, self.lhsSchema, lhsRelIdMap, "lhs")
    self.partition(self.rhsPlan, self.rhsHashFn, self.rhsSchema, rhsRelIdMap, "rhs")

    # Perform block nested loop join for each bucket
    for hashValue, relId in lhsRelIdMap.items():
      lhsPageIter = self.storage.pages(relId)
      rhsPageIter = self.storage.pages(rhsRelIdMap[hashValue])

      self.runBlockNestedLoops(lhsPageIter, list(rhsPageIter), True)

    # Remove partitions
    partitionIter = itertools.chain(lhsRelIdMap.items(), rhsRelIdMap.items())
    for _, relId in partitionIter:
      self.storage.removeRelation(relId)

    return self.outputRelationIterator()
        
  # Partitions a given relation based on some hash function 
  def partition(self, plan, hashFn, schema, relIdMap, relPrefix):
    for (pageId, page) in iter(plan):
      for tuple in page:
        # Compute hash value for every tuple
        fieldBindings = self.loadSchema(schema, tuple)
        hashValue = eval(hashFn, globals(), fieldBindings)

        # Store in temporary buckets (files)
        if hashValue not in relIdMap:
          relId = str(self.id()) + "_" + relPrefix + "_" + str(hashValue)
          self.storage.createRelation(relId, schema)
          relIdMap[hashValue] = relId

        self.storage.insertTuple(relIdMap[hashValue], tuple)          

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
      exprs = "(expr='" + str(self.joinExpr) + "')"

    elif self.joinMethod == "indexed":
      exprs =  "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "indexKeySchema=" + self.lhsKeySchema.toString() ]
        ))) + ")"

    elif self.joinMethod == "hash":
      exprs = "(" + ','.join(filter(lambda x: x is not None, (
          [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ]
        + [ "lhsKeySchema=" + self.lhsKeySchema.toString() ,
            "rhsKeySchema=" + self.rhsKeySchema.toString() ,
            "lhsHashFn='" + self.lhsHashFn + "'" ,
            "rhsHashFn='" + self.rhsHashFn + "'" ]
        ))) + ")"

    return super().explain() + exprs
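
# For reference, the greedy optimizer in a later example constructs this
# operator directly rather than through the query builder. A minimal sketch
# along the same lines (assuming two prepared sub-plans lhsPlan and rhsPlan
# over customer and orders, and a database instance db; these names are
# placeholders, not part of the original example):
#
#   lKeySchema = DBSchema('left', [('C_CUSTKEY', 'int')])
#   rKeySchema = DBSchema('right', [('O_CUSTKEY', 'int')])
#   hashJoin = Join(lhsPlan, rhsPlan, method='hash',
#                   lhsHashFn='hash(C_CUSTKEY) % 5', lhsKeySchema=lKeySchema,
#                   rhsHashFn='hash(O_CUSTKEY) % 5', rhsKeySchema=rKeySchema)
#   hashJoin.prepare(db)  # bind the operator to the database, as the optimizer example does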
Esempio n. 30
0
from
        customer,
        orders,
        lineitem
where
        c_mktsegment = 'BUILDING'
        and c_custkey = o_custkey
        and l_orderkey = o_orderkey
        and o_orderdate < 19950315
        and l_shipdate > 19950315
group by
        l_orderkey,
        o_orderdate,
        o_shippriority
'''
ls1 = DBSchema('customerKey1', [('C_CUSTKEY', 'int')])
rs1 = DBSchema('customerKey2', [('O_CUSTKEY', 'int')])

ls2 = DBSchema('orderKey1', [('O_ORDERKEY', 'int')])
rs2 = DBSchema('orderKey2', [('L_ORDERKEY', 'int')])

groupKeySchema = DBSchema('groupKey', [('L_ORDERKEY', 'int'),
                                       ('O_ORDERDATE', 'int'),
                                       ('O_SHIPPRIORITY', 'int')])
groupAggSchema = DBSchema('groupAgg', [('revenue', 'float')])

query3 = db.query().fromTable('customer').join( \
            db.query().fromTable('orders'),
            method = 'hash', \
            lhsHashFn = 'hash(C_CUSTKEY) % 5', lhsKeySchema = ls1, \
            rhsHashFn = 'hash(O_CUSTKEY) % 5', rhsKeySchema = rs1).join( \
Esempio n. 31
0
import io, math, os, os.path, pickle, struct
from struct import Struct

from Catalog.Identifiers import PageId, FileId, TupleId
from Catalog.Schema      import DBSchema
from Storage.Page        import PageHeader, Page
from Storage.SlottedPage import SlottedPageHeader, SlottedPage

from Storage.File import *

import shutil, Storage.BufferPool, Storage.FileManager

schema = DBSchema('employee', [('id', 'int'), ('age', 'int')])

bp = Storage.BufferPool.BufferPool()

fm = Storage.FileManager.FileManager(bufferPool=bp)

bp.setFileManager(fm)

fm.createRelation(schema.name, schema)

(fId, f) = fm.relationFile(schema.name)

assert f.numPages() == 0

pId  = PageId(fId, 0)
pId1 = PageId(fId, 1)

p    = Page(pageId=pId,  buffer=bytes(f.pageSize()), schema=schema)
p1   = Page(pageId=pId1, buffer=bytes(f.pageSize()), schema=schema)
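
# A minimal continuation sketch (assuming the page and schema APIs used
# above; writePage is an assumed StorageFile method for persisting the page):
#
#   e = schema.instantiate(1, 25)
#   p.insertTuple(schema.pack(e))  # insert a packed employee tuple into page 0
#   f.writePage(p)                 # assumed API: write the page back to the file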
Esempio n. 32
0
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)
Esempio n. 33
0
    def joinsOptimizer(self, operator, aPaths):
        defaultScaleFactor = 10
        defaultPartiNumber = 5

        n = len(aPaths)
        planList = []
        costList = []
        # i = 1
        for aPath in aPaths:
            # Here we define cost by number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # Here we only consider reorganize joins
            # so that we simple put accessPaths' total cost as 0.
            planList.append(aPath)
            costList.append((numPages, 0))
        # i = 2...n
        for i in range(1, n):
            # find all possible two way join in current planList
            # put the potential joins in potentialP
            # put the potential joins cost in potentialC
            m = len(planList)
            potentialP = []
            potentialC = []
            for j in range(0, m - 1):
                for k in range(j + 1, m):
                    self.pcntr += 1
                    potentialP.append((planList[j], planList[k]))
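                    # Hash-join cost model: roughly 3 * (M + N) page I/Os to
                    # partition both inputs and read the partitions back for
                    # probing, plus each input's accumulated build cost.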
                    potentialC.append(3 * (costList[j][0] + costList[k][0]) +
                                      costList[j][1] + costList[k][1])
            # find the cheapest joinable join (total cost)
            # build the join, remove the used two base plan and add the new join to planList
            # modify the costList as well
            while potentialC:
                currC = min(potentialC)
                currP = potentialP[potentialC.index(currC)]
                potentialC.remove(currC)
                potentialP.remove(currP)
                joinResult = self.joinable(operator, currP)
                if joinResult:
                    (lField, rField) = joinResult
                    lhsSchema = currP[0].schema()
                    rhsSchema = currP[1].schema()
                    lKeySchema = DBSchema(
                        'left',
                        [(f, t)
                         for (f, t) in lhsSchema.schema() if f == lField])
                    rKeySchema = DBSchema(
                        'right',
                        [(f, t)
                         for (f, t) in rhsSchema.schema() if f == rField])
                    lHashFn = 'hash(' + lField + ') % ' + str(
                        defaultPartiNumber)
                    rHashFn = 'hash(' + rField + ') % ' + str(
                        defaultPartiNumber)
                    newJoin = Join(currP[0], currP[1], method='hash', \
                                   lhsHashFn=lHashFn, lhsKeySchema=lKeySchema, \
                                   rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)

                    newJoin.prepare(self.db)
                    totalCost = currC
                    cards = Plan(root=newJoin).sample(defaultScaleFactor)
                    pageSize, _, _ = self.db.storage.relationStats(
                        newJoin.relationId())
                    pages = cards / (pageSize / newJoin.schema().size)

                    id1 = planList.index(currP[0])
                    _ = planList.pop(id1)
                    id2 = planList.index(currP[1])
                    _ = planList.pop(id2)
                    planList.append(newJoin)
                    _ = costList.pop(id1)
                    _ = costList.pop(id2)
                    costList.append((pages, totalCost))
                    break
        print("GreedyOptimizer plan considered: ", self.pcntr)
        return planList[0]
Esempio n. 35
0
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError(
                "Pipelined group-by-aggregate operator not supported")

        self.subPlan = subPlan
        self.subSchema = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema = kwargs.get("aggSchema", None)
        self.groupExpr = kwargs.get("groupExpr", None)
        self.aggExprs = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)
        self.validateGroupBy()
        self.initializeSchema()

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan, \
                           self.groupSchema, self.aggSchema, \
                           self.groupExpr, self.aggExprs, self.groupHashFn ]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError(
                "Incomplete group-by specification, missing a required parameter"
            )

        if not self.aggExprs:
            raise ValueError(
                "Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for the group-by operator.
    def __iter__(self):
        self.initializeOutput()
        # Pipelined group-by operator is not supported
        self.outputIterator = self.processAllPages()

        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        relMap = dict()
        # Partition the input into temporary files, keyed by the hashed
        # group-by value of each tuple.
        self.partition(relMap)

        for _, title in relMap.items():
            # Generate a page iterator over this partition's file.
            pageIterator = self.storage.pages(title)

            # Accumulate intermediate aggregation results per group key.
            # Keying by the group value itself (rather than its hash) keeps
            # distinct groups that share a bucket separate, and preserves the
            # group-by fields for the output tuple.
            aggregator = {}
            for _, page in pageIterator:
                for Tuple in page:
                    tuple_Unpacked = self.subSchema.unpack(Tuple)
                    key = self.groupExpr(tuple_Unpacked)
                    if not isinstance(key, tuple):
                        key = (key,)

                    intermediate_results = aggregator.get(key, None)

                    # If no intermediate result exists for this group yet,
                    # initialize one from the aggregates' initial values.
                    if intermediate_results is None:
                        intermediate_results = [aggExpr[0] for aggExpr in self.aggExprs]
                        aggregator[key] = intermediate_results

                    # Apply each aggregate's increment function.
                    for index, aggExpr in enumerate(self.aggExprs):
                        intermediate_results[index] = aggExpr[1](
                            intermediate_results[index], tuple_Unpacked)

            for key, intermediate_results in aggregator.items():
                # Apply each aggregate's finalization function.
                for index, aggExpr in enumerate(self.aggExprs):
                    intermediate_results[index] = aggExpr[2](
                        intermediate_results[index])

                # Output the group-by fields followed by the aggregate values.
                outputList = itertools.chain(key, intermediate_results)
                outputTuple = self.outputSchema.instantiate(*outputList)
                self.emitOutputTuple(self.outputSchema.pack(outputTuple))

            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        # Remove the temporary relations created during partitioning.
        for _, title in relMap.items():
            self.storage.removeRelation(title)

        return self.storage.pages(self.relationId())

    def partition(self, relMap):
        for (pageId, page) in iter(self.subPlan):
            for Tuple in page:
                tuple_Unpacked = self.subSchema.unpack(Tuple)
                key = self.groupExpr(tuple_Unpacked)
                if not isinstance(key, tuple):
                    key = (key,)

                value = self.groupHashFn(key)
                # If this hash value has no partition yet, create a temporary
                # file to hold the tuples that hash to it.
                if value not in relMap:
                    title = str(self.id()) + "_grp_" + str(value)
                    self.storage.createRelation(title, self.subSchema)
                    relMap[value] = title
                self.storage.insertTuple(relMap[value], Tuple)

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                                 + ", aggSchema=" + self.aggSchema.toString() + ")"
Esempio n. 37
0
class GroupBy(Operator):
  def __init__(self, subPlan, **kwargs):
    super().__init__(**kwargs)

    if self.pipelined:
      raise ValueError("Pipelined group-by-aggregate operator not supported")

    self.subPlan     = subPlan
    self.subSchema   = subPlan.schema()
    self.groupSchema = kwargs.get("groupSchema", None)
    self.aggSchema   = kwargs.get("aggSchema", None)
    self.groupExpr   = kwargs.get("groupExpr", None)
    self.aggExprs    = kwargs.get("aggExprs", None)
    self.groupHashFn = kwargs.get("groupHashFn", None)

    self.validateGroupBy()
    self.initializeSchema()

  # Perform some basic checking on the group-by operator's parameters.
  def validateGroupBy(self):
    requireAllValid = [self.subPlan, \
                       self.groupSchema, self.aggSchema, \
                       self.groupExpr, self.aggExprs, self.groupHashFn ]

    if any(map(lambda x: x is None, requireAllValid)):
      raise ValueError("Incomplete group-by specification, missing a required parameter")

    if not self.aggExprs:
      raise ValueError("Group-by needs at least one aggregate expression")

    if len(self.aggExprs) != len(self.aggSchema.fields):
      raise ValueError("Invalid aggregate fields: schema mismatch")

  # Initializes the group-by's schema as a concatenation of the group-by
  # fields and all aggregate fields.
  def initializeSchema(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    self.outputSchema = DBSchema(schema, fields)

  # Returns the output schema of this operator
  def schema(self):
    return self.outputSchema

  # Returns any input schemas for the operator if present
  def inputSchemas(self):
    return [self.subPlan.schema()]

  # Returns a string describing the operator type
  def operatorType(self):
    return "GroupBy"

  # Returns child operators if present
  def inputs(self):
    return [self.subPlan]

  # Iterator abstraction for the group-by operator.
  def __iter__(self):
    self.initializeOutput()
    self.outputIterator = self.processAllPages()
    return self

  def __next__(self):
    return next(self.outputIterator)

  # Page-at-a-time operator processing
  def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

  # Set-at-a-time operator processing
  def processAllPages(self):
    relIdMap = {}

    # Perform partition using hash function
    self.partition(relIdMap)

    # Perform group-by operation
    for hashValue, relId in relIdMap.items():
      pageIter = self.storage.pages(relId)

      aggregationResults = {} # Stores intermediate aggregation results

      for _, page in pageIter:
        for tupleP in page:
          tupleU = self.subSchema.unpack(tupleP)
          gbVal = self.getGroupByValue(tupleU)

          # Get intermediate results for this group-by value
          intermediateResults = aggregationResults.get(gbVal, None)

          if intermediateResults is None:
            intermediateResults = list()
            aggregationResults[gbVal] = intermediateResults
            for aggExpr in self.aggExprs:
              # Form a list of initial values
              intermediateResults.append(aggExpr[0])
          
          idx = 0
          for aggExpr in self.aggExprs:
            # Perform aggregation by applying the lambda function (aggExpr[1])
            intermediateResult = intermediateResults[idx]
            intermediateResults[idx] = aggExpr[1](intermediateResult, tupleU)
            idx = idx + 1

      for gbVal, intermediateResults in aggregationResults.items():
        idx = 0
        for aggExpr in self.aggExprs:
          # Perform final step by applying the lambda function (aggExpr[2])
          intermediateResult = intermediateResults[idx]
          intermediateResults[idx] = aggExpr[2](intermediateResult)
          idx = idx + 1

        # Output all group-by fields (gbVal is a tuple) followed by aggregates.
        outputList = itertools.chain(gbVal, intermediateResults)
        outputTuple = self.outputSchema.instantiate(*outputList)
        self.emitOutputTuple(self.outputSchema.pack(outputTuple))

      # No need to track anything but the last output page when in batch mode.
      if self.outputPages:
        self.outputPages = [self.outputPages[-1]]

    # Remove partitions
    for _, relId in relIdMap.items():
      self.storage.removeRelation(relId)

    return self.storage.pages(self.relationId())

  # Partitions a given relation based on some hash function 
  def partition(self, relIdMap):
    for (pageId, page) in iter(self.subPlan):
      for tupleP in page:
        # Compute hash value for every tuple
        tupleU = self.subSchema.unpack(tupleP)
        hashVal = self.groupHashFn(self.getGroupByValue(tupleU))

        # Store in temporary buckets (files)
        if hashVal not in relIdMap:
          relId = str(self.id()) + "_grp_" + str(hashVal)
          self.storage.createRelation(relId, self.subSchema)
          relIdMap[hashVal] = relId

        self.storage.insertTuple(relIdMap[hashVal], tupleP)

  def getGroupByValue(self, unpackedTuple):
    gbVal = self.groupExpr(unpackedTuple)
    # Parenthesize the singleton: a bare trailing comma would bind to the
    # whole conditional expression and wrap even tuple values again.
    return gbVal if isinstance(gbVal, tuple) else (gbVal,)

  # Plan and statistics information

  # Returns a single line description of the operator.
  def explain(self):
    return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
                             + ", aggSchema=" + self.aggSchema.toString() + ")"
Esempio n. 38
0
def tests(printOutput):
	db = setup()
	'''select
			sum(l_extendedprice * l_discount) as revenue
		from
			lineitem
		where
			l_shipdate >= 19940101
			and l_shipdate < 19950101
			and l_discount between 0.06 - 0.01 and 0.06 + 0.01
			and l_quantity < 24'''
	query1 = db.query().fromTable('lineitem') \
		.where('L_SHIPDATE >= 19940101') \
		.where('L_SHIPDATE < 19950101') \
		.where('L_DISCOUNT > 0.06 - 0.01') \
		.where('L_DISCOUNT < 0.06 + 0.01') \
		.where('L_QUANTITY < 24') \
		.groupBy(groupSchema=DBSchema('REV', [('dum', 'int')]),
								 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
								 groupExpr=(lambda e: (1)),
								 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * e.L_DISCOUNT, lambda x: x)],
								 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
		.select({'REVENUE': ('SUM', 'double')}) \
		.finalize()
		
	'''select
				sum(l_extendedprice * (1 - l_discount)) as promo_revenue
	from
				lineitem,
				part
	where
				l_partkey = p_partkey
				and l_shipdate >= 19950901
				and l_shipdate < 19951001'''
	query2 = db.query().fromTable('lineitem') \
		.join(db.query().fromTable('part'),
							rhsSchema=db.relationSchema('part'),
							method='block-nested-loops',
							expr='L_PARTKEY == P_PARTKEY') \
		.where('L_SHIPDATE >= 19950901') \
		.where('L_SHIPDATE < 19951001') \
		.groupBy(groupSchema=DBSchema('REV', [('dum', 'int')]),
								 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
								 groupExpr=(lambda e: (1)),
								 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
								 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
		.select({'PROMO_REVENUE': ('SUM', 'double')}) \
		.finalize()
		
	'''select
				l_orderkey,
				sum(l_extendedprice * (1 - l_discount)) as revenue,
				o_orderdate,
				o_shippriority
	from
				customer,
				orders,
				lineitem
	where
				c_mktsegment = 'BUILDING'
				and c_custkey = o_custkey
				and l_orderkey = o_orderkey
				and o_orderdate < 19950315
				and l_shipdate > 19950315
	group by
		l_orderkey,
				o_orderdate,
				o_shippriority'''
	query3 = db.query().fromTable('customer') \
		.join(db.query().fromTable('orders'),
							rhsSchema=db.relationSchema('orders'),
							method='block-nested-loops',
							expr='C_CUSTKEY == O_CUSTKEY') \
		.join(db.query().fromTable('lineitem'),
							rhsSchema=db.relationSchema('lineitem'),
							method='block-nested-loops',
							expr='L_ORDERKEY == O_ORDERKEY') \
		.where('C_MKTSEGMENT == \'BUILDING\'') \
		.where('O_ORDERDATE < 19950315') \
		.where('L_SHIPDATE > 19950315') \
		.groupBy(groupSchema=DBSchema('FIRST', [('L_ORDERKEY', 'int'), ('O_ORDERDATE', 'int'), ('O_SHIPPRIORITY', 'int')]),
								 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
								 groupExpr=(lambda e: (e.L_ORDERKEY, e.O_ORDERDATE, e.O_SHIPPRIORITY)),
								 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
								 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
		.select({'L_ORDERKEY': ('L_ORDERKEY', 'int'), 'REVENUE': ('SUM', 'double'), 'O_ORDERDATE': ('O_ORDERDATE', 'int'), 'O_SHIPPRIORITY': ('O_SHIPPRIORITY', 'int')}) \
		.finalize()
		
	'''select
				c_custkey,
				c_name,
				sum(l_extendedprice * (1 - l_discount)) as revenue,
				c_acctbal,
				n_name,
				c_address,
				c_phone,
				c_comment
	from
				customer,
				orders,
				lineitem,
				nation
	where
				c_custkey = o_custkey
				and l_orderkey = o_orderkey
				and o_orderdate >= 19931001
				and o_orderdate < 19940101
				and l_returnflag = 'R'
				and c_nationkey = n_nationkey
	group by
				c_custkey,
				c_name,
				c_acctbal,
				c_phone,
				n_name,
				c_address,
				c_comment'''
	query4 = db.query().fromTable('customer') \
		.join(db.query().fromTable('orders'),
							rhsSchema=db.relationSchema('orders'),
							method='block-nested-loops',
							expr='C_CUSTKEY == O_CUSTKEY') \
		.join(db.query().fromTable('lineitem'),
							rhsSchema=db.relationSchema('lineitem'),
							method='block-nested-loops',
							expr='L_ORDERKEY == O_ORDERKEY') \
		.where('O_ORDERDATE >= 19931001') \
		.where('O_ORDERDATE < 19940101') \
		.where('L_RETURNFLAG == \'R\'') \
		.join(db.query().fromTable('nation'),
							rhsSchema=db.relationSchema('nation'),
							method='block-nested-loops',
							expr='C_NATIONKEY == N_NATIONKEY') \
		.groupBy(groupSchema=DBSchema('FIRST', [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'), ('C_ACCTBAL', 'double'), ('C_PHONE', 'char(15)'), ('N_NAME', 'char(25)'),('C_ADDRESS', 'char(40)'),('C_COMMENT', 'char(117)')]),
								 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
								 groupExpr=(lambda e: (e.C_CUSTKEY, e.C_NAME, e.C_ACCTBAL, e.C_PHONE, e.N_NAME, e.C_ADDRESS, e.C_COMMENT)),
								 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
								 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
		.select({'C_CUSTKEY' : ('C_CUSTKEY', 'int'), 'C_NAME' : ('C_NAME', 'char(25)'),'SUM' : ('SUM', 'double'), 'C_ACCTBAL' : ('C_ACCTBAL', 'double'), 'C_PHONE' : ('C_PHONE', 'char(15)'), 'N_NAME' : ('N_NAME', 'char(25)'),'C_ADDRESS' : ('C_ADDRESS', 'char(40)'),'C_COMMENT' : ('C_COMMENT', 'char(117)')}) \
		.finalize()
		
	'''select
				n_name,
				sum(l_extendedprice * (1 - l_discount)) as revenue
	from
				customer,
				orders,
				lineitem,
				supplier,
				nation,
				region
	where
				c_custkey = o_custkey
				and l_orderkey = o_orderkey
				and l_suppkey = s_suppkey
				and c_nationkey = s_nationkey
				and s_nationkey = n_nationkey
				and n_regionkey = r_regionkey
				and r_name = 'ASIA'
				and o_orderdate >= 19940101
				and o_orderdate < 19950101
	group by
				n_name'''
	query5 = db.query().fromTable('customer') \
		.join(db.query().fromTable('orders'),
							rhsSchema=db.relationSchema('orders'),
							method='block-nested-loops',
							expr='C_CUSTKEY == O_CUSTKEY') \
		.join(db.query().fromTable('lineitem'),
							rhsSchema=db.relationSchema('lineitem'),
							method='block-nested-loops',
							expr='L_ORDERKEY == O_ORDERKEY') \
		.join(db.query().fromTable('supplier'),
							rhsSchema=db.relationSchema('supplier'),
							method='block-nested-loops',
							expr='L_SUPPKEY == S_SUPPKEY and C_NATIONKEY == S_NATIONKEY') \
		.join(db.query().fromTable('nation'),
							rhsSchema=db.relationSchema('nation'),
							method='block-nested-loops',
							expr='S_NATIONKEY == N_NATIONKEY') \
		.join(db.query().fromTable('region'),
							rhsSchema=db.relationSchema('region'),
							method='block-nested-loops',
							expr='N_REGIONKEY == R_REGIONKEY') \
		.where('R_NAME == \'ASIA\'') \
		.where('O_ORDERDATE >= 19940101') \
		.where('O_ORDERDATE < 19950101') \
		.groupBy(groupSchema=DBSchema('FIRST', [('N_NAME', 'char(25)')]),
								 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
								 groupExpr=(lambda e: e.N_NAME),
								 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
								 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
		.select({'N_NAME': ('N_NAME', 'char(25)'), 'SUM' : ('SUM', 'double')}) \
		.finalize()
	

	queries = [query1, query2, query3, query4, query5]
	names = ['query1', 'query2', 'query3', 'query4', 'query5']
	opt = Optimizer.Optimizer(db)
	for i,q in enumerate(queries):
		querytest(db, printOutput, q, names[i], opt)
Esempio n. 39
0
from Storage.Page import Page
from Storage.SlottedPage import SlottedPage
from Storage.File import StorageFile
from Storage.FileManager import FileManager
from Storage.BufferPool import BufferPool
from Catalog.Identifiers import FileId, PageId, TupleId
from Catalog.Schema import DBSchema

import sys
import unittest

# A schema to work with
schema = DBSchema('employee', [('id', 'int'), ('age', 'int')])

# Initialize a bufferpool and filemanager
def makeDB():
  bp = BufferPool()
  fm = FileManager(bufferPool=bp)
  bp.setFileManager(fm)
  return (bp, fm)
 
# Make an employee 
def makeEmployee(n):
  return schema.instantiate(n, 25 + n)

if __name__ == "__main__":
  print("Creating")
  (bp, fm) = makeDB()
  fm.createRelation(schema.name, schema)
  (fId, f) = fm.relationFile(schema.name)