def query2Hash_test(db, printOutput): query2 = db.query().fromTable('part') \ .join(db.query().fromTable('lineitem').where('L_RETURNFLAG == \'R\''), rhsSchema=db.relationSchema('lineitem'), method='hash', lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]), rhsHashFn='hash(L_PARTKEY) % 111', rhsKeySchema=DBSchema('L_PARTKEY', [('L_PARTKEY', 'int')])) \ .groupBy(groupSchema=DBSchema('P_NAME', [('P_NAME', 'char(55)')]), aggSchema=DBSchema('COUNT', [('COUNT', 'int')]), groupExpr=(lambda e: e.P_NAME), aggExprs=[(0, lambda acc, e: acc + 1, lambda x: x)], groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \ .finalize() start = time.time() processedQuery = [ query2.schema().unpack(tup) for page in db.processQuery(query2) for tup in page[1] ] end = time.time() duration = end - start print('Query: Query 2, Hash') if printOutput: print('Results: ', processedQuery) print('Time', duration) return
def query1(db):
    query = db.query().fromTable('lineitem') \
        .where(
            "(L_SHIPDATE >= 19940101) and (L_SHIPDATE < 19950101) and "
            "(0.06 - 0.01 <= L_DISCOUNT <= 0.06 + 0.01) and (L_QUANTITY < 24)") \
        .groupBy(
            groupSchema=DBSchema('groupKey', [('ONE', 'int')]),
            groupExpr=(lambda e: 1),
            aggSchema=DBSchema('groupBy', [('revenue', 'float')]),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * e.L_DISCOUNT),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 1)) \
        .select({'revenue': ('revenue', 'float')}) \
        .finalize()
    return query
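# Usage sketch for query1 (assumptions: `db` is a Database instance with the
# TPC-H 'lineitem' table already loaded, as in the test drivers above, and
# `time` is imported at module scope). Illustrative only; it mirrors the
# unpack-per-page pattern used by query2Hash_test.
def runQuery1(db):
    q = query1(db)
    start = time.time()
    results = [q.schema().unpack(tup)
               for page in db.processQuery(q)
               for tup in page[1]]
    print('Query 1 revenue:', results, 'in', time.time() - start, 's')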
def testHashJoin(self): schema = self.db.relationSchema('employee') e2schema = schema.rename('employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'}) keySchema = DBSchema('employeeKey', [('id', 'int')]) keySchema2 = DBSchema('employeeKey2', [('id2', 'int')]) hashJoin = self.db.query().fromTable('employee').join( \ self.db.query().fromTable('employee'), \ rhsSchema=e2schema, \ method='hash', \ lhsHashFn='hash(id) % 4', lhsKeySchema=keySchema, \ rhsHashFn='hash(id2) % 4', rhsKeySchema=keySchema2, \ ).finalize() results = self.getResults(hashJoin) self.assertEqual(len(results), self.numEmployees)
def validateSchema(self): if self.lhsPlan.schema().match(self.rhsPlan.schema()): schemaName = self.operatorType() + str(self.id()) schemaFields = self.lhsPlan.schema().schema() self.unionSchema = DBSchema(schemaName, schemaFields) else: raise ValueError("Union operator type error, mismatched input schemas")
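# Illustrative check of the validation above (assumes the query builder's
# union() used elsewhere in this section): two scans of the same relation
# share a schema, so validateSchema() succeeds; unioning relations with
# different schemas raises the ValueError above.
compatibleUnion = db.query().fromTable('employee') \
                    .union(db.query().fromTable('employee')) \
                    .finalize()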
def unpack(cls, buffer): brepr = cls.binrepr(buffer) values = brepr.unpack_from(buffer) if len(values) == 7: pageClass = pickle.loads(values[5]) schema = DBSchema.unpackSchema(values[6]) return FileHeader(numTuples=values[1], pageSize=values[2], pageClass=pageClass, schema=schema)
def unpack(cls, buffer): brepr = cls.binrepr(buffer) values = brepr.unpack_from(buffer) if len(values) == 6: pageClass = pickle.loads(values[4]) schema = DBSchema.unpackSchema(values[5]) return FileHeader(pageSize=values[1], pageClass=pageClass, schema=schema)
def createRelation(self, relationName, relationFields): if relationName not in self.relationMap: schema = DBSchema(relationName, relationFields) self.relationMap[relationName] = schema self.storage.createRelation(relationName, schema) self.checkpoint() else: raise ValueError("Relation '" + relationName + "' already exists")
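# Usage sketch for createRelation (the relation name and fields here are
# illustrative): create a relation, fetch its schema back, and insert one
# packed tuple, matching the insertTuple(relationName, tuple) pattern used
# in the test setup below.
db.createRelation('department', [('d_id', 'int'), ('d_name', 'char(25)')])
deptSchema = db.relationSchema('department')
db.insertTuple('department', deptSchema.pack(deptSchema.instantiate(1, 'Sales')))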
def query2(db): query = db.query().fromTable('part')\ .join( db.query().fromTable('lineitem').where("(L_SHIPDATE >= 19950901) and (L_SHIPDATE < 19951001)"), method='hash', lhsHashFn='hash(P_PARTKEY) % 7', lhsKeySchema=DBSchema('partkey2',[('P_PARTKEY', 'int')]), rhsHashFn='hash(L_PARTKEY) % 7', rhsKeySchema=DBSchema('partkey1', [('L_PARTKEY', 'int')])) \ .groupBy( groupSchema=DBSchema('groupKey', [('ONE', 'int')]), aggSchema=DBSchema('groupBy', [('promo_revenue', 'float')]), groupExpr=(lambda e: 1), aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)), lambda x: x)], groupHashFn=(lambda gbVal: hash(gbVal) % 1))\ .select( {'promo_revenue' : ('promo_revenue', 'float')}).finalize() return query
def query3bBNL_test(db, printOutput): query3 = db.query().fromTable('part') \ .join(db.query().fromTable('lineitem'), rhsSchema=db.relationSchema('lineitem'), method='block-nested-loops', expr='L_PARTKEY == P_PARTKEY') \ .join(db.query().fromTable('orders'), rhsSchema=db.relationSchema('orders'), method='block-nested-loops', expr='L_ORDERKEY == O_ORDERKEY') \ .join(db.query().fromTable('customer'), rhsSchema=db.relationSchema('customer'), method='block-nested-loops', expr='O_CUSTKEY == C_CUSTKEY') \ .join(db.query().fromTable('nation'), rhsSchema=db.relationSchema('nation'), method='block-nested-loops', expr='C_NATIONKEY == N_NATIONKEY') \ .groupBy(groupSchema=DBSchema('FIRST', [('N_NAME', 'char(25)'), ('P_NAME', 'char(55)')]), aggSchema=DBSchema('SUM', [('SUM', 'double')]), groupExpr=(lambda e: (e.N_NAME, e.P_NAME)), aggExprs=[(0, lambda acc, e: acc + e.L_QUANTITY, lambda x: x)], groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \ .groupBy(groupSchema=DBSchema('SECOND', [('N_NAME', 'char(25)')]), aggSchema=DBSchema('MAX', [('MAX', 'double')]), groupExpr=(lambda e: e.N_NAME), aggExprs=[(0, lambda acc, e: max(acc, e.SUM), lambda x: x)], groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \ .select({'NATION': ('N_NAME', 'char(25)'), 'MAX': ('MAX', 'double')}) \ .finalize() start = time.time() processedQuery = [ query3.schema().unpack(tup) for page in db.processQuery(query3) for tup in page[1] ] end = time.time() duration = end - start print('Query: Query 3b (Order Reversed), BNL') if printOutput: print('Results: ', processedQuery) print('Time', duration) return
def testGroupBy(self): # SELECT id, min(age), max(age) FROM Employee GROUP BY (id % 2) aggMinMaxSchema = DBSchema('minmax', [('minAge', 'int'), ('maxAge', 'int')]) keySchema = DBSchema('employeeKey', [('id', 'int')]) groupBy = self.db.query().fromTable('employee').groupBy( \ groupSchema=keySchema, \ aggSchema=aggMinMaxSchema, \ groupExpr=(lambda e: e.id % 2), \ aggExprs=[(sys.maxsize, lambda acc, e: min(acc, e.age), lambda x: x), \ (0, lambda acc, e: max(acc, e.age), lambda x: x)], \ groupHashFn=(lambda gbVal: hash(gbVal[0]) % 2) \ ).finalize() results = self.getResults(groupBy) self.assertEqual(len(results), 2) for result in results: if result.id == 0: self.assertEqual(result.minAge, 20) self.assertEqual(result.maxAge, 56) else: self.assertEqual(result.minAge, 22) self.assertEqual(result.maxAge, 58)
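# Sanity check of the expected aggregates above: ages are 2*i + 20 for
# i in 0..19 and groups are formed by id parity, so the even group spans
# ages 20..56 and the odd group 22..58.
evenAges = [2 * i + 20 for i in range(20) if i % 2 == 0]
oddAges = [2 * i + 20 for i in range(20) if i % 2 == 1]
assert (min(evenAges), max(evenAges)) == (20, 56)
assert (min(oddAges), max(oddAges)) == (22, 58)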
def query3(db): query = db.query().fromTable('customer')\ .join( db.query().fromTable('orders'), method='hash', lhsHashFn='hash(C_CUSTKEY) % 5', lhsKeySchema=DBSchema('customerKey1', [('C_CUSTKEY', 'int')]), rhsHashFn='hash(O_CUSTKEY) % 5', rhsKeySchema=DBSchema('customerKey2', [('O_CUSTKEY', 'int')]))\ .join( db.query().fromTable('lineitem'), method='hash', lhsHashFn='hash(O_ORDERKEY) % 5', lhsKeySchema=DBSchema('orderKey1', [('O_ORDERKEY', 'int')]), rhsHashFn='hash(L_ORDERKEY) % 5', rhsKeySchema=DBSchema('orderkey2', [('L_ORDERKEY', 'int')]))\ .where( "C_MKTSEGMENT == 'BUILDING' and O_ORDERDATE < 19950315 and L_SHIPDATE > 19950315")\ .groupBy( groupSchema=DBSchema('groupKey', [('L_ORDERKEY', 'int'), ('O_ORDERDATE', 'int'), ('O_SHIPPRIORITY', 'int')]), aggSchema=DBSchema('groupAgg', [('revenue', 'float')]), groupExpr=(lambda e: (e.L_ORDERKEY, e.O_ORDERDATE, e.O_SHIPPRIORITY)), aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)), lambda x: x)], groupHashFn=(lambda gbVal: hash(gbVal) % 10))\ .select( {'l_orderkey': ('L_ORDERKEY', 'int'), 'revenue': ('revenue', 'float'), 'o_orderdate': ('O_ORDERDATE', 'int'), 'o_shippriority': ('O_SHIPPRIORITY', 'int')})\ .finalize() return query
def processAllPages(self):
    schema = self.operatorType() + str(self.id())
    fields = self.groupSchema.schema() + self.aggSchema.schema()
    outputSchema = DBSchema(schema, fields)

    # Partition the input by hashed group value, one temporary relation
    # per partition.
    relIds = []
    for (pageId, page) in iter(self.subPlan):
        for tpl in page:
            group = self.groupExpr(self.subSchema.unpack(tpl))
            key = self.groupHashFn((group, None))
            relId = str(self.id()) + "u" + str(key)
            # Create each partition relation exactly once.
            if relId not in relIds:
                self.storage.createRelation(relId, self.subSchema)
                relIds.append(relId)
            self.storage.insertTuple(relId, tpl)

    # Aggregate each partition in memory.
    for rid in relIds:
        groupDict = {}
        for (pageId, page) in self.storage.pages(rid):
            for tpl in page:
                groupKey = self.groupExpr(self.subSchema.unpack(tpl))
                if groupKey not in groupDict:
                    groupDict[groupKey] = [trio[0] for trio in self.aggExprs]
                for i in range(len(self.aggExprs)):
                    groupDict[groupKey][i] = self.aggExprs[i][1](
                        groupDict[groupKey][i], self.subSchema.unpack(tpl))
        # Finalize each group's aggregates and emit one output tuple per group.
        for key in groupDict:
            for i in range(len(self.aggExprs)):
                groupDict[key][i] = self.aggExprs[i][2](groupDict[key][i])
        for key in groupDict:
            outTuple = outputSchema.instantiate(key, *[f for f in groupDict[key]])
            self.emitOutputTuple(outputSchema.pack(outTuple))

    return self.storage.pages(self.relationId())
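# A stripped-down, storage-free model of the partition-then-aggregate logic
# above, using a plain dict in place of partition relations (names here are
# hypothetical, for illustration only):
def groupAggregate(tuples, groupExpr, aggExprs):
    groups = {}
    for t in tuples:
        key = groupExpr(t)
        # Initialize accumulators from each (initial, step, finalize) triple.
        accs = groups.setdefault(key, [init for (init, _, _) in aggExprs])
        groups[key] = [step(acc, t) for (acc, (_, step, _)) in zip(accs, aggExprs)]
    return {key: [fin(acc) for (acc, (_, _, fin)) in zip(accs, aggExprs)]
            for (key, accs) in groups.items()}

# e.g. counting by parity:
# groupAggregate(range(10), lambda e: e % 2, [(0, lambda a, e: a + 1, lambda x: x)])
# yields {0: [5], 1: [5]}.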
def testIndexJoin(self): schema = self.db.relationSchema('employee') e2schema = schema.rename('employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'}) lhsKeySchema = DBSchema('employeeAge', [('age2', 'int')]) join = self.db.query().fromTable('employee').join( \ self.db.query().fromTable('employee'), \ lhsSchema=e2schema, \ lhsKeySchema=lhsKeySchema, \ method='indexed', \ indexId=self.idxId, \ expr='True' ).finalize() results = self.getResults(join) self.assertEqual(len(results), self.numEmployees)
def setUp(self): warnings.simplefilter("ignore", ResourceWarning) # Start with a fresh 'employee' table self.db = Database.Database() self.db.createRelation('employee', [('id', 'int'), ('age', 'int'), ('dept_id', 'int')]) # Populate employees with department 0 or 1 self.numEmployees = 20 empSchema = self.db.relationSchema('employee') # Create an index over employee age keySchema = DBSchema('employeeAge', [('age', 'int')]) self.idxId = self.db.storageEngine().createIndex('employee', empSchema, keySchema, False) for tup in [empSchema.pack(empSchema.instantiate(i, 2 * i + 20, i % 2)) for i in range(self.numEmployees)]: self.db.insertTuple(empSchema.name, tup)
def query1Hash_test(db, printOutput): query1 = db.query().fromTable('part') \ .join(db.query().fromTable('partsupp').where('PS_AVAILQTY == 1'), rhsSchema=db.relationSchema('partsupp'), method='hash', lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]), rhsHashFn='hash(PS_PARTKEY) % 111', rhsKeySchema=DBSchema('PS_PARTKEY', [('PS_PARTKEY', 'int')])) \ .join(db.query().fromTable('supplier'), rhsSchema=db.relationSchema('supplier'), method='hash', lhsHashFn='hash(PS_SUPPKEY) % 111', lhsKeySchema=DBSchema('PS_SUPPKEY', [('PS_SUPPKEY', 'int')]), rhsHashFn='hash(S_SUPPKEY) % 111', rhsKeySchema=DBSchema('S_SUPPKEY', [('S_SUPPKEY', 'int')])) \ .union( db.query().fromTable('part') .join(db.query().fromTable('partsupp').where('PS_SUPPLYCOST < 5'), rhsSchema=db.relationSchema('partsupp'), method='hash', lhsHashFn='hash(P_PARTKEY) % 111', lhsKeySchema=DBSchema('P_PARTKEY', [('P_PARTKEY', 'int')]), rhsHashFn='hash(PS_PARTKEY) % 111', rhsKeySchema=DBSchema('PS_PARTKEY', [('PS_PARTKEY', 'int')])) \ .join(db.query().fromTable('supplier'), rhsSchema=db.relationSchema('supplier'), method='hash', lhsHashFn='hash(PS_SUPPKEY) % 111', lhsKeySchema=DBSchema('PS_SUPPKEY', [('PS_SUPPKEY', 'int')]), rhsHashFn='hash(S_SUPPKEY) % 111', rhsKeySchema=DBSchema('S_SUPPKEY', [('S_SUPPKEY', 'int')]))) \ .select({'P_NAME': ('P_NAME', 'char(55)'), 'S_NAME': ('S_NAME', 'char(25)')}) \ .finalize() start = time.time() processedQuery = [ query1.schema().unpack(tup) for page in db.processQuery(query1) for tup in page[1] ] end = time.time() duration = end - start print('Query: Query 1, Hash') if printOutput: print('Results: ', processedQuery) print('Time', duration) return
def query4(db):
    query = db.query().fromTable('nation') \
        .join(db.query().fromTable('customer'),
              method='hash',
              lhsHashFn='hash(N_NATIONKEY) % 11',
              lhsKeySchema=DBSchema('nationKey1', [('N_NATIONKEY', 'int')]),
              rhsHashFn='hash(C_NATIONKEY) % 11',
              rhsKeySchema=DBSchema('nationKey2', [('C_NATIONKEY', 'int')])) \
        .join(db.query().fromTable('orders'),
              method='hash',
              lhsHashFn='hash(C_CUSTKEY) % 11',
              lhsKeySchema=DBSchema('custKey1', [('C_CUSTKEY', 'int')]),
              rhsHashFn='hash(O_CUSTKEY) % 11',
              rhsKeySchema=DBSchema('custKey2', [('O_CUSTKEY', 'int')])) \
        .join(db.query().fromTable('lineitem'),
              method='hash',
              lhsHashFn='hash(O_ORDERKEY) % 11',
              lhsKeySchema=DBSchema('orderKey1', [('O_ORDERKEY', 'int')]),
              rhsHashFn='hash(L_ORDERKEY) % 11',
              rhsKeySchema=DBSchema('orderKey2', [('L_ORDERKEY', 'int')])) \
        .where("O_ORDERDATE >= 19931001 and O_ORDERDATE < 19940101 and L_RETURNFLAG == 'R'") \
        .groupBy(
            groupSchema=DBSchema('groupKey',
                                 [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'),
                                  ('C_ACCTBAL', 'float'), ('C_PHONE', 'char(15)'),
                                  ('N_NAME', 'char(25)'), ('C_ADDRESS', 'char(40)'),
                                  ('C_COMMENT', 'char(117)')]),
            aggSchema=DBSchema('groupAgg', [('revenue', 'float')]),
            # One group value per groupSchema field.
            groupExpr=(lambda e: (e.C_CUSTKEY, e.C_NAME, e.C_ACCTBAL, e.C_PHONE,
                                  e.N_NAME, e.C_ADDRESS, e.C_COMMENT)),
            aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT)),
                       lambda x: x)],
            groupHashFn=(lambda gbVal: hash(gbVal) % 11)) \
        .select({'c_custkey': ('C_CUSTKEY', 'int'),
                 'c_name': ('C_NAME', 'char(25)'),
                 'revenue': ('revenue', 'float'),
                 'c_acctbal': ('C_ACCTBAL', 'float'),
                 'n_name': ('N_NAME', 'char(25)'),
                 'c_address': ('C_ADDRESS', 'char(40)'),
                 'c_phone': ('C_PHONE', 'char(15)'),
                 'c_comment': ('C_COMMENT', 'char(117)')}) \
        .finalize()
    return query
def validate(self, operator, storage): if operator.operatorType() == "TableScan": return operator.schema() elif operator.operatorType() == "Select": operator.storage = storage return self.validate(operator.subPlan, storage) elif operator.operatorType() == "Project": operator.storage = storage self.validate(operator.subPlan, storage) return DBSchema( operator.relationId(), \ [(k, v[1]) for (k,v) in operator.projectExprs.items()]) elif operator.operatorType() == "GroupBy": self.validate(operator.subPlan, storage) operator.storage = storage return operator.schema() elif operator.operatorType() == "UnionAll": operator.storage = storage return self.validate(operator.subPlan, storage) else: operator.lhsSchema = self.validate(operator.lhsPlan, storage) operator.rhsSchema = self.validate(operator.rhsPlan, storage) operator.initializeSchema() operator.storage = storage return operator.schema()
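# Illustrative expansion of the Project branch above: projectExprs maps each
# output field name to an (expression, type) pair, so the derived schema keeps
# just the (name, type) pairs. The names below are hypothetical.
projectExprs = {'revenue': ('revenue', 'float')}
projSchema = DBSchema('Project1', [(k, v[1]) for (k, v) in projectExprs.items()])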
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self): raise NotImplementedError def __next__(self): raise NotImplementedError # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in iter(self.lhsPlan): for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in iter(self.rhsPlan): for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): raise NotImplementedError def blockNestedLoops(self): raise NotImplementedError ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): raise NotImplementedError ################################## # # Hash join implementation. # def hashJoin(self): raise NotImplementedError # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "indexKeySchema=" + self.lhsKeySchema.toString() ] ))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString() , "rhsKeySchema=" + self.rhsKeySchema.toString() , "lhsHashFn='" + self.lhsHashFn + "'" , "rhsHashFn='" + self.rhsHashFn + "'" ] ))) + ")" return super().explain() + exprs
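# One plausible accessPageBlock sketch for the stub above (assumptions:
# bufPool exposes numFreePages() and pinPage(), as used elsewhere in this
# section); a fuller blockNestedLoops implementation appears in a later
# Join variant below.
def accessPageBlockSketch(self, bufPool, pageIterator):
    pageBlock = []
    try:
        # Pin pages for the outer block until the pool runs out of frames.
        while bufPool.numFreePages() > 0:
            (pageId, page) = next(pageIterator)
            bufPool.pinPage(pageId)
            pageBlock.append((pageId, page))
    except StopIteration:
        pass
    return pageBlock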
part = db.query().fromTable('part').select({ 'P_NAME': ('P_NAME', 'char(55)'), 'P_PARTKEY': ('P_PARTKEY', 'int') }) partsupp = db.query().fromTable('partsupp').where('PS_AVAILQTY == 1').select({ 'PS_PARTKEY': ('PS_PARTKEY', 'int'), 'PS_SUPPKEY': ('PS_SUPPKEY', 'int') }) supplier = db.query().fromTable('supplier').select({ 'S_NAME': ('S_NAME', 'char(25)'), 'S_SUPPKEY': ('S_SUPPKEY', 'int') }) join_ps_p = partsupp.join(\ part, \ rhsSchema = DBSchema('part', [('P_NAME','char(55)'), ('P_PARTKEY','int')]), \ method = 'block-nested-loops', \ expr = 'PS_PARTKEY == P_PARTKEY') join_three = join_ps_p.join(\ supplier, \ rhsSchema = DBSchema('supplier', [('S_NAME','char(25)'), ('S_SUPPKEY', 'int')]), \ method = 'block-nested-loops', expr = 'PS_SUPPKEY == S_SUPPKEY' ).select({'P_NAME':('P_NAME','char(55)'), 'S_NAME':('S_NAME','char(25)')}) partsupp2 = db.query().fromTable('partsupp').where('PS_SUPPLYCOST < 5').select( { 'PS_PARTKEY': ('PS_PARTKEY', 'int'), 'PS_SUPPKEY': ('PS_SUPPKEY', 'int') })
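# Presumably the plan continues symmetrically with partsupp2 (cf. the unioned
# second branch in query1Hash_test above); a sketch of the mirrored join:
join_ps_p2 = partsupp2.join(
    part,
    rhsSchema=DBSchema('part', [('P_NAME', 'char(55)'), ('P_PARTKEY', 'int')]),
    method='block-nested-loops',
    expr='PS_PARTKEY == P_PARTKEY')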
'''
exp 1

TPC-H Query 6: a 4-chain filter and aggregate query

select
    sum(l_extendedprice * l_discount) as revenue
from
    lineitem
where
    l_shipdate >= 19940101
    and l_shipdate < 19950101
    and l_discount between 0.06 - 0.01 and 0.06 + 0.01
    and l_quantity < 24
'''
groupKeySchema = DBSchema('groupKey', [('ONE', 'int')])
groupAggSchema = DBSchema('groupBy', [('revenue', 'float')])
query1 = db.query().fromTable('lineitem').where( \
    "(L_SHIPDATE >= 19940101) and (L_SHIPDATE < 19950101) and \
    (0.06 - 0.01 <= L_DISCOUNT <= 0.06 + 0.01) and (L_QUANTITY < 24)" \
    ).groupBy( \
        groupSchema=groupKeySchema, \
        aggSchema=groupAggSchema, \
        groupExpr=(lambda e: 1), \
        aggExprs=[(0, lambda acc, e: acc + (e.L_EXTENDEDPRICE * e.L_DISCOUNT), lambda x: x)], \
        groupHashFn=(lambda gbVal: hash(gbVal) % 1)).select( \
            {'revenue': ('revenue', 'float')}).finalize()

'''
exp 2

TPC-H Query 14: a 2-way join and aggregate query
        schema1.pack(
            schema1.instantiate(i, random.randint(0, 50), 'This is a testing tuple.'))
        for i in range(0, 10000)
]:
    _ = db.insertTuple(schema1.name, tup)

for tup in [
        schema2.pack(
            schema2.instantiate(i, random.randint(0, 50), 'This is a testing tuple.'))
        for i in range(0, 10000)
]:
    _ = db.insertTuple(schema2.name, tup)

keySchema = DBSchema('employeeKey', [('id', 'int')])
keySchema2 = DBSchema('employeeKey2', [('id2', 'int')])

# Hash functions are eval-able string expressions over the tuple's fields
# (the Join operator evals them and splices them into explain()), not lambdas.
query5 = db.query().fromTable('employeeeee').join( \
    db.query().fromTable('employeeeee2'), \
    rhsSchema=schema2, \
    method='hash', \
    lhsHashFn='hash(id) % 4', lhsKeySchema=keySchema, \
    rhsHashFn='hash(id2) % 4', rhsKeySchema=keySchema2, \
    ).finalize()

print(query5.explain())

def readResult():
    for page in db.processQuery(query5):
def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields)
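# e.g. with the test's schemas, the concatenation above yields the output
# fields [('id', 'int'), ('minAge', 'int'), ('maxAge', 'int')] (this assumes
# DBSchema.schema() returns the field list, as its uses in this section imply):
out = DBSchema('GroupByExample',
               DBSchema('employeeKey', [('id', 'int')]).schema() +
               DBSchema('minmax', [('minAge', 'int'), ('maxAge', 'int')]).schema())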
def checkJoins(self, joins, allAttributes, cachedLHS, cachedRHS):
    for join in joins:
        if join.joinMethod == 'hash':
            lhsCheck = join.lhsKeySchema.fields  # This is a list
            rhsCheck = join.rhsKeySchema.fields  # This is a list
            joinAttr = lhsCheck + rhsCheck
            contains = all(x in allAttributes for x in joinAttr) and any(
                y in cachedRHS.schema().fields for y in joinAttr)
            if contains:
                keySchema1 = DBSchema('keySchema1', [(lhsCheck[0], 'int')])
                keySchema2 = DBSchema('keySchema2', [(rhsCheck[0], 'int')])
                hashFn1 = 'hash(' + lhsCheck[0] + ') % 7'
                hashFn2 = 'hash(' + rhsCheck[0] + ') % 7'
                joinExpr = lhsCheck[0] + ' == ' + rhsCheck[0]
                if lhsCheck[0] in cachedLHS.schema().fields:
                    return True, {'joinMethod': 'hash',
                                  'lhsHashFn': hashFn1, 'lhsKeySchema': keySchema1,
                                  'rhsHashFn': hashFn2, 'rhsKeySchema': keySchema2,
                                  'joinExpr': joinExpr}
                else:
                    return True, {'joinMethod': 'hash',
                                  'lhsHashFn': hashFn2, 'lhsKeySchema': keySchema2,
                                  'rhsHashFn': hashFn1, 'rhsKeySchema': keySchema1,
                                  'joinExpr': joinExpr}
        elif join.joinExpr:
            joinAttr = ExpressionInfo(join.joinExpr).getAttributes()
            contains = all(x in allAttributes for x in joinAttr)
            if contains:
                joinAttr1 = joinAttr.pop()
                joinAttr2 = joinAttr.pop()
                try:
                    joinAttr.pop()
                except KeyError:
                    pass  # There are only two attributes in this, as expected.
                # Schema fields are (name, type) pairs; the two key schemas
                # need distinct names.
                keySchema1 = DBSchema('keySchema1', [(joinAttr1, 'int')])
                keySchema2 = DBSchema('keySchema2', [(joinAttr2, 'int')])
                hashFn1 = 'hash(' + joinAttr1 + ') % 7'
                hashFn2 = 'hash(' + joinAttr2 + ') % 7'
                joinExpr = join.joinExpr
                if joinAttr1 in cachedLHS.schema().fields:
                    return True, {'joinMethod': 'hash',
                                  'lhsHashFn': hashFn1, 'lhsKeySchema': keySchema1,
                                  'rhsHashFn': hashFn2, 'rhsKeySchema': keySchema2,
                                  'joinExpr': joinExpr}
                else:
                    return True, {'joinMethod': 'hash',
                                  'lhsHashFn': hashFn2, 'lhsKeySchema': keySchema2,
                                  'rhsHashFn': hashFn1, 'rhsKeySchema': keySchema1,
                                  'joinExpr': joinExpr}
        else:
            return False, None
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) def localCost(self, estimated): tupleSizeLeft = self.lhsPlan.schema().size numTuplesLeft = self.lhsPlan.cardinality(estimated) tupleSizeRight = self.rhsPlan.schema().size numTuplesRight = self.rhsPlan.cardinality(estimated) pageSize = self.storage.bufferPool.pageSize numPagesLeft = (tupleSizeLeft * numTuplesLeft) // pageSize numPagesRight = (tupleSizeRight * numTuplesRight) // pageSize if self.joinMethod == "nested-loops": return (numTuplesLeft * self.tupleCost * numTuplesRight * self.tupleCost) + (numTuplesLeft * self.tupleCost) #return (numTuplesLeft * numPagesRight) + numPagesLeft elif self.joinMethod == "block-nested-loops": return (numTuplesLeft * self.tupleCost) + (((numTuplesLeft * self.tupleCost)// (self.storage.bufferPool.numPages() - 2)) * (numTuplesRight * self.tupleCost)) #return numPagesLeft + ((numPagesLeft // (self.storage.bufferPool.numPages() - 2)) * numPagesRight) elif self.joinMethod == "indexed": raise NotImplementedError elif self.joinMethod == "hash": return 3 * ((numTuplesLeft * self.tupleCost) + (numTuplesRight * self.tupleCost)) else: return None # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete join specification, missing join operator parameter") # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError("Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. 
def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops' : 'NL' , 'block-nested-loops' : 'BNL' , 'indexed' : 'Index' , 'hash' : 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. def __iter__(self): self.initializeOutput() self.partitionFiles = {0:{}, 1:{}} self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Set-at-a-time operator processing def processAllPages(self): if self.joinMethod == "nested-loops": return self.nestedLoops() elif self.joinMethod == "block-nested-loops": return self.blockNestedLoops() elif self.joinMethod == "indexed": return self.indexedNestedLoops() elif self.joinMethod == "hash": return self.hashJoin() else: raise ValueError("Invalid join method in join operator") ################################## # # Nested loops implementation # def nestedLoops(self): for (lPageId, lhsPage) in self.lhsPlan: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in self.rhsPlan: for rTuple in rhsPage: # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Block nested loops implementation # # This attempts to use all the free pages in the buffer pool # for its block of the outer relation. # Accesses a block of pages from an iterator. # This method pins pages in the buffer pool during its access. # We track the page ids in the block to unpin them after processing the block. def accessPageBlock(self, bufPool, pageIterator): pageBlock = [] try: while True: (pageId, page) = next(pageIterator) pageBlock.append((pageId, page)) bufPool.pinPage(pageId) if bufPool.numFreePages() == 0: break except StopIteration: pass return pageBlock def blockNestedLoops(self): # Access the outer relation's block, pinning pages in the buffer pool. bufPool = self.storage.bufferPool lhsIter = iter(self.lhsPlan) lPageBlock = self.accessPageBlock(bufPool, lhsIter) while lPageBlock: for (lPageId, lhsPage) in lPageBlock: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for (rPageId, rhsPage) in self.rhsPlan: for rTuple in rhsPage: # Load the RHS tuple fields. 
joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate the join predicate, and output if we have a match. if eval(self.joinExpr, globals(), joinExprEnv): outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Unpin the page after joining with the RHS relation. # Thus future accesses can evict the page while reading the next block. bufPool.unpinPage(lPageId) # Move to the next page block after processing it. lPageBlock = self.accessPageBlock(bufPool, lhsIter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) ################################## # # Indexed nested loops implementation # # TODO: test def indexedNestedLoops(self): if self.storage.getIndex(self.indexId) is None: raise ValueError("Missing index in storage manager: %s" % self.indexId) if self.indexId: bufPool = self.storage.bufferPool for (lPageId, lhsPage) in self.lhsPlan: for lTuple in lhsPage: # Load the lhs once per inner loop. joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) # Match against RHS tuples using the index. joinKey = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) matches = self.storage.fileMgr.lookupByIndex(self.rhsPlan.relationId(), self.indexId, joinKey) for rhsTupId in matches: rhsPage = bufPool.getPage(rhsTupId.pageId) rTuple = rhsPage.getTuple(rhsTupId) # Load the RHS tuple fields. joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) # Evaluate any remaining join predicate, and output if we have a match. fullMatch = eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True if fullMatch: outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Return an iterator to the output relation return self.storage.pages(self.relationId()) else: raise ValueError("No index found while using an indexed nested loops join") ################################## # # Hash join implementation. # def hashJoin(self): # Partition the LHS and RHS inputs, creating a temporary file for each partition. # We assume one-level of partitioning is sufficient and skip recurring. for (lPageId, lPage) in self.lhsPlan: for lTuple in lPage: lPartEnv = self.loadSchema(self.lhsSchema, lTuple) lPartKey = eval(self.lhsHashFn, globals(), lPartEnv) self.emitPartitionTuple(lPartKey, lTuple, left=True) for (rPageId, rPage) in self.rhsPlan: for rTuple in rPage: rPartEnv = self.loadSchema(self.rhsSchema, rTuple) rPartKey = eval(self.rhsHashFn, globals(), rPartEnv) self.emitPartitionTuple(rPartKey, rTuple, left=False) # Iterate over partition pairs and output matches # evaluating the join expression as necessary. 
for ((lPageId, lPage), (rPageId, rPage)) in self.partitionPairs(): for lTuple in lPage: joinExprEnv = self.loadSchema(self.lhsSchema, lTuple) for rTuple in rPage: joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple)) output = \ ( self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema) \ == self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema) ) \ and ( eval(self.joinExpr, globals(), joinExprEnv) if self.joinExpr else True ) if output: outputTuple = self.joinSchema.instantiate(*[joinExprEnv[f] for f in self.joinSchema.fields]) self.emitOutputTuple(self.joinSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Clean up partitions. self.removePartitionFiles() # Return an iterator to the output relation return self.storage.pages(self.relationId()) # Hash join helpers. def partitionRelationId(self, left, partitionId): return self.operatorType() + str(self.id()) + "_" \ + ("l" if left else "r") + "part_" + str(partitionId) def emitPartitionTuple(self, partitionId, partitionTuple, left=False): partRelId = self.partitionRelationId(left, partitionId) partSchema = self.lhsSchema if left else self.rhsSchema # Create a partition file as needed. if not self.storage.hasRelation(partRelId): self.storage.createRelation(partRelId, partSchema) self.partitionFiles[int(left)][partitionId] = partRelId partFile = self.storage.fileMgr.relationFile(partRelId)[1] if partFile: partFile.insertTuple(partitionTuple) # Return pairs of pages from matching partitions. def partitionPairs(self): lKeys = self.partitionFiles[0].keys() rKeys = self.partitionFiles[1].keys() matches = [(self.partitionFiles[0][partId], self.partitionFiles[1][partId]) \ for partId in lKeys if partId in rKeys] return PartitionIterator(matches, self.storage) # Delete all existing partition files. def removePartitionFiles(self): for lPartRelId in self.partitionFiles[0].values(): self.storage.removeRelation(lPartRelId) for rPartRelId in self.partitionFiles[1].values(): self.storage.removeRelation(rPartRelId) self.partitionFiles = {0:{}, 1:{}} # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "indexKeySchema=" + self.lhsKeySchema.toString() ] ))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join(filter(lambda x: x is not None, ( [ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString() , "rhsKeySchema=" + self.rhsKeySchema.toString() , "lhsHashFn='" + self.lhsHashFn + "'" , "rhsHashFn='" + self.rhsHashFn + "'" ] ))) + ")" return super().explain() + exprs
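# Usage sketch for the hash join above (mirrors the testHashJoin pattern;
# assumes an 'employee' relation as in the test setUp):
hashJoinPlan = db.query().fromTable('employee').join(
    db.query().fromTable('employee'),
    rhsSchema=db.relationSchema('employee').rename(
        'employee2', {'id': 'id2', 'age': 'age2', 'dept_id': 'dept_id2'}),
    method='hash',
    lhsHashFn='hash(id) % 4',
    lhsKeySchema=DBSchema('employeeKey', [('id', 'int')]),
    rhsHashFn='hash(id2) % 4',
    rhsKeySchema=DBSchema('employeeKey2', [('id2', 'int')])).finalize()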
class Join(Operator): def __init__(self, lhsPlan, rhsPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined join operator not supported") self.lhsPlan = lhsPlan self.rhsPlan = rhsPlan self.joinExpr = kwargs.get("expr", None) self.joinMethod = kwargs.get("method", None) self.lhsSchema = kwargs.get( "lhsSchema", None if lhsPlan is None else lhsPlan.schema()) self.rhsSchema = kwargs.get( "rhsSchema", None if rhsPlan is None else rhsPlan.schema()) self.lhsKeySchema = kwargs.get("lhsKeySchema", None) self.rhsKeySchema = kwargs.get("rhsKeySchema", None) self.lhsHashFn = kwargs.get("lhsHashFn", None) self.rhsHashFn = kwargs.get("rhsHashFn", None) self.validateJoin() self.initializeSchema() self.initializeMethod(**kwargs) self.pidsInBlock = list() self.tempFileHashR = dict() self.outputPageHashR = dict() self.tempFileHashL = dict() self.outputPageHashL = dict() self.tempFile = None # Checks the join parameters. def validateJoin(self): # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash" if self.joinMethod not in [ "nested-loops", "block-nested-loops", "indexed", "hash" ]: raise ValueError("Invalid join method in join operator") # Check all fields are valid. if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": methodParams = [self.joinExpr] elif self.joinMethod == "indexed": methodParams = [self.lhsKeySchema] elif self.joinMethod == "hash": methodParams = [self.lhsHashFn, self.lhsKeySchema, \ self.rhsHashFn, self.rhsKeySchema] requireAllValid = [self.lhsPlan, self.rhsPlan, \ self.joinMethod, \ self.lhsSchema, self.rhsSchema ] \ + methodParams if any(map(lambda x: x is None, requireAllValid)): raise ValueError( "Incomplete join specification, missing join operator parameter" ) # For now, we assume that the LHS and RHS schema have # disjoint attribute names, enforcing this here. for lhsAttr in self.lhsSchema.fields: if lhsAttr in self.rhsSchema.fields: raise ValueError( "Invalid join inputs, overlapping schema detected") # Initializes the output schema for this join. # This is a concatenation of all fields in the lhs and rhs schema. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.lhsSchema.schema() + self.rhsSchema.schema() self.joinSchema = DBSchema(schema, fields) # Initializes any additional operator parameters based on the join method. def initializeMethod(self, **kwargs): if self.joinMethod == "indexed": self.indexId = kwargs.get("indexId", None) if self.indexId is None or self.lhsKeySchema is None: raise ValueError("Invalid index for use in join operator") # Returns the output schema of this operator def schema(self): return self.joinSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.lhsSchema, self.rhsSchema] # Returns a string describing the operator type def operatorType(self): readableJoinTypes = { 'nested-loops': 'NL', 'block-nested-loops': 'BNL', 'indexed': 'Index', 'hash': 'Hash' } return readableJoinTypes[self.joinMethod] + "Join" # Returns child operators if present def inputs(self): return [self.lhsPlan, self.rhsPlan] # Iterator abstraction for join operator. 
def __iter__(self):
    self.initializeOutput()
    self.inputIteratorL = iter(self.lhsPlan)
    self.inputFinished = False
    if not self.pipelined:
        if self.joinMethod == 'hash':
            self.outputIterator = self.hashJoin()
        elif self.joinMethod == 'block-nested-loops':
            self.outputIterator = self.blockNestedLoops()
        elif self.joinMethod == 'nested-loops':
            self.outputIterator = self.nestedLoops()
    return self

def __next__(self):
    self.inputIteratorR = iter(self.rhsPlan)
    if self.pipelined:
        while not (self.inputFinished or self.isOutputPageReady()):
            try:
                lPageId, lhsPage = next(self.inputIteratorL)
                for lTuple in lhsPage:
                    self.compare(lTuple)
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]
            except StopIteration:
                self.inputFinished = True
        return self.outputPage()
    else:
        return next(self.outputIterator)

# Page-at-a-time operator processing
def processInputPage(self, pageId, page):
    raise ValueError("Page-at-a-time processing not supported for joins")

# Set-at-a-time operator processing
def processAllPages(self):
    if self.joinMethod == "nested-loops":
        return self.nestedLoops()
    elif self.joinMethod == "block-nested-loops":
        return self.blockNestedLoops()
    elif self.joinMethod == "indexed":
        return self.indexedNestedLoops()
    elif self.joinMethod == "hash":
        return self.hashJoin()
    else:
        raise ValueError("Invalid join method in join operator")

##################################
#
# Nested loops implementation
#
def nestedLoops(self):
    for (lPageId, lhsPage) in iter(self.lhsPlan):
        for lTuple in lhsPage:
            # Load the lhs once per inner loop.
            joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
            for (rPageId, rhsPage) in iter(self.rhsPlan):
                for rTuple in rhsPage:
                    # Load the RHS tuple fields.
                    joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
                    # Evaluate the join predicate, and output if we have a match.
                    if eval(self.joinExpr, globals(), joinExprEnv):
                        outputTuple = self.joinSchema.instantiate(
                            *[joinExprEnv[f] for f in self.joinSchema.fields])
                        self.emitOutputTuple(self.joinSchema.pack(outputTuple))
            # No need to track anything but the last output page when in batch mode.
            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]
    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())

# Joins a single LHS tuple against the full RHS; used by the pipelined path.
def compare(self, lTuple):
    # Load the lhs once per inner loop.
    joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
    for (rPageId, rhsPage) in iter(self.rhsPlan):
        for rTuple in rhsPage:
            # Load the RHS tuple fields.
            joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
            # Evaluate the join predicate, and output if we have a match.
            if eval(self.joinExpr, globals(), joinExprEnv):
                outputTuple = self.joinSchema.instantiate(
                    *[joinExprEnv[f] for f in self.joinSchema.fields])
                self.emitOutputTuple(self.joinSchema.pack(outputTuple))

##################################
#
# Block nested loops implementation
#
# This attempts to use all the free pages in the buffer pool
# for its block of the outer relation.

# Accesses a block of pages from an iterator.
# This method pins pages in the buffer pool during its access.
# We track the page ids in the block to unpin them after processing the block.
def accessPageBlock(self, bufPool, pageIterator):
    # Release the previous block before pinning the next one.
    for pid in self.pidsInBlock:
        bufPool.unpinPage(pid)
    self.pidsInBlock = list()
    M = bufPool.freeSpace()
    for i in range(0, M - 2):
        try:
            (pid, page) = next(pageIterator)
        except StopIteration:
            break
        self.pidsInBlock.append(pid)
        # getPage with pinned=True already pins the page once.
        bufPool.getPage(pid, pinned=True)

def blockNestedLoops(self):
    riter = iter(self.rhsPlan)
    buf = self.storage.bufferPool
    # Python iterators have no hasNext(); pin a block first and loop while
    # the current block is non-empty.
    self.accessPageBlock(buf, riter)
    while self.pidsInBlock:
        for (lPageId, lhsPage) in iter(self.lhsPlan):
            for lTuple in lhsPage:
                # Load the lhs once per inner loop.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
                for pid in self.pidsInBlock:
                    rhsPage = buf.getPage(pid, pinned=True)
                    for rTuple in rhsPage:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
                        # Evaluate the join predicate, and output if we have a match.
                        if eval(self.joinExpr, globals(), joinExprEnv):
                            outputTuple = self.joinSchema.instantiate(
                                *[joinExprEnv[f] for f in self.joinSchema.fields])
                            self.emitOutputTuple(self.joinSchema.pack(outputTuple))
                # No need to track anything but the last output page when in batch mode.
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]
        self.accessPageBlock(buf, riter)
    # Return an iterator to the output relation
    return self.storage.pages(self.relationId())

##################################
#
# Indexed nested loops implementation
#
# TODO: test
def indexedNestedLoops(self):
    raise NotImplementedError

##################################
#
# Hash join implementation.
#
def hashJoin(self):
    # Partition both inputs on their respective hash functions.
    for (rPageId, rhsPage) in iter(self.rhsPlan):
        for tuple in rhsPage:
            val = self.loadSchema(self.rhsSchema, tuple)
            hash = eval(self.rhsHashFn, globals(), val)
            self.emitOutputTupleHash(tuple, hash, False)
    for (lPageId, lhsPage) in iter(self.lhsPlan):
        for tuple in lhsPage:
            val = self.loadSchema(self.lhsSchema, tuple)
            hash = eval(self.lhsHashFn, globals(), val)
            self.emitOutputTupleHash(tuple, hash, True)

    # Build the key-equality predicate over all key fields, joined by 'and'.
    evalStr = ' and '.join(
        '%s == %s' % (lt[0], rt[0])
        for (lt, rt) in zip(self.lhsKeySchema.schema(), self.rhsKeySchema.schema()))
    if self.joinExpr is not None:
        evalStr += ' and ' + self.joinExpr

    # Only partitions with matching hash values can contain matches.
    for lk in self.outputPageHashL.keys():
        for rk in self.outputPageHashR.keys():
            if lk != rk:
                continue
            riter = iter(self.outputPageHashR[rk])
            buf = self.storage.bufferPool
            M = buf.freeSpace() - 2
            size = len(self.outputPageHashR[rk])
            while size > 0:
                self.accessPageBlock(buf, riter)
                size -= M
                for (lPageId, lhsPage) in iter(self.outputPageHashL[lk]):
                    for lTuple in lhsPage:
                        # Load the lhs once per inner loop.
                        joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)
                        for pid in self.pidsInBlock:
                            rhsPage = buf.getPage(pid, pinned=True)
                            for rTuple in rhsPage:
                                # Load the RHS tuple fields.
                                joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))
                                # Evaluate the join predicate, and output if we have a match.
                                if eval(evalStr, globals(), joinExprEnv):
                                    outputTuple = self.joinSchema.instantiate(
                                        *[joinExprEnv[f] for f in self.joinSchema.fields])
                                    self.emitOutputTuple(self.joinSchema.pack(outputTuple))
                        # No need to track anything but the last output page when in batch mode.
if self.outputPages: self.outputPages = [self.outputPages[-1]] self.accessPageBlock(buf, riter) # Return an iterator to the output relation return self.storage.pages(self.relationId()) def getRelId(self, hashVal, isLeft): tempstr = 'temp' if isLeft: tempstr = 'templ' return self.relationId() + tempstr + str(hashVal) def initializeOutputHash(self, hashVal, isLeft): relId = self.getRelId(hashVal, isLeft) if self.storage.hasRelation(relId): self.storage.removeRelation(relId) if isLeft: self.storage.createRelation(relId, self.lhsSchema) self.tempFileHashL[hashVal] = self.storage.fileMgr.relationFile( relId)[1] self.outputPageHashL[hashVal] = [] else: self.storage.createRelation(relId, self.rhsSchema) self.tempFileHashR[hashVal] = self.storage.fileMgr.relationFile( relId)[1] self.outputPageHashR[hashVal] = [] def emitOutputTupleHash(self, tupleData, hashVal, isLeft): if isLeft: if hashVal not in self.tempFileHashL.keys(): self.initializeOutputHash(hashVal, isLeft) else: if hashVal not in self.tempFileHashR.keys(): self.initializeOutputHash(hashVal, isLeft) self.currFile = self.tempFileHashR[hashVal] self.currOutputPages = self.outputPageHashR[hashVal] if isLeft: self.currFile = self.tempFileHashL[hashVal] self.currOutputPages = self.outputPageHashL[hashVal] allocatePage = not ( self.currOutputPages and (self.currOutputPages)[-1][1].header.hasFreeTuple()) if allocatePage: # Flush the most recently updated output page, which updates the storage file's # free page list to ensure correct new page allocation. if self.currOutputPages: self.storage.bufferPool.flushPage( (self.currOutputPages)[-1][0]) outputPageId = self.currFile.availablePage() outputPage = self.storage.bufferPool.getPage(outputPageId) self.currOutputPages.append((outputPageId, outputPage)) else: outputPage = (self.currOutputPages)[-1][1] outputPage.insertTuple(tupleData) if self.sampled: self.estimatedCardinality += 1 else: self.actualCardinality += 1 def printerr(self, string): f = open('err.txt', 'a') f.write(str(string) + '\n') f.close() # Plan and statistics information # Returns a single line description of the operator. def explain(self): if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops": exprs = "(expr='" + str(self.joinExpr) + "')" elif self.joinMethod == "indexed": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")" elif self.joinMethod == "hash": exprs = "(" + ','.join( filter(lambda x: x is not None, ([ "expr='" + str(self.joinExpr) + "'" if self.joinExpr else None ] + [ "lhsKeySchema=" + self.lhsKeySchema.toString(), "rhsKeySchema=" + self.rhsKeySchema.toString(), "lhsHashFn='" + self.lhsHashFn + "'", "rhsHashFn='" + self.rhsHashFn + "'" ]))) + ")" return super().explain() + exprs
from Storage.BufferPool import BufferPool from Catalog.Schema import DBSchema from Storage.StorageEngine import StorageEngine schemas = [ DBSchema('part', [('p_partkey', 'int'), ('p_name', 'char(55)'), ('p_mfgr', 'char(25)'), ('p_brand', 'char(10)'), ('p_type', 'char(25)'), ('p_size', 'int'), ('p_container', 'char(10)'), ('p_retailprice', 'float'), ('p_comment', 'char(23)')]) , DBSchema('supplier', [('s_suppkey', 'int'), ('s_name', 'char(25)'), ('s_address', 'char(40)'), ('s_nationkey', 'int'), ('s_phone', 'char(15)'), ('s_acctbal', 'float'), ('s_comment', 'char(101)')]) , DBSchema('partsupp', [('ps_partkey', 'int'), ('ps_suppkey', 'int'), ('ps_availqty', 'int'), ('ps_supplycost', 'float'), ('ps_comment', 'char(199)')])
class GroupBy(Operator): def __init__(self, subPlan, **kwargs): super().__init__(**kwargs) if self.pipelined: raise ValueError("Pipelined group-by-aggregate operator not supported") self.subPlan = subPlan self.subSchema = subPlan.schema() self.groupSchema = kwargs.get("groupSchema", None) self.aggSchema = kwargs.get("aggSchema", None) self.groupExpr = kwargs.get("groupExpr", None) self.aggExprs = kwargs.get("aggExprs", None) self.groupHashFn = kwargs.get("groupHashFn", None) self.validateGroupBy() self.initializeSchema() def localCost(self, estimated): tupleSize = self.subPlan.schema().size numTuples = self.subPlan.cardinality(estimated) pageSize = self.storage.bufferPool.pageSize numPages = (tupleSize * numTuples) // pageSize return 2 * numTuples * self.tupleCost #return 2 * numPages #derived from: http://www4.comp.polyu.edu.hk/~csmlyiu/conf/CIKM09_skygroup.pdf with the assumption that G=1 and therefore the log value will be close to 1 # Perform some basic checking on the group-by operator's parameters. def validateGroupBy(self): requireAllValid = [self.subPlan, \ self.groupSchema, self.aggSchema, \ self.groupExpr, self.aggExprs, self.groupHashFn ] if any(map(lambda x: x is None, requireAllValid)): raise ValueError("Incomplete group-by specification, missing a required parameter") if not self.aggExprs: raise ValueError("Group-by needs at least one aggregate expression") if len(self.aggExprs) != len(self.aggSchema.fields): raise ValueError("Invalid aggregate fields: schema mismatch") # Initializes the group-by's schema as a concatenation of the group-by # fields and all aggregate fields. def initializeSchema(self): schema = self.operatorType() + str(self.id()) fields = self.groupSchema.schema() + self.aggSchema.schema() self.outputSchema = DBSchema(schema, fields) # Returns the output schema of this operator def schema(self): return self.outputSchema # Returns any input schemas for the operator if present def inputSchemas(self): return [self.subPlan.schema()] # Returns a string describing the operator type def operatorType(self): return "GroupBy" # Returns child operators if present def inputs(self): return [self.subPlan] # Iterator abstraction for selection operator. def __iter__(self): self.initializeOutput() self.partitionFiles = {} self.outputIterator = self.processAllPages() return self def __next__(self): return next(self.outputIterator) # Page-at-a-time operator processing def processInputPage(self, pageId, page): raise ValueError("Page-at-a-time processing not supported for joins") # Processing helpers def ensureTuple(self, x): if not isinstance(x, tuple): return (x,) else: return x def initialExprs(self): return [i[0] for i in self.aggExprs] def incrExprs(self): return [i[1] for i in self.aggExprs] def finalizeExprs(self): return [i[2] for i in self.aggExprs] # Set-at-a-time operator processing def processAllPages(self): # Create partitions of the input records by hashing the group-by values for (pageId, page) in self.subPlan: for tup in page: groupVal = self.ensureTuple(self.groupExpr(self.subSchema.unpack(tup))) groupId = self.groupHashFn(groupVal) self.emitPartitionTuple(groupId, tup) # We assume that the partitions fit in main memory. for partRelId in self.partitionFiles.values(): partFile = self.storage.fileMgr.relationFile(partRelId)[1] # Use an in-memory Python dict to accumulate the aggregates. aggregates = {} for (pageId, page) in partFile.pages(): for tup in page: # Evaluate group-by value. 
namedTup = self.subSchema.unpack(tup) groupVal = self.ensureTuple(self.groupExpr(namedTup)) # Look up the aggregate for the group. if groupVal not in aggregates: aggregates[groupVal] = self.initialExprs() # Increment the aggregate. aggregates[groupVal] = \ list(map( \ lambda x: x[0](x[1], namedTup), \ zip(self.incrExprs(), aggregates[groupVal]))) # Finalize the aggregate value for each group. for (groupVal, aggVals) in aggregates.items(): finalVals = list(map(lambda x: x[0](x[1]), zip(self.finalizeExprs(), aggVals))) outputTuple = self.outputSchema.instantiate(*(list(groupVal) + finalVals)) self.emitOutputTuple(self.outputSchema.pack(outputTuple)) # No need to track anything but the last output page when in batch mode. if self.outputPages: self.outputPages = [self.outputPages[-1]] # Clean up partitions. self.removePartitionFiles() # Return an iterator for the output file. return self.storage.pages(self.relationId()) # Bucket construction helpers. def partitionRelationId(self, partitionId): return self.operatorType() + str(self.id()) + "_" \ + "part_" + str(partitionId) def emitPartitionTuple(self, partitionId, partitionTuple): partRelId = self.partitionRelationId(partitionId) # Create a partition file as needed. if not self.storage.hasRelation(partRelId): self.storage.createRelation(partRelId, self.subSchema) self.partitionFiles[partitionId] = partRelId partFile = self.storage.fileMgr.relationFile(partRelId)[1] if partFile: partFile.insertTuple(partitionTuple) # Delete all existing partition files. def removePartitionFiles(self): for partRelId in self.partitionFiles.values(): self.storage.removeRelation(partRelId) self.partitionFiles = {} # Plan and statistics information # Returns a single line description of the operator. def explain(self): return super().explain() + "(groupSchema=" + self.groupSchema.toString() \ + ", aggSchema=" + self.aggSchema.toString() + ")"
def initializeSchemas(self): tpchNamesAndFields = [ ('part', [('P_PARTKEY', 'int'), ('P_NAME', 'char(55)'), ('P_MFGR', 'char(25)'), ('P_BRAND', 'char(10)'), ('P_TYPE', 'char(25)'), ('P_SIZE', 'int'), ('P_CONTAINER', 'char(10)'), ('P_RETAILPRICE', 'double'), ('P_COMMENT', 'char(23)')], "issssisds"), ('supplier', [('S_SUPPKEY', 'int'), ('S_NAME', 'char(25)'), ('S_ADDRESS', 'char(40)'), ('S_NATIONKEY', 'int'), ('S_PHONE', 'char(15)'), ('S_ACCTBAL', 'double'), ('S_COMMENT', 'char(101)')], "issisds"), ('partsupp', [('PS_PARTKEY', 'int'), ('PS_SUPPKEY', 'int'), ('PS_AVAILQTY', 'int'), ('PS_SUPPLYCOST', 'double'), ('PS_COMMENT', 'char(199)')], "iiids"), ('customer', [('C_CUSTKEY', 'int'), ('C_NAME', 'char(25)'), ('C_ADDRESS', 'char(40)'), ('C_NATIONKEY', 'int'), ('C_PHONE', 'char(15)'), ('C_ACCTBAL', 'double'), ('C_MKTSEGMENT', 'char(10)'), ('C_COMMENT', 'char(117)')], "issisdss"), ( 'orders', [ ('O_ORDERKEY', 'int'), ('O_CUSTKEY', 'int'), ('O_ORDERSTATUS', 'char(1)'), ('O_TOTALPRICE', 'double'), ('O_ORDERDATE', 'int'), # date ('O_ORDERPRIORITY', 'char(15)'), ('O_CLERK', 'char(15)'), ('O_SHIPPRIORITY', 'int'), ('O_COMMENT', 'char(79)') ], "iisdtssis"), ( 'lineitem', [ ('L_ORDERKEY', 'int'), ('L_PARTKEY', 'int'), ('L_SUPPKEY', 'int'), ('L_LINENUMBER', 'int'), ('L_QUANTITY', 'double'), ('L_EXTENDEDPRICE', 'double'), ('L_DISCOUNT', 'double'), ('L_TAX', 'double'), ('L_RETURNFLAG', 'char(1)'), ('L_LINESTATUS', 'char(1)'), ('L_SHIPDATE', 'int'), # date ('L_COMMITDATE', 'int'), # date ('L_RECEIPTDATE', 'int'), # date ('L_SHIPINSTRUCT', 'char(25)'), ('L_SHIPMODE', 'char(10)'), ('L_COMMENT', 'char(44)') ], "iiiiddddsstttsss"), ('nation', [('N_NATIONKEY', 'int'), ('N_NAME', 'char(25)'), ('N_REGIONKEY', 'int'), ('N_COMMENT', 'char(152)')], "isis"), ('region', [('R_REGIONKEY', 'int'), ('R_NAME', 'char(25)'), ('R_COMMENT', 'char(152)')], "iss") ] self.schemas = dict( map(lambda x: (x[0], DBSchema(x[0], x[1])), tpchNamesAndFields)) self.parsers = dict( map(lambda x: (x[0], self.buildParser(x[2])), tpchNamesAndFields))
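# Dates in these schemas are stored as ints in YYYYMMDD form (see the 'date'
# comments above), which is why the queries compare e.g.
# L_SHIPDATE >= 19940101. A hypothetical helper for building such literals:
def dateToInt(year, month, day):
    return year * 10000 + month * 100 + day

assert dateToInt(1994, 1, 1) == 19940101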
class Join(Operator):
    def __init__(self, lhsPlan, rhsPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined join operator not supported")

        self.lhsPlan      = lhsPlan
        self.rhsPlan      = rhsPlan
        self.joinExpr     = kwargs.get("expr", None)
        self.joinMethod   = kwargs.get("method", None)
        self.lhsSchema    = kwargs.get("lhsSchema", None if lhsPlan is None else lhsPlan.schema())
        self.rhsSchema    = kwargs.get("rhsSchema", None if rhsPlan is None else rhsPlan.schema())
        self.lhsKeySchema = kwargs.get("lhsKeySchema", None)
        self.rhsKeySchema = kwargs.get("rhsKeySchema", None)
        self.lhsHashFn    = kwargs.get("lhsHashFn", None)
        self.rhsHashFn    = kwargs.get("rhsHashFn", None)

        self.validateJoin()
        self.initializeSchema()
        self.initializeMethod(**kwargs)

    # Checks the join parameters.
    def validateJoin(self):
        # Valid join methods: "nested-loops", "block-nested-loops", "indexed", "hash"
        if self.joinMethod not in ["nested-loops", "block-nested-loops", "indexed", "hash"]:
            raise ValueError("Invalid join method in join operator")

        # Check that all method-specific fields are present.
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            methodParams = [self.joinExpr]
        elif self.joinMethod == "indexed":
            methodParams = [self.lhsKeySchema]
        elif self.joinMethod == "hash":
            methodParams = [self.lhsHashFn, self.lhsKeySchema,
                            self.rhsHashFn, self.rhsKeySchema]

        requireAllValid = [self.lhsPlan, self.rhsPlan,
                           self.joinMethod,
                           self.lhsSchema, self.rhsSchema] + methodParams

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError("Incomplete join specification, missing join operator parameter")

        # For now, we assume that the LHS and RHS schemas have
        # disjoint attribute names, and enforce this here.
        for lhsAttr in self.lhsSchema.fields:
            if lhsAttr in self.rhsSchema.fields:
                raise ValueError("Invalid join inputs, overlapping schema detected")

    # Initializes the output schema for this join.
    # This is a concatenation of all fields in the lhs and rhs schemas.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.lhsSchema.schema() + self.rhsSchema.schema()
        self.joinSchema = DBSchema(schema, fields)

    # Initializes any additional operator parameters based on the join method.
    def initializeMethod(self, **kwargs):
        if self.joinMethod == "indexed":
            self.indexId = kwargs.get("indexId", None)
            if self.indexId is None or self.lhsKeySchema is None:
                raise ValueError("Invalid index for use in join operator")

    # Returns the output schema of this operator.
    def schema(self):
        return self.joinSchema

    # Returns any input schemas for the operator if present.
    def inputSchemas(self):
        return [self.lhsSchema, self.rhsSchema]

    # Returns a string describing the operator type.
    def operatorType(self):
        readableJoinTypes = {'nested-loops':       'NL',
                             'block-nested-loops': 'BNL',
                             'indexed':            'Index',
                             'hash':               'Hash'}
        return readableJoinTypes[self.joinMethod] + "Join"

    # Returns child operators if present.
    def inputs(self):
        return [self.lhsPlan, self.rhsPlan]

    # Iterator abstraction for the join operator.
    def __iter__(self):
        self.initializeOutput()
        # Pipelined execution is not supported (see constructor), so we
        # materialize the result and hand back an iterator over it.
        self.outputIterator = self.processAllPages()
        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for joins")

    # Set-at-a-time operator processing
    def processAllPages(self):
        if self.joinMethod == "nested-loops":
            return self.nestedLoops()
        elif self.joinMethod == "block-nested-loops":
            return self.blockNestedLoops()
        elif self.joinMethod == "indexed":
            return self.indexedNestedLoops()
        elif self.joinMethod == "hash":
            return self.hashJoin()
        else:
            raise ValueError("Invalid join method in join operator")

    # Returns an iterator over the output relation.
    def outputRelationIterator(self):
        return self.storage.pages(self.relationId())

    ##################################
    #
    # Nested loops implementation
    #
    def nestedLoops(self):
        self.runNestedLoops(iter(self.lhsPlan), iter(self.rhsPlan), False, False, False)
        # Return an iterator over the output relation.
        return self.outputRelationIterator()

    # Common worker used by all join variants.
    def runNestedLoops(self, lhsPageIter, rhsPageIter, isBlock, isIndex, isHash):
        for (lPageId, lhsPage) in lhsPageIter:
            for lTuple in lhsPage:
                # Load the LHS tuple fields once, before scanning the inner relation.
                joinExprEnv = self.loadSchema(self.lhsSchema, lTuple)

                if isIndex:
                    keyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
                    idxManager = self.storage.fileMgr.indexManager
                    rhsPageIter = idxManager.lookupByIndex(self.indexId, keyData)

                for rhsItem in rhsPageIter:
                    if isIndex:
                        # Retrieve the index-matched tuple from its page.
                        page = self.storage.bufferPool.getPage(rhsItem.pageId)  # rhsItem = rhsTupId
                        rhsTupleIter = [page.getTuple(rhsItem)]
                    else:
                        # Need to scan all tuples on the page.
                        rhsTupleIter = rhsItem[1]  # rhsItem = (rPageId, rhsPage)

                    for rTuple in rhsTupleIter:
                        # Load the RHS tuple fields.
                        joinExprEnv.update(self.loadSchema(self.rhsSchema, rTuple))

                        # Evaluate the join predicate, and output if we have a match.
                        if isIndex:
                            validJoin = True
                        elif isHash:
                            lhsKeyData = self.lhsSchema.projectBinary(lTuple, self.lhsKeySchema)
                            rhsKeyData = self.rhsSchema.projectBinary(rTuple, self.rhsKeySchema)
                            validJoin = lhsKeyData == rhsKeyData
                        else:
                            validJoin = True

                        if self.joinExpr:
                            validJoin = validJoin and eval(self.joinExpr, globals(), joinExprEnv)

                        if validJoin:
                            outputTuple = self.joinSchema.instantiate(
                                *[joinExprEnv[f] for f in self.joinSchema.fields])
                            self.emitOutputTuple(self.joinSchema.pack(outputTuple))

                # No need to track anything but the last output page when in batch mode.
                if self.outputPages:
                    self.outputPages = [self.outputPages[-1]]

            if isBlock:
                self.storage.bufferPool.unpinPage(lPageId)

    ##################################
    #
    # Block nested loops implementation
    #
    # This attempts to use all the free pages in the buffer pool
    # for its block of the outer relation.

    # Accesses a block of pages from an iterator.
    # This method pins pages in the buffer pool during its access.
    # We track the page ids in the block to unpin them after processing the block.
    def accessPageBlock(self, bufPool, pageIterator):
        pinnedPages = list()
        try:
            while bufPool.numFreePages() > 0:
                (lPageId, lhsPage) = next(pageIterator)
                bufPool.pinPage(lPageId)
                pinnedPages.append((lPageId, lhsPage))
        except StopIteration:
            pass
        return pinnedPages

    def pinPages(self, pageIterator):
        return self.accessPageBlock(self.storage.bufferPool, pageIterator)

    def blockNestedLoops(self):
        self.runBlockNestedLoops(iter(self.lhsPlan), self.rhsPlan, False)
        return self.outputRelationIterator()

    def runBlockNestedLoops(self, lhsPageIter, rhsPageIter, isHashJoin):
        pinnedPages = self.pinPages(lhsPageIter)
        # Keep running until ALL lhs pages have been processed.
        # Note: 'rhsPageIter' must be rescannable (e.g. a list), NOT a one-shot iterator.
        while len(pinnedPages) > 0:
            self.runNestedLoops(iter(pinnedPages), rhsPageIter, True, False, isHashJoin)
            pinnedPages = self.pinPages(lhsPageIter)

    ##################################
    #
    # Indexed nested loops implementation
    #
    def indexedNestedLoops(self):
        self.runNestedLoops(iter(self.lhsPlan), None, False, True, False)
        return self.outputRelationIterator()

    ##################################
    #
    # Hash join implementation.
    #
    def hashJoin(self):
        lhsRelIdMap = {}
        rhsRelIdMap = {}

        # Partition each relation using its hash function.
        self.partition(self.lhsPlan, self.lhsHashFn, self.lhsSchema, lhsRelIdMap, "lhs")
        self.partition(self.rhsPlan, self.rhsHashFn, self.rhsSchema, rhsRelIdMap, "rhs")

        # Perform a block nested loop join for each pair of matching buckets.
        for hashValue, relId in lhsRelIdMap.items():
            if hashValue not in rhsRelIdMap:
                continue  # No matching rhs bucket, so nothing in this lhs bucket can join.
            lhsPageIter = self.storage.pages(relId)
            rhsPageIter = self.storage.pages(rhsRelIdMap[hashValue])
            self.runBlockNestedLoops(lhsPageIter, list(rhsPageIter), True)

        # Remove the partition files.
        partitionIter = itertools.chain(lhsRelIdMap.items(), rhsRelIdMap.items())
        for _, relId in partitionIter:
            self.storage.removeRelation(relId)

        return self.outputRelationIterator()

    # Partitions a given relation based on some hash function.
    def partition(self, plan, hashFn, schema, relIdMap, relPrefix):
        for (pageId, page) in iter(plan):
            for tup in page:
                # Compute the hash value for every tuple.
                fieldBindings = self.loadSchema(schema, tup)
                hashValue = eval(hashFn, globals(), fieldBindings)

                # Store in temporary buckets (files).
                if hashValue not in relIdMap:
                    relId = str(self.id()) + "_" + relPrefix + "_" + str(hashValue)
                    self.storage.createRelation(relId, schema)
                    relIdMap[hashValue] = relId

                self.storage.insertTuple(relIdMap[hashValue], tup)

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        if self.joinMethod == "nested-loops" or self.joinMethod == "block-nested-loops":
            exprs = "(expr='" + str(self.joinExpr) + "')"
        elif self.joinMethod == "indexed":
            exprs = "(" + ','.join(filter(lambda x: x is not None, (
                       ["expr='" + str(self.joinExpr) + "'" if self.joinExpr else None]
                     + ["indexKeySchema=" + self.lhsKeySchema.toString()]))) + ")"
        elif self.joinMethod == "hash":
            exprs = "(" + ','.join(filter(lambda x: x is not None, (
                       ["expr='" + str(self.joinExpr) + "'" if self.joinExpr else None]
                     + ["lhsKeySchema=" + self.lhsKeySchema.toString(),
                        "rhsKeySchema=" + self.rhsKeySchema.toString(),
                        "lhsHashFn='" + self.lhsHashFn + "'",
                        "rhsHashFn='" + self.rhsHashFn + "'"]))) + ")"
        return super().explain() + exprs
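# A self-contained sketch of the two-phase hash join implemented above:
# partition both inputs by a common hash of the join key, then join matching
# buckets only. Plain lists stand in for partition files, and the data is
# invented for illustration.
from collections import defaultdict

def hashJoinSketch(lhs, rhs, lhsKey, rhsKey, numPartitions=4):
    lhsParts, rhsParts = defaultdict(list), defaultdict(list)

    # Phase 1: partition each relation on its join key.
    for t in lhs:
        lhsParts[hash(t[lhsKey]) % numPartitions].append(t)
    for t in rhs:
        rhsParts[hash(t[rhsKey]) % numPartitions].append(t)

    # Phase 2: join bucket-by-bucket. Only same-bucket pairs can match, but
    # key equality must still be rechecked inside each bucket (hash collisions).
    for b, lTuples in lhsParts.items():
        for lt in lTuples:
            for rt in rhsParts.get(b, []):
                if lt[lhsKey] == rt[rhsKey]:
                    yield {**lt, **rt}

parts = [{'pk': 1, 'name': 'bolt'}, {'pk': 2, 'name': 'nut'}]
items = [{'fk': 1, 'qty': 10}, {'fk': 1, 'qty': 5}]
print(list(hashJoinSketch(parts, items, 'pk', 'fk')))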
    from customer, orders, lineitem
    where c_mktsegment = 'BUILDING'
      and c_custkey = o_custkey
      and l_orderkey = o_orderkey
      and o_orderdate < 19950315
      and l_shipdate > 19950315
    group by l_orderkey, o_orderdate, o_shippriority
    '''
    ls1 = DBSchema('customerKey1', [('C_CUSTKEY', 'int')])
    rs1 = DBSchema('customerKey2', [('O_CUSTKEY', 'int')])
    ls2 = DBSchema('orderKey1', [('O_ORDERKEY', 'int')])
    rs2 = DBSchema('orderkey2', [('L_ORDERKEY', 'int')])

    groupKeySchema = DBSchema('groupKey', [('L_ORDERKEY', 'int'),
                                           ('O_ORDERDATE', 'int'),
                                           ('O_SHIPPRIORITY', 'int')])
    groupAggSchema = DBSchema('groupAgg', [('revenue', 'float')])

    query3 = db.query().fromTable('customer').join( \
                 db.query().fromTable('orders'), method='hash', \
                 lhsHashFn='hash(C_CUSTKEY) % 5', lhsKeySchema=ls1, \
                 rhsHashFn='hash(O_CUSTKEY) % 5', rhsKeySchema=rs1).join( \
import io, math, os, os.path, pickle, struct
from struct import Struct

from Catalog.Identifiers import PageId, FileId, TupleId
from Catalog.Schema import DBSchema
from Storage.Page import PageHeader, Page
from Storage.SlottedPage import SlottedPageHeader, SlottedPage
from Storage.File import *

import shutil, Storage.BufferPool, Storage.FileManager

schema = DBSchema('employee', [('id', 'int'), ('age', 'int')])

bp = Storage.BufferPool.BufferPool()
fm = Storage.FileManager.FileManager(bufferPool=bp)
bp.setFileManager(fm)

fm.createRelation(schema.name, schema)
(fId, f) = fm.relationFile(schema.name)

# A freshly created relation file should start empty.
assert f.numPages() == 0

pId  = PageId(fId, 0)
pId1 = PageId(fId, 1)

p  = Page(pageId=pId,  buffer=bytes(f.pageSize()), schema=schema)
p1 = Page(pageId=pId1, buffer=bytes(f.pageSize()), schema=schema)
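# A hedged continuation of the scratch test above: insert a few packed tuples
# into page p and read them back. This assumes Page.insertTuple accepts packed
# tuple data and that iterating a Page yields packed tuples, matching how the
# operator code elsewhere in this document consumes pages.
for i in range(3):
    p.insertTuple(schema.pack(schema.instantiate(i, 25 + i)))

print([schema.unpack(t) for t in p])  # e.g. [employee(id=0, age=25), ...]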
    def joinsOptimizer(self, operator, aPaths):
        defaultScaleFactor = 10
        defaultPartiNumber = 5

        n = len(aPaths)
        planList = []
        costList = []

        # i = 1: seed the plan list with the access paths themselves.
        for aPath in aPaths:
            # Here we measure cost in number of pages.
            cards = Plan(root=aPath).sample(defaultScaleFactor)
            pageSize, _, _ = self.db.storage.relationStats(aPath.relationId())
            numPages = cards / (pageSize / aPath.schema().size)
            # We only consider reordering joins, so an access path's
            # accumulated join cost starts at 0.
            planList.append(aPath)
            costList.append((numPages, 0))

        # i = 2 ... n: greedily merge the cheapest joinable pair of plans.
        for i in range(1, n):
            # Enumerate all possible two-way joins over the current plan list,
            # collecting candidate pairs in potentialP and their costs in potentialC.
            m = len(planList)
            potentialP = []
            potentialC = []
            for j in range(0, m - 1):
                for k in range(j + 1, m):
                    self.pcntr += 1
                    potentialP.append((planList[j], planList[k]))
                    potentialC.append(3 * (costList[j][0] + costList[k][0])
                                      + costList[j][1] + costList[k][1])

            # Find the cheapest joinable pair (by total cost), build the join,
            # remove the two inputs from the plan list, and add the new join,
            # updating the cost list to match.
            while potentialC:
                currC = min(potentialC)
                currP = potentialP[potentialC.index(currC)]
                potentialC.remove(currC)
                potentialP.remove(currP)

                joinFields = self.joinable(operator, currP)
                if joinFields:
                    (lField, rField) = joinFields
                    lhsSchema = currP[0].schema()
                    rhsSchema = currP[1].schema()
                    lKeySchema = DBSchema('left',
                        [(f, t) for (f, t) in lhsSchema.schema() if f == lField])
                    rKeySchema = DBSchema('right',
                        [(f, t) for (f, t) in rhsSchema.schema() if f == rField])
                    lHashFn = 'hash(' + lField + ') % ' + str(defaultPartiNumber)
                    rHashFn = 'hash(' + rField + ') % ' + str(defaultPartiNumber)

                    newJoin = Join(currP[0], currP[1], method='hash',
                                   lhsHashFn=lHashFn, lhsKeySchema=lKeySchema,
                                   rhsHashFn=rHashFn, rhsKeySchema=rKeySchema)
                    newJoin.prepare(self.db)

                    totalCost = currC
                    cards = Plan(root=newJoin).sample(defaultScaleFactor)
                    pageSize, _, _ = self.db.storage.relationStats(newJoin.relationId())
                    pages = cards / (pageSize / newJoin.schema().size)

                    id1 = planList.index(currP[0])
                    planList.pop(id1)
                    costList.pop(id1)
                    id2 = planList.index(currP[1])
                    planList.pop(id2)
                    costList.pop(id2)

                    planList.append(newJoin)
                    costList.append((pages, totalCost))
                    break

        print("GreedyOptimizer plans considered: ", self.pcntr)
        return planList[0]
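# A worked example of the greedy cost formula above (all numbers invented).
# The cost of hash-joining plans j and k is 3 * (pages_j + pages_k) + cost_j
# + cost_k: roughly one read plus one write of each input while partitioning,
# plus one read while probing, plus costs already sunk into building each input.
costList = [(100, 0), (40, 0), (500, 0)]  # (numPages, accumulatedCost) per base plan

candidates = {}
for j in range(len(costList) - 1):
    for k in range(j + 1, len(costList)):
        candidates[(j, k)] = (3 * (costList[j][0] + costList[k][0])
                              + costList[j][1] + costList[k][1])

print(candidates)                           # {(0, 1): 420, (0, 2): 1800, (1, 2): 1620}
print(min(candidates, key=candidates.get))  # (0, 1): join the two smallest plans first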
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined group-by-aggregate operator not supported")

        self.subPlan     = subPlan
        self.subSchema   = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema   = kwargs.get("aggSchema", None)
        self.groupExpr   = kwargs.get("groupExpr", None)
        self.aggExprs    = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)

        self.validateGroupBy()
        self.initializeSchema()

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan,
                           self.groupSchema, self.aggSchema,
                           self.groupExpr, self.aggExprs, self.groupHashFn]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError("Incomplete group-by specification, missing a required parameter")

        if not self.aggExprs:
            raise ValueError("Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator.
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present.
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type.
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present.
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for the group-by operator.
    def __iter__(self):
        self.initializeOutput()
        # Pipelined execution is not supported (see constructor).
        self.outputIterator = self.processAllPages()
        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for group-by")

    # Set-at-a-time operator processing
    def processAllPages(self):
        partitionMap = dict()

        # Partition the input into per-hash-value files on the grouping attributes.
        self.partition(partitionMap)

        for _, relId in partitionMap.items():
            # Iterate over the pages of this partition file.
            pageIterator = self.storage.pages(relId)

            # A dictionary of intermediate aggregation results, keyed by the
            # actual group value (not its hash) so that distinct groups that
            # collide under the hash function stay separate.
            aggregator = {}

            for _, page in pageIterator:
                for tup in page:
                    tupleUnpacked = self.subSchema.unpack(tup)
                    groupKey = self.groupExpr(tupleUnpacked)
                    if not isinstance(groupKey, tuple):
                        groupKey = (groupKey,)

                    intermediateResults = aggregator.get(groupKey, None)
                    # If no intermediate results exist yet for this group,
                    # initialize them from the aggregate expressions.
                    if intermediateResults is None:
                        intermediateResults = [aggExpr[0] for aggExpr in self.aggExprs]
                        aggregator[groupKey] = intermediateResults

                    # Apply each aggregate's update function.
                    for index, aggExpr in enumerate(self.aggExprs):
                        intermediateResults[index] = \
                            aggExpr[1](intermediateResults[index], tupleUnpacked)

            # Apply each aggregate's finalization function and emit output tuples.
            for groupKey, intermediateResults in aggregator.items():
                for index, aggExpr in enumerate(self.aggExprs):
                    intermediateResults[index] = aggExpr[2](intermediateResults[index])

                outputList = itertools.chain(groupKey, intermediateResults)
                outputTuple = self.outputSchema.instantiate(*outputList)
                self.emitOutputTuple(self.outputSchema.pack(outputTuple))

            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        # Remove the temporary relations created for the partitions.
        for _, relId in partitionMap.items():
            self.storage.removeRelation(relId)

        return self.storage.pages(self.relationId())

    def partition(self, relMap):
        for (pageId, page) in iter(self.subPlan):
            for tup in page:
                tupleUnpacked = self.subSchema.unpack(tup)
                groupKey = self.groupExpr(tupleUnpacked)
                if not isinstance(groupKey, tuple):
                    groupKey = (groupKey,)
                value = self.groupHashFn(groupKey)

                # If this hash value has no partition yet, create a temporary
                # file to hold the tuples that hash to it.
                if value not in relMap:
                    relId = str(self.id()) + "_grp_" + str(value)
                    self.storage.createRelation(relId, self.subSchema)
                    relMap[value] = relId

                self.storage.insertTuple(relMap[value], tup)

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
               + ", aggSchema=" + self.aggSchema.toString() + ")"
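# A self-contained sketch of the two-phase group-by above: partition rows by a
# hash of the group key, then aggregate each partition keyed by the actual
# group value, so groups that collide under the hash remain distinct. The rows
# and the modulus are invented for illustration.
from collections import defaultdict

rows = [('a', 1), ('b', 2), ('a', 3), ('c', 4)]
groupHashFn = lambda key: hash(key) % 2

partitions = defaultdict(list)
for row in rows:
    partitions[groupHashFn((row[0],))].append(row)   # phase 1: partition

results = {}
for bucket in partitions.values():                   # phase 2: aggregate per bucket
    aggs = {}
    for key, val in bucket:
        aggs[(key,)] = aggs.get((key,), 0) + val     # keyed by group value, not hash
    results.update(aggs)

print(results)  # {('a',): 4, ('b',): 2, ('c',): 4} (order may vary)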
class GroupBy(Operator):
    def __init__(self, subPlan, **kwargs):
        super().__init__(**kwargs)

        if self.pipelined:
            raise ValueError("Pipelined group-by-aggregate operator not supported")

        self.subPlan     = subPlan
        self.subSchema   = subPlan.schema()
        self.groupSchema = kwargs.get("groupSchema", None)
        self.aggSchema   = kwargs.get("aggSchema", None)
        self.groupExpr   = kwargs.get("groupExpr", None)
        self.aggExprs    = kwargs.get("aggExprs", None)
        self.groupHashFn = kwargs.get("groupHashFn", None)

        self.validateGroupBy()
        self.initializeSchema()

    # Perform some basic checking on the group-by operator's parameters.
    def validateGroupBy(self):
        requireAllValid = [self.subPlan,
                           self.groupSchema, self.aggSchema,
                           self.groupExpr, self.aggExprs, self.groupHashFn]

        if any(map(lambda x: x is None, requireAllValid)):
            raise ValueError("Incomplete group-by specification, missing a required parameter")

        if not self.aggExprs:
            raise ValueError("Group-by needs at least one aggregate expression")

        if len(self.aggExprs) != len(self.aggSchema.fields):
            raise ValueError("Invalid aggregate fields: schema mismatch")

    # Initializes the group-by's schema as a concatenation of the group-by
    # fields and all aggregate fields.
    def initializeSchema(self):
        schema = self.operatorType() + str(self.id())
        fields = self.groupSchema.schema() + self.aggSchema.schema()
        self.outputSchema = DBSchema(schema, fields)

    # Returns the output schema of this operator.
    def schema(self):
        return self.outputSchema

    # Returns any input schemas for the operator if present.
    def inputSchemas(self):
        return [self.subPlan.schema()]

    # Returns a string describing the operator type.
    def operatorType(self):
        return "GroupBy"

    # Returns child operators if present.
    def inputs(self):
        return [self.subPlan]

    # Iterator abstraction for the group-by operator.
    def __iter__(self):
        self.initializeOutput()
        self.outputIterator = self.processAllPages()
        return self

    def __next__(self):
        return next(self.outputIterator)

    # Page-at-a-time operator processing
    def processInputPage(self, pageId, page):
        raise ValueError("Page-at-a-time processing not supported for group-by")

    # Set-at-a-time operator processing
    def processAllPages(self):
        relIdMap = {}

        # Partition the input using the group-by hash function.
        self.partition(relIdMap)

        # Perform the group-by operation partition by partition.
        for hashValue, relId in relIdMap.items():
            pageIter = self.storage.pages(relId)
            aggregationResults = {}  # Stores intermediate aggregation results per group.

            for _, page in pageIter:
                for tupleP in page:
                    tupleU = self.subSchema.unpack(tupleP)
                    gbVal = self.getGroupByValue(tupleU)

                    # Get the intermediate results for this group-by value.
                    intermediateResults = aggregationResults.get(gbVal, None)
                    if intermediateResults is None:
                        # Form a list of initial values.
                        intermediateResults = [aggExpr[0] for aggExpr in self.aggExprs]
                        aggregationResults[gbVal] = intermediateResults

                    # Perform aggregation by applying each update lambda (aggExpr[1]).
                    for idx, aggExpr in enumerate(self.aggExprs):
                        intermediateResults[idx] = aggExpr[1](intermediateResults[idx], tupleU)

            for gbVal, intermediateResults in aggregationResults.items():
                # Perform the final step by applying each finalization lambda (aggExpr[2]).
                for idx, aggExpr in enumerate(self.aggExprs):
                    intermediateResults[idx] = aggExpr[2](intermediateResults[idx])

                # Emit the group-by fields followed by the aggregate fields.
                outputList = itertools.chain(gbVal, intermediateResults)
                outputTuple = self.outputSchema.instantiate(*outputList)
                self.emitOutputTuple(self.outputSchema.pack(outputTuple))

            # No need to track anything but the last output page when in batch mode.
            if self.outputPages:
                self.outputPages = [self.outputPages[-1]]

        # Remove partitions.
        for _, relId in relIdMap.items():
            self.storage.removeRelation(relId)

        return self.storage.pages(self.relationId())

    # Partitions the input relation using the group-by hash function.
    def partition(self, relIdMap):
        for (pageId, page) in iter(self.subPlan):
            for tupleP in page:
                # Compute the hash value for every tuple's group-by key.
                tupleU = self.subSchema.unpack(tupleP)
                hashVal = self.groupHashFn(self.getGroupByValue(tupleU))

                # Store in temporary buckets (files).
                if hashVal not in relIdMap:
                    relId = str(self.id()) + "_grp_" + str(hashVal)
                    self.storage.createRelation(relId, self.subSchema)
                    relIdMap[hashVal] = relId

                self.storage.insertTuple(relIdMap[hashVal], tupleP)

    # Returns the group-by key as a tuple, so single-field and multi-field
    # group expressions share one code path and the key is dict-hashable.
    def getGroupByValue(self, unpackedTuple):
        gbVal = self.groupExpr(unpackedTuple)
        return gbVal if isinstance(gbVal, tuple) else (gbVal,)

    # Plan and statistics information

    # Returns a single line description of the operator.
    def explain(self):
        return super().explain() + "(groupSchema=" + self.groupSchema.toString() \
               + ", aggSchema=" + self.aggSchema.toString() + ")"
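# A small check of the group-key normalization above: keys are always tuples,
# so single-field and multi-field group expressions can share one code path.
normalize = lambda gbVal: gbVal if isinstance(gbVal, tuple) else (gbVal,)
print(normalize(42))         # (42,)
print(normalize((1, 2, 3)))  # (1, 2, 3)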
def tests(printOutput):
    db = setup()

    '''
    select sum(l_extendedprice * l_discount) as revenue
    from lineitem
    where l_shipdate >= 19940101 and l_shipdate < 19950101
      and l_discount between 0.06 - 0.01 and 0.06 + 0.01
      and l_quantity < 24
    '''
    query1 = db.query().fromTable('lineitem') \
        .where('L_SHIPDATE >= 19940101') \
        .where('L_SHIPDATE < 19950101') \
        .where('L_DISCOUNT >= 0.06 - 0.01') \
        .where('L_DISCOUNT <= 0.06 + 0.01') \
        .where('L_QUANTITY < 24') \
        .groupBy(groupSchema=DBSchema('REV', [('dum', 'int')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: 1),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * e.L_DISCOUNT, lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'REVENUE': ('SUM', 'double')}) \
        .finalize()

    '''
    select sum(l_extendedprice * (1 - l_discount)) as promo_revenue
    from lineitem, part
    where l_partkey = p_partkey
      and l_shipdate >= 19950901 and l_shipdate < 19951001
    '''
    query2 = db.query().fromTable('lineitem') \
        .join(db.query().fromTable('part'),
              rhsSchema=db.relationSchema('part'),
              method='block-nested-loops', expr='L_PARTKEY == P_PARTKEY') \
        .where('L_SHIPDATE >= 19950901') \
        .where('L_SHIPDATE < 19951001') \
        .groupBy(groupSchema=DBSchema('REV', [('dum', 'int')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: 1),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'PROMO_REVENUE': ('SUM', 'double')}) \
        .finalize()

    '''
    select l_orderkey, sum(l_extendedprice * (1 - l_discount)) as revenue,
           o_orderdate, o_shippriority
    from customer, orders, lineitem
    where c_mktsegment = 'BUILDING'
      and c_custkey = o_custkey and l_orderkey = o_orderkey
      and o_orderdate < 19950315 and l_shipdate > 19950315
    group by l_orderkey, o_orderdate, o_shippriority
    '''
    query3 = db.query().fromTable('customer') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops', expr='C_CUSTKEY == O_CUSTKEY') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops', expr='L_ORDERKEY == O_ORDERKEY') \
        .where('C_MKTSEGMENT == \'BUILDING\'') \
        .where('O_ORDERDATE < 19950315') \
        .where('L_SHIPDATE > 19950315') \
        .groupBy(groupSchema=DBSchema('FIRST', [('L_ORDERKEY', 'int'),
                                                ('O_ORDERDATE', 'int'),
                                                ('O_SHIPPRIORITY', 'int')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (e.L_ORDERKEY, e.O_ORDERDATE, e.O_SHIPPRIORITY)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'L_ORDERKEY': ('L_ORDERKEY', 'int'),
                 'REVENUE': ('SUM', 'double'),
                 'O_ORDERDATE': ('O_ORDERDATE', 'int'),
                 'O_SHIPPRIORITY': ('O_SHIPPRIORITY', 'int')}) \
        .finalize()

    '''
    select c_custkey, c_name, sum(l_extendedprice * (1 - l_discount)) as revenue,
           c_acctbal, n_name, c_address, c_phone, c_comment
    from customer, orders, lineitem, nation
    where c_custkey = o_custkey and l_orderkey = o_orderkey
      and o_orderdate >= 19931001 and o_orderdate < 19940101
      and l_returnflag = 'R' and c_nationkey = n_nationkey
    group by c_custkey, c_name, c_acctbal, c_phone, n_name, c_address, c_comment
    '''
    query4 = db.query().fromTable('customer') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops', expr='C_CUSTKEY == O_CUSTKEY') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops',
              expr='L_ORDERKEY == O_ORDERKEY') \
        .where('O_ORDERDATE >= 19931001') \
        .where('O_ORDERDATE < 19940101') \
        .where('L_RETURNFLAG == \'R\'') \
        .join(db.query().fromTable('nation'),
              rhsSchema=db.relationSchema('nation'),
              method='block-nested-loops', expr='C_NATIONKEY == N_NATIONKEY') \
        .groupBy(groupSchema=DBSchema('FIRST', [('C_CUSTKEY', 'int'),
                                                ('C_NAME', 'char(25)'),
                                                ('C_ACCTBAL', 'double'),
                                                ('C_PHONE', 'char(15)'),
                                                ('N_NAME', 'char(25)'),
                                                ('C_ADDRESS', 'char(40)'),
                                                ('C_COMMENT', 'char(117)')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: (e.C_CUSTKEY, e.C_NAME, e.C_ACCTBAL, e.C_PHONE,
                                       e.N_NAME, e.C_ADDRESS, e.C_COMMENT)),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'C_CUSTKEY': ('C_CUSTKEY', 'int'),
                 'C_NAME': ('C_NAME', 'char(25)'),
                 'SUM': ('SUM', 'double'),
                 'C_ACCTBAL': ('C_ACCTBAL', 'double'),
                 'C_PHONE': ('C_PHONE', 'char(15)'),
                 'N_NAME': ('N_NAME', 'char(25)'),
                 'C_ADDRESS': ('C_ADDRESS', 'char(40)'),
                 'C_COMMENT': ('C_COMMENT', 'char(117)')}) \
        .finalize()

    '''
    select n_name, sum(l_extendedprice * (1 - l_discount)) as revenue
    from customer, orders, lineitem, supplier, nation, region
    where c_custkey = o_custkey and l_orderkey = o_orderkey
      and l_suppkey = s_suppkey and c_nationkey = s_nationkey
      and s_nationkey = n_nationkey and n_regionkey = r_regionkey
      and r_name = 'ASIA'
      and o_orderdate >= 19940101 and o_orderdate < 19950101
    group by n_name
    '''
    query5 = db.query().fromTable('customer') \
        .join(db.query().fromTable('orders'),
              rhsSchema=db.relationSchema('orders'),
              method='block-nested-loops', expr='C_CUSTKEY == O_CUSTKEY') \
        .join(db.query().fromTable('lineitem'),
              rhsSchema=db.relationSchema('lineitem'),
              method='block-nested-loops', expr='L_ORDERKEY == O_ORDERKEY') \
        .join(db.query().fromTable('supplier'),
              rhsSchema=db.relationSchema('supplier'),
              method='block-nested-loops',
              expr='L_SUPPKEY == S_SUPPKEY and C_NATIONKEY == S_NATIONKEY') \
        .join(db.query().fromTable('nation'),
              rhsSchema=db.relationSchema('nation'),
              method='block-nested-loops', expr='S_NATIONKEY == N_NATIONKEY') \
        .join(db.query().fromTable('region'),
              rhsSchema=db.relationSchema('region'),
              method='block-nested-loops', expr='N_REGIONKEY == R_REGIONKEY') \
        .where('R_NAME == \'ASIA\'') \
        .where('O_ORDERDATE >= 19940101') \
        .where('O_ORDERDATE < 19950101') \
        .groupBy(groupSchema=DBSchema('FIRST', [('N_NAME', 'char(25)')]),
                 aggSchema=DBSchema('SUM', [('SUM', 'double')]),
                 groupExpr=(lambda e: e.N_NAME),
                 aggExprs=[(0, lambda acc, e: acc + e.L_EXTENDEDPRICE * (1 - e.L_DISCOUNT), lambda x: x)],
                 groupHashFn=(lambda gbVal: hash(gbVal[0]) % 111)) \
        .select({'N_NAME': ('N_NAME', 'char(25)'), 'SUM': ('SUM', 'double')}) \
        .finalize()

    queries = [query1, query2, query3, query4, query5]
    names = ['query1', 'query2', 'query3', 'query4', 'query5']
    opt = Optimizer.Optimizer(db)
    for i, q in enumerate(queries):
        querytest(db, printOutput, q, names[i], opt)
from Storage.Page import Page
from Storage.SlottedPage import SlottedPage
from Storage.File import StorageFile
from Storage.FileManager import FileManager
from Storage.BufferPool import BufferPool
from Catalog.Identifiers import FileId, PageId, TupleId
from Catalog.Schema import DBSchema

import sys
import unittest

# A schema to work with.
schema = DBSchema('employee', [('id', 'int'), ('age', 'int')])

# Initialize a buffer pool and file manager.
def makeDB():
    bp = BufferPool()
    fm = FileManager(bufferPool=bp)
    bp.setFileManager(fm)
    return (bp, fm)

# Make an employee tuple.
def makeEmployee(n):
    return schema.instantiate(n, 25 + n)

if __name__ == "__main__":
    print("Creating")
    (bp, fm) = makeDB()
    fm.createRelation(schema.name, schema)
    (fId, f) = fm.relationFile(schema.name)
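    # A hedged continuation of the harness above: insert a few employees and
    # report the resulting page count. This assumes StorageFile.insertTuple
    # accepts packed tuples, matching how the operators elsewhere in this
    # document write to relation files.
    for n in range(10):
        f.insertTuple(schema.pack(makeEmployee(n)))
    print("Pages after inserts:", f.numPages())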