#
# * Returns the subpredicate consisting of terms that apply
# * to the union of the two specified schemas,
# * but not to either schema separately.
#
def joinSubPred(self, sch1, sch2):
    result = Predicate()
    newsch = Schema()
    newsch.addAll(sch1)
    newsch.addAll(sch2)
    for t in self.terms:
        if not t.appliesTo(sch1) and not t.appliesTo(sch2) and t.appliesTo(newsch):
            result.terms.append(t)
    if len(result.terms) == 0:
        return None
    else:
        return result
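# A self-contained sketch of the term-splitting idea behind joinSubPred.
# The Stub classes below only mimic the small slice of the Schema/Term API
# that the method touches; they are illustrative stand-ins, not the real
# SimpleDB classes.
class StubSchema:
    def __init__(self, fields=()):
        self.flds = set(fields)

    def addAll(self, other):
        self.flds |= other.flds


class StubTerm:
    def __init__(self, *fields):
        self.fields = set(fields)

    def appliesTo(self, sch):
        # a term applies to a schema if every field it mentions is in it
        return self.fields <= sch.flds


sch1 = StubSchema({"a", "b"})
sch2 = StubSchema({"c", "d"})
newsch = StubSchema()
newsch.addAll(sch1)
newsch.addAll(sch2)

terms = [StubTerm("a", "b"),   # applies to sch1 alone -> selection term
         StubTerm("c"),        # applies to sch2 alone -> selection term
         StubTerm("a", "c")]   # needs fields from both -> join term

# the same filter joinSubPred performs
jointerms = [t for t in terms
             if not t.appliesTo(sch1) and not t.appliesTo(sch2)
             and t.appliesTo(newsch)]
print(len(jointerms))  # 1 -> only the term over {"a", "c"} is a join term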
class ProductPlan(Plan):
    #
    # * Creates a new product node in the query tree,
    # * having the two specified subqueries.
    # * @param p1 the left-hand subquery
    # * @param p2 the right-hand subquery
    #
    def __init__(self, p1, p2):
        super(ProductPlan, self).__init__()
        self.p1 = p1
        self.p2 = p2
        self._schema = Schema()
        self._schema.addAll(p1.schema())
        self._schema.addAll(p2.schema())

    #
    # * Creates a product scan for this query.
    # * @see Plan#open()
    #
    def open(self):
        s1 = self.p1.open()
        s2 = self.p2.open()
        return ProductScan(s1, s2)

    #
    # * Estimates the number of block accesses in the product.
    # * The formula is:
    # * <pre> B(product(p1,p2)) = B(p1) + R(p1)*B(p2) </pre>
    # * @see Plan#blocksAccessed()
    #
    def blocksAccessed(self):
        return self.p1.blocksAccessed() + (self.p1.recordsOutput() * self.p2.blocksAccessed())

    #
    # * Estimates the number of output records in the product.
    # * The formula is:
    # * <pre> R(product(p1,p2)) = R(p1)*R(p2) </pre>
    # * @see Plan#recordsOutput()
    #
    def recordsOutput(self):
        return self.p1.recordsOutput() * self.p2.recordsOutput()

    #
    # * Estimates the distinct number of field values in the product.
    # * Since the product does not increase or decrease field values,
    # * the estimate is the same as in the appropriate underlying query.
    # * @see Plan#distinctValues(String)
    #
    def distinctValues(self, fldname):
        if self.p1.schema().hasField(fldname):
            return self.p1.distinctValues(fldname)
        else:
            return self.p2.distinctValues(fldname)

    #
    # * Returns the schema of the product,
    # * which is the union of the schemas of the underlying queries.
    # * @see Plan#schema()
    #
    def schema(self):
        return self._schema
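# A worked example of the product cost formula above, using hypothetical
# statistics; StatPlan is an illustrative stand-in for any Plan and is not
# part of the codebase.
class StatPlan:
    def __init__(self, blocks, records):
        self._blocks = blocks
        self._records = records

    def blocksAccessed(self):
        return self._blocks

    def recordsOutput(self):
        return self._records


p1 = StatPlan(blocks=100, records=1000)   # assumed: B(p1)=100, R(p1)=1000
p2 = StatPlan(blocks=50, records=500)     # assumed: B(p2)=50,  R(p2)=500

# B(product(p1,p2)) = B(p1) + R(p1)*B(p2) = 100 + 1000*50 = 50,100
cost12 = p1.blocksAccessed() + p1.recordsOutput() * p2.blocksAccessed()
# B(product(p2,p1)) = B(p2) + R(p2)*B(p1) = 50 + 500*100 = 50,050
cost21 = p2.blocksAccessed() + p2.recordsOutput() * p1.blocksAccessed()
print(cost12, cost21)  # the formula is asymmetric: the order of the subplans matters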
class MultibufferProductPlan(Plan):
    #
    # * Creates a product plan for the specified queries.
    # * @param lhs the plan for the LHS query
    # * @param rhs the plan for the RHS query
    # * @param tx the calling transaction
    #
    def __init__(self, tx, lhs, rhs):
        super(MultibufferProductPlan, self).__init__()
        self.tx = tx
        self.lhs = MaterializePlan(tx, lhs)
        self.rhs = rhs
        # use a private name so the attribute does not shadow the schema() method
        self._schema = Schema()
        self._schema.addAll(lhs.schema())
        self._schema.addAll(rhs.schema())

    #
    # * A scan for this query is created and returned, as follows.
    # * First, the method materializes its LHS and RHS queries.
    # * It then determines the optimal chunk size,
    # * based on the size of the materialized RHS file and the
    # * number of available buffers.
    # * It creates a chunk plan for each chunk, saving them in a list.
    # * Finally, it creates a multiscan for this list of plans,
    # * and returns that scan.
    # * @see Plan#open()
    #
    def open(self):
        leftscan = self.lhs.open()
        tt = self.copyRecordsFrom(self.rhs)
        return MultibufferProductScan(self.tx, leftscan, tt.tableName(), tt.getLayout())

    #
    # * Returns an estimate of the number of block accesses
    # * required to execute the query. The formula is:
    # * <pre> B(product(p1,p2)) = B(p2) + B(p1)*C(p2) </pre>
    # * where C(p2) is the number of chunks of p2.
    # * The method uses the current number of available buffers
    # * to calculate C(p2), and so this value may differ
    # * when the query scan is opened.
    # * @see Plan#blocksAccessed()
    #
    def blocksAccessed(self):
        # this guesses at the # of chunks
        avail = self.tx.availableBuffs()
        size = MaterializePlan(self.tx, self.rhs).blocksAccessed()
        numchunks = int(size / avail)
        return self.rhs.blocksAccessed() + (self.lhs.blocksAccessed() * numchunks)

    #
    # * Estimates the number of output records in the product.
    # * The formula is:
    # * <pre> R(product(p1,p2)) = R(p1)*R(p2) </pre>
    # * @see Plan#recordsOutput()
    #
    def recordsOutput(self):
        return self.lhs.recordsOutput() * self.rhs.recordsOutput()

    #
    # * Estimates the distinct number of field values in the product.
    # * Since the product does not increase or decrease field values,
    # * the estimate is the same as in the appropriate underlying query.
    # * @see Plan#distinctValues(String)
    #
    def distinctValues(self, fldname):
        if self.lhs.schema().hasField(fldname):
            return self.lhs.distinctValues(fldname)
        else:
            return self.rhs.distinctValues(fldname)

    #
    # * Returns the schema of the product,
    # * which is the union of the schemas of the underlying queries.
    # * @see Plan#schema()
    #
    def schema(self):
        return self._schema

    def copyRecordsFrom(self, p):
        src = p.open()
        sch = p.schema()
        t = TempTable(self.tx, sch)
        dest = t.open()
        while src.next():
            dest.insert()
            for fldname in sch.fields():
                dest.setVal(fldname, src.getVal(fldname))
        src.close()
        dest.close()
        return t
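# A back-of-the-envelope illustration of the chunk estimate in
# MultibufferProductPlan.blocksAccessed(); all numbers are assumptions.
avail = 10            # buffers currently available to the transaction
rhs_blocks = 1000     # B(p2): size of the materialized RHS
lhs_blocks = 200      # B(p1)

numchunks = rhs_blocks // avail               # C(p2) = 100 chunks
cost = rhs_blocks + lhs_blocks * numchunks    # B(p2) + B(p1)*C(p2) = 1000 + 200*100
print(numchunks, cost)                        # 100 21000
# With more available buffers the RHS fits in fewer chunks, so the LHS is
# rescanned fewer times and the estimate drops accordingly.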
class MergeJoinPlan(Plan):
    #
    # * Creates a mergejoin plan for the two specified queries.
    # * The RHS must be materialized after it is sorted,
    # * in order to deal with possible duplicates.
    # * @param p1 the LHS query plan
    # * @param p2 the RHS query plan
    # * @param fldname1 the LHS join field
    # * @param fldname2 the RHS join field
    # * @param tx the calling transaction
    #
    def __init__(self, tx, p1, p2, fldname1, fldname2):
        super(MergeJoinPlan, self).__init__()
        self.fldname1 = fldname1
        sortlist1 = [fldname1]
        self.p1 = SortPlan(tx, p1, sortlist1)
        self.fldname2 = fldname2
        sortlist2 = [fldname2]
        self.p2 = SortPlan(tx, p2, sortlist2)
        self.sch = Schema()
        self.sch.addAll(p1.schema())
        self.sch.addAll(p2.schema())

    #
    # * The method first sorts its two underlying scans
    # * on their join field. It then returns a mergejoin scan
    # * of the two sorted table scans.
    # * @see Plan#open()
    #
    def open(self):
        s1 = self.p1.open()
        s2 = self.p2.open()
        return MergeJoinScan(s1, s2, self.fldname1, self.fldname2)

    #
    # * Return the number of block accesses required to
    # * mergejoin the sorted tables.
    # * Since a mergejoin can be performed with a single
    # * pass through each table, the method returns
    # * the sum of the block accesses of the
    # * materialized sorted tables.
    # * It does <i>not</i> include the one-time cost
    # * of materializing and sorting the records.
    # * @see Plan#blocksAccessed()
    #
    def blocksAccessed(self):
        return self.p1.blocksAccessed() + self.p2.blocksAccessed()

    #
    # * Return the number of records in the join.
    # * Assuming uniform distribution, the formula is:
    # * <pre> R(join(p1,p2)) = R(p1)*R(p2)/max{V(p1,F1),V(p2,F2)} </pre>
    # * @see Plan#recordsOutput()
    #
    def recordsOutput(self):
        maxvals = max(self.p1.distinctValues(self.fldname1),
                      self.p2.distinctValues(self.fldname2))
        return int((self.p1.recordsOutput() * self.p2.recordsOutput()) / maxvals)

    #
    # * Estimate the distinct number of field values in the join.
    # * Since the join does not increase or decrease field values,
    # * the estimate is the same as in the appropriate underlying query.
    # * @see Plan#distinctValues(String)
    #
    def distinctValues(self, fldname):
        if self.p1.schema().hasField(fldname):
            return self.p1.distinctValues(fldname)
        else:
            return self.p2.distinctValues(fldname)

    #
    # * Return the schema of the join,
    # * which is the union of the schemas of the underlying queries.
    # * @see Plan#schema()
    #
    def schema(self):
        return self.sch
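# A worked instance of the mergejoin record estimate; the statistics are
# assumed values, not drawn from any catalog.
r1, r2 = 10_000, 2_000     # R(p1), R(p2)
v1, v2 = 50, 40            # V(p1,F1), V(p2,F2): distinct join-field values

# R(join(p1,p2)) = R(p1)*R(p2) / max{V(p1,F1), V(p2,F2)} = 20,000,000 / 50
estimate = (r1 * r2) // max(v1, v2)
print(estimate)            # 400000

# The block estimate is just one pass over each sorted, materialized table,
# i.e. B(p1) + B(p2); the sorting cost itself is paid once, before the scan.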
class IndexJoinPlan(Plan):
    #
    # * Implements the join operator,
    # * using the specified LHS and RHS plans.
    # * @param p1 the left-hand plan
    # * @param p2 the right-hand plan
    # * @param ii information about the right-hand index
    # * @param joinfield the left-hand field used for joining
    #
    def __init__(self, p1, p2, ii, joinfield):
        super(IndexJoinPlan, self).__init__()
        self.p1 = p1
        self.p2 = p2
        self.ii = ii
        self.joinfield = joinfield
        self.sch = Schema()
        self.sch.addAll(p1.schema())
        self.sch.addAll(p2.schema())

    #
    # * Opens an indexjoin scan for this query.
    # * @see Plan#open()
    #
    def open(self):
        s = self.p1.open()
        # throws an exception if p2 is not a tableplan
        ts = self.p2.open()
        idx = self.ii.open()
        return IndexJoinScan(s, idx, self.joinfield, ts)

    #
    # * Estimates the number of block accesses to compute the join.
    # * The formula is:
    # * <pre> B(indexjoin(p1,p2,idx)) = B(p1) + R(p1)*B(idx)
    # *       + R(indexjoin(p1,p2,idx)) </pre>
    # * @see Plan#blocksAccessed()
    #
    def blocksAccessed(self):
        return (self.p1.blocksAccessed()
                + (self.p1.recordsOutput() * self.ii.blocksAccessed())
                + self.recordsOutput())

    #
    # * Estimates the number of output records in the join.
    # * The formula is:
    # * <pre> R(indexjoin(p1,p2,idx)) = R(p1)*R(idx) </pre>
    # * @see Plan#recordsOutput()
    #
    def recordsOutput(self):
        return self.p1.recordsOutput() * self.ii.recordsOutput()

    #
    # * Estimates the number of distinct values for the
    # * specified field.
    # * @see Plan#distinctValues(String)
    #
    def distinctValues(self, fldname):
        if self.p1.schema().hasField(fldname):
            return self.p1.distinctValues(fldname)
        else:
            return self.p2.distinctValues(fldname)

    #
    # * Returns the schema of the index join.
    # * @see Plan#schema()
    #
    def schema(self):
        return self.sch
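# A rough cost comparison, under assumed statistics, of the indexjoin
# formula against a plain product over the same inputs.
b_p1, r_p1 = 100, 1_000    # B(p1), R(p1)
b_idx = 3                  # B(idx): blocks traversed per index probe (assumed)
r_join = 1_000             # R(indexjoin(p1,p2,idx)): assume one match per probe
b_p2 = 500                 # B(p2)

# B(indexjoin(p1,p2,idx)) = B(p1) + R(p1)*B(idx) + R(indexjoin(p1,p2,idx))
indexjoin_cost = b_p1 + r_p1 * b_idx + r_join     # 100 + 3000 + 1000 = 4100
# B(product(p1,p2)) = B(p1) + R(p1)*B(p2)
product_cost = b_p1 + r_p1 * b_p2                 # 100 + 500000 = 500100
print(indexjoin_cost, product_cost)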