def yield_rows(self):
    """Generate a summary row for each combination of conditioning columns.

    Every combination produced by Utils.combinations() over
    self.condcols.veckey_pairs(self.zeros) whose length is listed in
    self.levels yields a row carrying the level, suppress flag,
    type string, column names/values, record count and extract.

    NOTE: a single SummaryRow instance is reused and mutated for every
    yield, so callers that need to keep a row must copy it before
    advancing the generator.
    """
    out = SummaryRow()
    total_isect = 0.0
    combos = Utils.combinations(*self.condcols.veckey_pairs(self.zeros))
    for combo in combos:
        out.level = len(combo)
        out.suppress = False
        if out.level not in self.levels:
            continue
        out.type_string = ['0'] * len(self.condcols)
        names, values, row_vectors = [], [], []
        for value, vector, hide, col in combo:
            # Flag the columns participating in this combination.
            out.type_string[col.index] = '1'
            row_vectors.append(vector)
            names.append(col.name)
            values.append(value)
            if hide:
                out.suppress = True
        # Only the row selection/intersection work is timed.
        started = time.time()
        if not row_vectors:
            # Empty combination: the row covers the whole filtered dataset.
            out.count = len(self.filtered_ds)
            out.extract = self.filtered_ds
        else:
            if len(row_vectors) > 1:
                cell = soomfunc.intersect(*row_vectors)
            else:
                cell = row_vectors[0]
            out.count = len(cell)
            out.extract = DatasetTake(self.dataset, cell)
        total_isect += time.time() - started
        out.colnames = tuple(names)
        out.colvalues = tuple(values)
        yield out
    soom.info('Summarise intersect() time: %.3f' % total_isect)
Esempio n. 2
0
 def near(self, lhsrowwords, rhsrowwords, mask=_BEFORE | _AFTER):
     """Keep word hits from both operands that lie within self.nearness.

     Each operand is indexed as (rows, words): element 0 is a row
     vector and element 1 maps a row to its word hits.  Only rows
     present in both operands are examined.  `mask` selects which
     orderings count: with _BEFORE a left hit may sit up to `nearness`
     below the right hit, with _AFTER up to `nearness` above it.

     Returns (rows, words): a Numeric Int array of the rows that still
     have hits, and a dict mapping each such row to a sorted Numeric Int
     array of its hits.
     """
     # only check matching rows
     common = soomfunc.intersect(lhsrowwords[0], rhsrowwords[0])
     left_words = lhsrowwords[1]
     right_words = rhsrowwords[1]
     want_before = mask & self._BEFORE
     want_after = mask & self._AFTER
     span = self.nearness
     words = {}
     for row in common:
         found = sets.Set()
         # this is O(n*n) and could be improved
         # the set removes duplicate hits
         for left in left_words[row]:
             for right in right_words[row]:
                 before = want_before and right - span <= left <= right
                 after = want_after and right <= left <= right + span
                 if before or after:
                     found.add(left)
                     found.add(right)
         if not found:
             continue
         ordered = list(found)
         ordered.sort()
         words[row] = Numeric.array(ordered, Numeric.Int)
     # drop rows that have no hits left
     kept = [r for r in common if r in words]
     return Numeric.array(kept, Numeric.Int), words
Esempio n. 3
0
 def yield_rows(self):
     """Generate a summary row for each combination of conditioning columns.

     Every combination produced by Utils.combinations() over
     self.condcols.veckey_pairs(self.zeros) whose length is listed in
     self.levels yields a row carrying the level, suppress flag,
     type string, column names/values, record count and extract.

     NOTE: a single SummaryRow instance is reused and mutated for every
     yield, so callers that need to keep a row must copy it before
     advancing the generator.
     """
     out = SummaryRow()
     total_isect = 0.0
     combos = Utils.combinations(*self.condcols.veckey_pairs(self.zeros))
     for combo in combos:
         out.level = len(combo)
         out.suppress = False
         if out.level not in self.levels:
             continue
         out.type_string = ['0'] * len(self.condcols)
         names, values, row_vectors = [], [], []
         for value, vector, hide, col in combo:
             # Flag the columns participating in this combination.
             out.type_string[col.index] = '1'
             row_vectors.append(vector)
             names.append(col.name)
             values.append(value)
             if hide:
                 out.suppress = True
         # Only the row selection/intersection work is timed.
         started = time.time()
         if not row_vectors:
             # Empty combination: the row covers the whole filtered dataset.
             out.count = len(self.filtered_ds)
             out.extract = self.filtered_ds
         else:
             if len(row_vectors) > 1:
                 cell = soomfunc.intersect(*row_vectors)
             else:
                 cell = row_vectors[0]
             out.count = len(cell)
             out.extract = DatasetTake(self.dataset, cell)
         total_isect += time.time() - started
         out.colnames = tuple(names)
         out.colvalues = tuple(values)
         yield out
     soom.info('Summarise intersect() time: %.3f' % total_isect)
Esempio n. 4
0
 def factor(self):
     """Parse `comparison ("and" comparison)*` and intersect the results.

     Each comparison result is combined with the running result through
     soomfunc.intersect(); parsing stops at the first lookahead token
     that is not "and".
     """
     result = self.comparison()
     while True:
         lookahead = self._peek('"and"', '"or"', 'END', '"\\\\)"')
         if lookahead != '"and"':
             break
         self._scan('"and"')
         operand = self.comparison()
         result = soomfunc.intersect(result, operand)
     return result
Esempio n. 5
0
 def factor(self):
     """Parse `comparison ("and" comparison)*` and intersect the results.

     Each comparison result is combined with the running result through
     soomfunc.intersect(); parsing stops at the first lookahead token
     that is not "and".
     """
     result = self.comparison()
     while True:
         lookahead = self._peek('"and"', '"or"', 'END', '"\\\\)"')
         if lookahead != '"and"':
             break
         self._scan('"and"')
         operand = self.comparison()
         result = soomfunc.intersect(result, operand)
     return result
Esempio n. 6
0
 def intersect(self, lhsrowwords, rhsrowwords):
     """Combine two (rows, words) operands, keeping rows common to both.

     Element 0 of each operand is a row vector and element 1 maps a row
     to its word hits.  Returns (rows, words) where rows is the
     soomfunc.intersect() of both row vectors and words maps each of
     those rows to the soomfunc.union() of the two operands' hits.
     """
     # find matching rows
     common = soomfunc.intersect(lhsrowwords[0], rhsrowwords[0])
     left_words = lhsrowwords[1]
     right_words = rhsrowwords[1]
     # amalgamate word hits
     merged = dict([(r, soomfunc.union(left_words[r], right_words[r]))
                    for r in common])
     return common, merged
Esempio n. 7
0
 def op_between(self, value, filter_keys):
     """Return the indexes of self.data entries with start <= x < end.

     value -- a 2-sequence (start, end); the range is half-open, so
              `end` itself is excluded.
     filter_keys -- not used by this operator.

     Raises ExpressionError if `value` cannot be unpacked into exactly
     two items.
     """
     try:
         start, end = value
     except (ValueError, TypeError):
         raise ExpressionError('between(start, end)')
     if type(self.data) is MA.MaskedArray:
         # NOTE(review): greater_equal/less are unqualified here -
         # presumably the MA versions imported at module level; confirm.
         resmap_ge = greater_equal(self.data, start).filled()
         resmap_lt = less(self.data, end).filled()
     else:
         resmap_ge = Numeric.greater_equal(self.data, start)
         resmap_lt = Numeric.less(self.data, end)
     # Fixed: the second nonzero() call was spelled "NUmeric", which
     # raised NameError whenever this operator was evaluated.
     vectors = soomfunc.intersect(Numeric.nonzero(resmap_ge),
                                  Numeric.nonzero(resmap_lt))
     return vectors
Esempio n. 8
0
 def follow(self, lhsrowwords, rhsrowwords):
     """Keep hits where a right-hand hit immediately follows a left one.

     Each operand is indexed as (rows, words): element 0 is a row
     vector and element 1 maps a row to its word hits.  For rows
     present in both operands, a pair is kept when right == left + 1.

     Returns (rows, words): a Numeric Int array of the rows that still
     have hits, and a dict mapping each such row to a sorted Numeric Int
     array of its hits.
     """
     # find matching rows
     common = soomfunc.intersect(lhsrowwords[0], rhsrowwords[0])
     left_words = lhsrowwords[1]
     right_words = rhsrowwords[1]
     # amalgamate word hits
     words = {}
     for row in common:
         found = sets.Set()
         # this is O(n*n) and could be improved
         # the set removes duplicate hits
         for left in left_words[row]:
             for right in right_words[row]:
                 if left + 1 == right:
                     found.add(left)
                     found.add(right)
         if not found:
             continue
         ordered = list(found)
         ordered.sort()
         words[row] = Numeric.array(ordered, Numeric.Int)
     # drop rows that have no hits left
     kept = [r for r in common if r in words]
     return Numeric.array(kept, Numeric.Int), words
Esempio n. 9
0
 def get_inverted(self):
     """Build an inverted index restricted to self._all_record_ids.

     Returns a dict mapping every key of self._src_col.inverted to the
     soomfunc.intersect() of that key's vector with
     self._all_record_ids.
     """
     restricted = {}
     for key in self._src_col.inverted.keys():
         source_rows = self._src_col.inverted[key]
         restricted[key] = soomfunc.intersect(source_rows,
                                              self._all_record_ids)
     return restricted
Esempio n. 10
0
                print "  >%8d: %d" % (pow(10, v) - 1, c)
        

if 1:
    calls = 0
    result = []
    vl = stats('vector length distribution')
    cvl = stats('cache vector length distribution')
    for veckey in combinations(*values):
        if len(veckey) > 0:
            values, rows = zip(*veckey)
            if len(rows) == 1:
                rows = rows[0]
            else:
                calls += 1
                rows = intersect(*rows)
                vl.add(len(rows))
            if len(values) > 1 and len(values) < len(cols):
                cvl.add(len(rows))
#            result.append((values, rows))
    elapsed = time() - st
    print '%d intersect calls, %.3f sec, %.3f sec per call' % (
        calls, elapsed, elapsed / calls)
    vl.report()
    cvl.report()

if 0:
    calls = 0
    cache = {}
    for veckey in combinations(*values):
        if len(veckey) > 0:
Esempio n. 11
0
                print "  >%8d: %d" % (pow(10, v) - 1, c)


if 1:
    calls = 0
    result = []
    vl = stats('vector length distribution')
    cvl = stats('cache vector length distribution')
    for veckey in combinations(*values):
        if len(veckey) > 0:
            values, rows = zip(*veckey)
            if len(rows) == 1:
                rows = rows[0]
            else:
                calls += 1
                rows = intersect(*rows)
                vl.add(len(rows))
            if len(values) > 1 and len(values) < len(cols):
                cvl.add(len(rows))
#            result.append((values, rows))
    elapsed = time() - st
    print '%d intersect calls, %.3f sec, %.3f sec per call' % (calls, elapsed,
                                                               elapsed / calls)
    vl.report()
    cvl.report()

if 0:
    calls = 0
    cache = {}
    for veckey in combinations(*values):
        if len(veckey) > 0:
Esempio n. 12
0
 def get_inverted(self):
     """Build an inverted index restricted to self._all_record_ids.

     Returns a dict mapping every key of self._src_col.inverted to the
     soomfunc.intersect() of that key's vector with
     self._all_record_ids.
     """
     restricted = {}
     for key in self._src_col.inverted.keys():
         source_rows = self._src_col.inverted[key]
         restricted[key] = soomfunc.intersect(source_rows,
                                              self._all_record_ids)
     return restricted
Esempio n. 13
0
def isect(a, b):  # redundant, I think.
    """Return soomfunc.intersect(a, b).

    Fixed: the parameters were previously ignored and two unrelated
    module-level vectors were intersected instead, so the result did
    not depend on the arguments.
    """
    return soomfunc.intersect(a, b)