def yield_rows(self):
    """Generate one summary row per conditioning-column combination.

    Walks every combination produced by ``Utils.combinations`` over
    ``self.condcols.veckey_pairs(self.zeros)``.  Combinations whose size
    is not listed in ``self.levels`` are skipped.  For the others the row
    vectors of the participating columns are intersected to obtain the
    cell's row count and its data extract.

    NOTE: a single ``SummaryRow`` instance is mutated and yielded again
    and again, so callers must consume (or copy) each row before asking
    for the next one.

    The accumulated time spent computing intersections is reported via
    ``soom.info`` once the generator is exhausted.
    """
    row = SummaryRow()
    elapsed = 0.0
    combos = Utils.combinations(*self.condcols.veckey_pairs(self.zeros))
    for combo in combos:
        row.level = len(combo)
        row.suppress = False
        if row.level not in self.levels:
            continue

        # One flag per conditioning column; '1' marks a column that takes
        # part in this combination.
        row.type_string = ['0'] * len(self.condcols)
        names = []
        values = []
        vectors = []
        for value, value_rows, suppress, condcol in combo:
            row.type_string[condcol.index] = '1'
            vectors.append(value_rows)
            names.append(condcol.name)
            values.append(value)
            if suppress:
                # A single suppressed member suppresses the whole row.
                row.suppress = True

        started = time.time()
        if not vectors:
            # Empty combination: the cell is the whole filtered dataset.
            row.count = len(self.filtered_ds)
            row.extract = self.filtered_ds
        else:
            # A lone vector needs no intersect() call.
            cellrows = vectors[0]
            if len(vectors) != 1:
                cellrows = soomfunc.intersect(*vectors)
            row.count = len(cellrows)
            row.extract = DatasetTake(self.dataset, cellrows)
        elapsed += time.time() - started

        row.colnames = tuple(names)
        row.colvalues = tuple(values)
        yield row
    soom.info('Summarise intersect() time: %.3f' % elapsed)
def near(self, lhsrowwords, rhsrowwords, mask=_BEFORE | _AFTER):
    """Keep the hits of two results that lie within ``self.nearness``.

    lhsrowwords, rhsrowwords -- indexable pairs: item 0 is a row vector
        accepted by ``soomfunc.intersect``, item 1 maps a row to the
        sequence of numeric hits for that row (presumably word
        positions -- confirm against the caller).
    mask -- bit flags.  With ``self._BEFORE`` set, a left/right pair
        matches when ``right - nearness <= left <= right``; with
        ``self._AFTER`` set, when ``right <= left <= right + nearness``.

    Returns ``(rows, words)``: ``rows`` is a Numeric Int array of the
    common rows that produced at least one match, and ``words`` maps
    each such row to a sorted Numeric Int array of the matching hits
    from both sides, without duplicates.
    """
    # Only rows present on both sides can contain a match.
    rows = soomfunc.intersect(lhsrowwords[0], rhsrowwords[0])
    lhswords = lhsrowwords[1]
    rhswords = rhsrowwords[1]
    words = {}
    before_bit = self._BEFORE
    after_bit = self._AFTER
    distance = self.nearness
    for row in rows:
        matched = sets.Set()
        # Pairwise comparison, O(n*n) in the number of hits per row; the
        # set takes care of duplicates.
        for left in lhswords[row]:
            for right in rhswords[row]:
                is_before = (mask & before_bit) and right - distance <= left <= right
                is_after = (mask & after_bit) and right <= left <= right + distance
                if is_before or is_after:
                    matched.add(left)
                    matched.add(right)
        if matched:
            ordered = list(matched)
            ordered.sort()
            words[row] = Numeric.array(ordered, Numeric.Int)
    # Drop the rows that ended up without any hit.
    rows = Numeric.array([r for r in rows if r in words], Numeric.Int)
    return rows, words
def yield_rows(self):
    """Generate one summary row per conditioning-column combination.

    Walks every combination produced by ``Utils.combinations`` over
    ``self.condcols.veckey_pairs(self.zeros)``.  Combinations whose size
    is not listed in ``self.levels`` are skipped.  For the others the row
    vectors of the participating columns are intersected to obtain the
    cell's row count and its data extract.

    NOTE: a single ``SummaryRow`` instance is mutated and yielded again
    and again, so callers must consume (or copy) each row before asking
    for the next one.

    The accumulated time spent computing intersections is reported via
    ``soom.info`` once the generator is exhausted.
    """
    row = SummaryRow()
    elapsed = 0.0
    combos = Utils.combinations(*self.condcols.veckey_pairs(self.zeros))
    for combo in combos:
        row.level = len(combo)
        row.suppress = False
        if row.level not in self.levels:
            continue

        # One flag per conditioning column; '1' marks a column that takes
        # part in this combination.
        row.type_string = ['0'] * len(self.condcols)
        names = []
        values = []
        vectors = []
        for value, value_rows, suppress, condcol in combo:
            row.type_string[condcol.index] = '1'
            vectors.append(value_rows)
            names.append(condcol.name)
            values.append(value)
            if suppress:
                # A single suppressed member suppresses the whole row.
                row.suppress = True

        started = time.time()
        if not vectors:
            # Empty combination: the cell is the whole filtered dataset.
            row.count = len(self.filtered_ds)
            row.extract = self.filtered_ds
        else:
            # A lone vector needs no intersect() call.
            cellrows = vectors[0]
            if len(vectors) != 1:
                cellrows = soomfunc.intersect(*vectors)
            row.count = len(cellrows)
            row.extract = DatasetTake(self.dataset, cellrows)
        elapsed += time.time() - started

        row.colnames = tuple(names)
        row.colvalues = tuple(values)
        yield row
    soom.info('Summarise intersect() time: %.3f' % elapsed)
def factor(self):
    """Parse ``comparison ("and" comparison)*`` and intersect the results.

    The first comparison seeds the result.  While the next token is
    ``"and"`` it is consumed, another comparison is parsed, and its
    result is intersected with what has been accumulated so far.
    """
    result = self.comparison()
    while True:
        upcoming = self._peek('"and"', '"or"', 'END', '"\\\\)"')
        if upcoming != '"and"':
            break
        self._scan('"and"')
        operand = self.comparison()
        result = soomfunc.intersect(result, operand)
    return result
def intersect(self, lhsrowwords, rhsrowwords):
    """Combine two ``(rows, words)`` results, keeping rows found in both.

    lhsrowwords, rhsrowwords -- indexable pairs: item 0 is a row vector
        accepted by ``soomfunc.intersect``, item 1 maps a row to that
        row's hits.

    Returns ``(rows, words)`` where ``rows`` is the intersection of the
    two row vectors and ``words`` maps every common row to the
    ``soomfunc.union`` of the hits recorded for it on each side.
    """
    # Rows that appear on both sides.
    rows = soomfunc.intersect(lhsrowwords[0], rhsrowwords[0])
    left_hits = lhsrowwords[1]
    right_hits = rhsrowwords[1]
    # Merge the per-row hits of both sides.
    words = dict([(row, soomfunc.union(left_hits[row], right_hits[row]))
                  for row in rows])
    return rows, words
def op_between(self, value, filter_keys):
    """Return the indices where ``start <= self.data < end``.

    value -- a two-item sequence ``(start, end)``.  The interval is
        half-open: ``start`` is included, ``end`` is excluded.
    filter_keys -- not used by this operator.

    Raises ExpressionError when ``value`` cannot be unpacked into
    exactly two items.

    Returns the result of ``soomfunc.intersect`` applied to the non-zero
    indices of the two element-wise comparisons.
    """
    try:
        start, end = value
    except (ValueError, TypeError):
        raise ExpressionError('between(start, end)')
    if type(self.data) is MA.MaskedArray:
        # Masked data: compare with the masked-array functions, then
        # convert each result to a plain array with filled().
        resmap_ge = greater_equal(self.data, start).filled()
        resmap_lt = less(self.data, end).filled()
    else:
        resmap_ge = Numeric.greater_equal(self.data, start)
        resmap_lt = Numeric.less(self.data, end)
    # Bug fix: the second call used to read "NUmeric.nonzero", which
    # raised NameError on every invocation that reached this point.
    vectors = soomfunc.intersect(Numeric.nonzero(resmap_ge),
                                 Numeric.nonzero(resmap_lt))
    return vectors
def follow(self, lhsrowwords, rhsrowwords):
    """Keep the hits where a right-hand hit directly follows a left one.

    lhsrowwords, rhsrowwords -- indexable pairs: item 0 is a row vector
        accepted by ``soomfunc.intersect``, item 1 maps a row to the
        sequence of numeric hits for that row (presumably word
        positions -- confirm against the caller).

    A left/right pair matches when ``right == left + 1``.

    Returns ``(rows, words)``: ``rows`` is a Numeric Int array of the
    common rows that produced at least one match, and ``words`` maps
    each such row to a sorted Numeric Int array of the matching hits
    from both sides, without duplicates.
    """
    # Only rows present on both sides can contain a match.
    rows = soomfunc.intersect(lhsrowwords[0], rhsrowwords[0])
    left_hits = lhsrowwords[1]
    right_hits = rhsrowwords[1]
    words = {}
    for row in rows:
        matched = sets.Set()
        # Pairwise comparison, O(n*n) in the number of hits per row; the
        # set takes care of duplicates.
        for left in left_hits[row]:
            for right in right_hits[row]:
                if right != left + 1:
                    continue
                matched.add(left)
                matched.add(right)
        if matched:
            ordered = list(matched)
            ordered.sort()
            words[row] = Numeric.array(ordered, Numeric.Int)
    # Drop the rows that ended up without any hit.
    rows = Numeric.array([r for r in rows if r in words], Numeric.Int)
    return rows, words
def get_inverted(self):
    """Build a restricted copy of the source column's inverted index.

    Returns a new dict with one entry per key of
    ``self._src_col.inverted``; each value is the ``soomfunc.intersect``
    of the source entry with ``self._all_record_ids``.
    """
    return dict([
        (value, soomfunc.intersect(self._src_col.inverted[value],
                                   self._all_record_ids))
        for value in self._src_col.inverted.keys()
    ])
print " >%8d: %d" % (pow(10, v) - 1, c) if 1: calls = 0 result = [] vl = stats('vector length distribution') cvl = stats('cache vector length distribution') for veckey in combinations(*values): if len(veckey) > 0: values, rows = zip(*veckey) if len(rows) == 1: rows = rows[0] else: calls += 1 rows = intersect(*rows) vl.add(len(rows)) if len(values) > 1 and len(values) < len(cols): cvl.add(len(rows)) # result.append((values, rows)) elapsed = time() - st print '%d intersect calls, %.3f sec, %.3f sec per call' % ( calls, elapsed, elapsed / calls) vl.report() cvl.report() if 0: calls = 0 cache = {} for veckey in combinations(*values): if len(veckey) > 0:
print " >%8d: %d" % (pow(10, v) - 1, c) if 1: calls = 0 result = [] vl = stats('vector length distribution') cvl = stats('cache vector length distribution') for veckey in combinations(*values): if len(veckey) > 0: values, rows = zip(*veckey) if len(rows) == 1: rows = rows[0] else: calls += 1 rows = intersect(*rows) vl.add(len(rows)) if len(values) > 1 and len(values) < len(cols): cvl.add(len(rows)) # result.append((values, rows)) elapsed = time() - st print '%d intersect calls, %.3f sec, %.3f sec per call' % (calls, elapsed, elapsed / calls) vl.report() cvl.report() if 0: calls = 0 cache = {} for veckey in combinations(*values): if len(veckey) > 0:
def isect(a, b):
    """Return ``soomfunc.intersect(vector1_val1, vector2_val2)``.

    NOTE(review): the parameters ``a`` and ``b`` are never used; the body
    reads ``vector1_val1`` and ``vector2_val2``, which are not defined in
    this function and must come from an enclosing or module scope.
    Confirm whether ``soomfunc.intersect(a, b)`` was intended before
    relying on the arguments.
    """
    # redundant, I think.
    return soomfunc.intersect(vector1_val1, vector2_val2)