class Regroup(Transform):
    """Take the first recordset's grouping and enforce it on another's.

    NOTE: This assumes this can work! No validation (yet) is done.
          Rather, this is a way to flag that two recordsets are aligned.
    """
    ScanClass = (ReplayingGroupScanner, ReplayingRecordScanner)

    def __init__(self, source, target, *args, **kwargs):
        # Initialize mixins
        super(Regroup, self).__init__(*args, **kwargs)

        self.sources = (source, target)
        self._resultset = RecordSet(recordType=target._RecordType)
        self._generateScanners()

    def _generateScanners(self):
        #source, target = self.sources
        #self.scanners = (GroupScanner(source), RecordScanner(target))
        self.scanners = tuple(
            sc(s) for sc, s in zip(self.ScanClass, self.sources))

    def transform(self):
        source, target = self.scanners
        for group in source:
            newGroup = tuple(record for _, record in zip(group, target))
            if len(newGroup) == len(group):
                self._resultset.extend((newGroup, ))
                source.anchor()
                target.anchor()
            else:
                break

def __init__(self, source, target, *args, **kwargs):
    # Initialize mixins
    super(Regroup, self).__init__(*args, **kwargs)

    self.sources = (source, target)
    self._resultset = RecordSet(recordType=target._RecordType)
    self._generateScanners()

def test_basic(self):
    function = lambda a, b: sum(a) - sum(b)
    srs = RecordSet(simpleRecordSet)
    c = Aggregate([srs], function, 'c')
    # Calculations are lazily evaluated
    self.assertEqual(c._resultset._groups, [])
    # When evaluated, we get the following
    self.assertEqual([[v.c for v in group] for group in c.results.groups],
                     [[41]])
    srs.extend(simpleAddition)
    # adding data from a source doesn't immediately update
    self.assertEqual(len(c._resultset._groups), 1)
    # but upon evaluation we see an update has been applied
    # note that results are always one group
    self.assertEqual([[v.c for v in group] for group in c.results.groups],
                     [[119]])
    # Demonstrate slicing for columns works as expected
    self.assertEqual([tuple(group) for group in c.results['c', :]],
                     [(119, )])

def __init__(self, source, *args, **kwargs):
    # Initialize mixins
    super(Pivot, self).__init__(*args, **kwargs)

    self.sources = (source, )
    self._resultset = RecordSet(recordType=source._RecordType)
    self.scanners = (self.ScanClass(self.sources[0]), )

def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    scanner = GroupScanner(srs, 'a')
    # Scanners are like generators...
    self.assertEqual([[record._tuple for record in group] for group in scanner],
                     [[(1, 0), (2, 1), (3, 0), (4, 1)],
                      [(5, 0), (6, 1)],
                      [(7, 0), (8, 1), (9, 0)]])
    # ... and will exhaust when fully consumed
    self.assertEqual([group for group in scanner], [])
    srs.extend(simpleAddition)
    # adding data means the scanner consumes the new data
    self.assertEqual([[record._tuple for record in group] for group in scanner],
                     [[(11, 1), (12, 0), (13, 1)],
                      [(14, 0), (15, 1), (16, 0)]])
    # resetting the scanner means it will replay the whole dataset
    scanner.reset()
    self.assertEqual([len(group) for group in scanner], [4, 2, 3, 3, 3])

def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    scanner = ReplayingElementScanner(srs, 'a')
    # Replaying scanners are like generators...
    self.assertEqual([v for v in scanner], [1, 2, 3, 4, 5, 6, 7, 8, 9])
    # ... but will NOT exhaust when fully consumed
    self.assertEqual([v for v in scanner], [1, 2, 3, 4, 5, 6, 7, 8, 9])
    # partial iteration...
    self.assertEqual([v for v in sentinel(scanner, 4)], [1, 2, 3])
    # ... and an anchor...
    scanner.anchor()
    # ... resumes iteration from the anchor
    self.assertEqual([v for v in scanner], [5, 6, 7, 8, 9])
    # Anchoring after iteration stops emitting
    scanner.anchor()
    self.assertEqual([v for v in scanner], [])
    # But if the source adds more data...
    srs.extend(simpleAddition)
    # ... means the scanner consumes the new data
    self.assertEqual([v for v in scanner], [11, 12, 13, 14, 15, 16])
    # resetting the scanner means it will replay the whole dataset
    scanner.reset()
    self.assertEqual([v for v in scanner],
                     [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16])

def _resolveSources(self):
    """Sources may overlap: if so, only take the latter."""
    rawSources = [
        source.results if isinstance(source, Composable) else source
        for source in self.sources
    ]

    allFields = []
    # Gather all the fields
    for source in rawSources:
        for field in source._RecordType._fields:
            allFields.append(field)

    scanners = []
    sourceFields = set(allFields)
    for source in reversed(rawSources):
        for field in source._RecordType._fields:
            if field in sourceFields:
                sourceFields.remove(field)
                scanners.append((field, self.ScanClass(source, field)))
            if not sourceFields:
                break
        if not sourceFields:
            break

    # While we want to prioritize later sources, the fields should
    # likely keep the same order, starting with the earlier sources.
    # see https://stackoverflow.com/a/12814719/1943640
    scanners.sort(key=lambda entry: allFields.index(entry[0]))

    self.scanners = tuple(scanner for field, scanner in scanners)
    self._resultset = RecordSet(recordType=genRecordType(
        field for field, scanner in scanners))

def test_basic(self):
    function = lambda a, b: a + b
    srs = RecordSet(simpleRecordSet)
    c = Sweep([srs], function, 'c')
    # Calculations are lazily evaluated
    self.assertEqual(c._resultset._groups, [])
    # When evaluated, we get the following
    self.assertEqual([[v.c for v in group] for group in c.results.groups],
                     [[1, 3, 3, 5, 5, 7, 7, 9, 9]])
    srs.extend(simpleAddition)
    # adding data from a source doesn't immediately update
    self.assertEqual(len(c._resultset._groups), 1)
    # but upon evaluation we see an update has been applied
    # note that this is ONE update - groups are not maintained
    self.assertEqual([[v.c for v in group] for group in c.results.groups],
                     [[1, 3, 3, 5, 5, 7, 7, 9, 9], [12, 12, 14, 14, 16, 16]])
    # Demonstrate slicing for columns works as expected
    self.assertEqual([tuple(group) for group in c.results['c', :]],
                     [(1, 3, 3, 5, 5, 7, 7, 9, 9), (12, 12, 14, 14, 16, 16)])

def test_basic(self):
    function = lambda a, b: sum(a) - sum(b)
    srs = RecordSet(simpleRecordSet)
    c = Window([srs], function, 'c')
    # Calculations are lazily evaluated
    self.assertEqual(c._resultset._groups, [])
    # When evaluated, we get the following
    self.assertEqual([[v.c for v in group] for group in c.results.groups],
                     [[8, 10, 23]])
    srs.extend(simpleAddition)
    # adding data from a source doesn't immediately update
    self.assertEqual(len(c._resultset._groups), 1)
    # but upon evaluation we see an update has been applied
    # note that this addition is ONE update - groups are not maintained
    self.assertEqual([[v.c for v in group] for group in c.results.groups],
                     [[8, 10, 23], [34, 44]])
    # Demonstrate slicing for columns works as expected
    self.assertEqual([tuple(group) for group in c.results['c', :]],
                     [(8, 10, 23), (34, 44)])

class Feed(Transform):
    __slots__ = ('_key_fields', '_source_keys')
    ScanClass = RecordScanner

    def __init__(self, sources, renamed_fields, key_fields=tuple(),
                 *args, **kwargs):
        super(Feed, self).__init__(*args, **kwargs)

        self._resultset = RecordSet(recordType=tuple(renamed_fields) +
                                    tuple(key_fields))
        self.sources = tuple()
        self.scanners = tuple()
        self._key_fields = key_fields
        self._source_keys = tuple()

        if sources:
            for source in sources:
                self.add_source(source)

    def add_source(self, source, key=None):
        if isinstance(source, Composable):
            source = source.results
        self._add_source(source)
        self.scanners += (self.ScanClass(source), )
        if not key:
            # No explicit key given: read the key fields off each record.
            # Bind the field name as a default argument so each lambda keeps
            # its own field instead of the loop's final value.
            self._source_keys += (tuple(lambda record, k=field: record[k]
                                        for field in self._key_fields), )
        else:
            # An explicit key was given: emit its values as constants.
            self._source_keys += (tuple(lambda record, k=key_value: k
                                        for key_value in key), )

    def del_source(self, source):
        if isinstance(source, Composable):
            source = source.results
        self._del_source(source)

        ix_to_remove = set()
        for ix, (scanner, key_functions) in enumerate(
                zip(self.scanners, self._source_keys)):
            if scanner.source is source:
                ix_to_remove.add(ix)

        self.scanners = tuple(s for ix, s in enumerate(self.scanners)
                              if ix not in ix_to_remove)
        self._source_keys = tuple(k for ix, k in enumerate(self._source_keys)
                                  if ix not in ix_to_remove)

    def transform(self):
        self._resultset.extend(
            # use a generator to avoid adding empty entries
            (tuple(record) + tuple(getter(record) for getter in source_keys)
             for record in scanner)
            for scanner, source_keys in zip(self.scanners, self._source_keys))

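# A minimal usage sketch for Feed (not part of the original sources or tests):
# two recordsets with the same columns are fed into one stream, and each
# record is tagged with a constant per-source key. The expected output in the
# comments is an assumption read off add_source()/transform() above, not a
# verified result.
from ligature.recordset import RecordSet

north = RecordSet(recordType='ab')
north.append([(1, 0), (2, 1)])
south = RecordSet(recordType='ab')
south.append([(3, 0), (4, 1)])

feed = Feed([], ('a', 'b'), key_fields=('site', ))
feed.add_source(north, key=('north', ))
feed.add_source(south, key=('south', ))
# Expected records (assumption): (1, 0, 'north'), (2, 1, 'north'),
# (3, 0, 'south'), (4, 1, 'south'), emitted as one group per source scanner.
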
class LagBucket(Transform):
    """Creates a new recordset of tuples of each of the given records,
    with the first being from `lag` records back.
    """
    __slots__ = ('_lag', '_lagRecords')
    ScanClass = RecordScanner

    def __init__(self, source, lag=1, *args, **kwargs):
        # Initialize mixins
        super(LagBucket, self).__init__(*args, **kwargs)

        self._lag = lag
        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(source), )
        self._lagRecords = []

    def transform(self):
        # Prime the lag buffer before emitting any pairs
        while len(self._lagRecords) < self._lag:
            self._lagRecords.append(next(self.scanners[0]))
        for record in self.scanners[0]:
            prev = self._lagRecords.pop(0)
            self._resultset.append(
                # pair the lagged value with the current value, column by column
                tuple((last, this)
                      for last, this in zip(prev._tuple, record._tuple)))
            self._lagRecords.append(record)

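# A minimal usage sketch for LagBucket (no test for it appears in this
# section): each emitted group pairs the value from `lag` records back with
# the current value, column by column. The expected output in the comments is
# an assumption read off transform() above, not a verified result.
from ligature.recordset import RecordSet

srs = RecordSet(recordType='ab')
srs.append([(1, 0), (2, 1), (3, 0)])

lagged = LagBucket(srs, lag=1)
# Expected groups (assumption), one per record after the lag buffer fills:
#   ((1, 2), (0, 1))  # record 1 paired with record 2, per column
#   ((2, 3), (1, 0))  # record 2 paired with record 3, per column
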
def __init__(self, source, key_field, *args, **kwargs):
    # Initialize mixins
    super(Cluster, self).__init__(*args, **kwargs)

    self._key_field = key_field
    self.sources = (source, )
    self._resultset = RecordSet(recordType=source._RecordType)
    self.scanners = (self.ScanClass(source), )

def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    scanner = ReplayingGroupScanner(srs)
    # Replaying scanners are like generators...
    self.assertEqual([[r._tuple for r in g] for g in scanner],
                     [[(1, 0), (2, 1), (3, 0), (4, 1)],
                      [(5, 0), (6, 1)],
                      [(7, 0), (8, 1), (9, 0)]])
    # ... but will NOT exhaust when fully consumed
    self.assertEqual([[r._tuple for r in g] for g in scanner],
                     [[(1, 0), (2, 1), (3, 0), (4, 1)],
                      [(5, 0), (6, 1)],
                      [(7, 0), (8, 1), (9, 0)]])
    # Anchor after next(iter)
    self.assertEqual([r._tuple for r in next(scanner)],
                     [(1, 0), (2, 1), (3, 0), (4, 1)])
    scanner.anchor()
    self.assertEqual([[r._tuple for r in g] for g in scanner],
                     [[(5, 0), (6, 1)],
                      [(7, 0), (8, 1), (9, 0)]])
    scanner.reset()
    # partial iteration with an anchor
    for i, r in enumerate(scanner):
        if i < 2:
            scanner.anchor()
            break
    self.assertEqual([[r._tuple for r in g] for g in scanner],
                     [[(5, 0), (6, 1)],
                      [(7, 0), (8, 1), (9, 0)]])
    # Anchoring after iteration stops emitting
    scanner.anchor()
    self.assertEqual([v for v in scanner], [])
    # But if the source adds more data...
    srs.extend(simpleAddition)
    # ... means the scanner consumes the new data
    self.assertEqual([[r._tuple for r in g] for g in scanner],
                     [[(11, 1), (12, 0), (13, 1)],
                      [(14, 0), (15, 1), (16, 0)]])
    # resetting the scanner means it will replay the whole dataset
    scanner.reset()
    self.assertEqual([[r._tuple for r in g] for g in scanner],
                     [[(1, 0), (2, 1), (3, 0), (4, 1)],
                      [(5, 0), (6, 1)],
                      [(7, 0), (8, 1), (9, 0)],
                      [(11, 1), (12, 0), (13, 1)],
                      [(14, 0), (15, 1), (16, 0)]])

def __init__(self, source, lag=1, *args, **kwargs):
    # Initialize mixins
    super(LagBucket, self).__init__(*args, **kwargs)

    self._lag = lag
    self.sources = (source, )
    self._resultset = RecordSet(recordType=source._RecordType)
    self.scanners = (self.ScanClass(source), )
    self._lagRecords = []

class Merge(Transform):
    """Combine the source recordsets into one recordset.

    The new record type will have all the source columns, with the caveat
    that later sources win for overlaps.
    """
    ScanClass = ElementScanner

    def __init__(self, sources, *args, **kwargs):
        # Initialize mixins
        super(Merge, self).__init__(*args, **kwargs)

        self.sources = tuple(sources)
        self._resolveSources()

    def _resolveSources(self):
        """Sources may overlap: if so, only take the latter."""
        rawSources = [
            source.results if isinstance(source, Composable) else source
            for source in self.sources
        ]

        allFields = []
        # Gather all the fields
        for source in rawSources:
            for field in source._RecordType._fields:
                allFields.append(field)

        scanners = []
        sourceFields = set(allFields)
        for source in reversed(rawSources):
            for field in source._RecordType._fields:
                if field in sourceFields:
                    sourceFields.remove(field)
                    scanners.append((field, self.ScanClass(source, field)))
                if not sourceFields:
                    break
            if not sourceFields:
                break

        # While we want to prioritize later sources, the fields should
        # likely keep the same order, starting with the earlier sources.
        # see https://stackoverflow.com/a/12814719/1943640
        scanners.sort(key=lambda entry: allFields.index(entry[0]))

        self.scanners = tuple(scanner for field, scanner in scanners)
        self._resultset = RecordSet(recordType=genRecordType(
            field for field, scanner in scanners))

    def transform(self):
        """Simply scan down the sources, generating new records."""
        self._resultset.append(
            tuple(
                self._resultset.coerceRecordType(newRecordValues)
                for newRecordValues in zip(*self.scanners)))

def _resolveSources(self):
    rawSources = [
        source.results if isinstance(source, Composable) else source
        for source in self.sources
    ]

    scanner_coverage = {}
    scanners = []
    preset_targets = set(self._target_fields or [])
    covered_fields = set()

    # Gather all the fields
    for source in rawSources:
        covered_fields = set()
        for field in source._RecordType._fields:
            if field in (self._key_field, self._collation_field, ):
                continue
            if not preset_targets or field in preset_targets:
                covered_fields.add(field)
        # skip nops
        if not covered_fields:
            continue
        scanner = RecordScanner(source)
        scanner_coverage[scanner] = covered_fields
        scanners.append(scanner)

    all_covered_fields = set()
    for covered_fields in scanner_coverage.values():
        all_covered_fields.update(covered_fields)

    if preset_targets:
        assert preset_targets == all_covered_fields, (
            'Sources do not cover the target fields: given: %r -- covered: %r'
            % (preset_targets, all_covered_fields))
        target_fields = tuple(self._target_fields)
    else:
        target_fields = tuple(field for field in all_covered_fields)

    self._target_fields = target_fields
    self._scanner_coverage = scanner_coverage
    self.scanners = tuple(scanners)
    self._resultset = RecordSet(
        recordType=((self._key_field, ) + self._target_fields +
                    ((self._collation_field, )
                     if self._collation_field else tuple())))

def test_basic(self):
    rsa = [(1, 2, 3, 4), (5, 6), (7, 8, 9)]
    rsb = [(0, 1, 0, 1), (0, 1), (0, 1, 0)]
    rsc = [(1, 2, 3), (4, ), (5, 6, 7, 8, 9)]
    rsd = [(0, 1, 0), (1, ), (0, 1, 0, 1, 0)]

    # source
    rss = RecordSet(recordType='ab')
    for g in zip(rsa, rsb):
        rss.append(v for v in zip(*g))

    # target
    rst = RecordSet(recordType='ef')
    for g in zip(rsc, rsd):
        rst.append(v for v in zip(*g))

    regroup = Regroup(rss, rst)

    # verify it has the same columns as the target
    self.assertEqual(regroup.results._RecordType._fields, ('e', 'f'))

    self.assertEqual(
        [[record._tuple for record in group] for group in regroup],
        [[(1, 0), (2, 1), (3, 0), (4, 1)],
         [(5, 0), (6, 1)],
         [(7, 0), (8, 1), (9, 0)]])

def test_basic(self):
    rsa = [(1, 2, 3, 4), (5, 6), (7, 8, 9)]
    rsb = [(0, 1, 0, 1), (0, 1), (0, 1, 0)]
    rsc = [(9, 8, 7, 6, 5), (4, 3, 2), (1, )]
    rsd = [(1, 0, 1, 0, 1), (0, 1, 0), (1, )]

    rs1 = RecordSet(recordType='ab')
    for g in zip(rsa, rsb):
        rs1.append(v for v in zip(*g))

    rs2 = RecordSet(recordType='cb')
    for g in zip(rsc, rsd):
        rs2.append(v for v in zip(*g))

    merge = Merge([rs1, rs2])

    self.assertEqual(merge.results._RecordType._fields, ('a', 'b', 'c'))
    self.assertEqual(merge.results._groups[0][0]._tuple, (1, 1, 9))
    self.assertEqual([[record._tuple for record in group] for group in merge],
                     [[(1, 1, 9), (2, 0, 8), (3, 1, 7), (4, 0, 6), (5, 1, 5),
                       (6, 0, 4), (7, 1, 3), (8, 0, 2), (9, 1, 1)]])

class Cluster(Transform):
    """Group records by a key_field value.

    Useful for bunching data together for aggregation.
    """
    __slots__ = ('_key_field', )
    ScanClass = RecordScanner

    def __init__(self, source, key_field, *args, **kwargs):
        # Initialize mixins
        super(Cluster, self).__init__(*args, **kwargs)

        self._key_field = key_field
        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(source), )

    def transform(self):
        last_key_value = None
        groups = []
        group = []

        if self._resultset._groups:
            last_key_value = self._resultset._groups[-1][-1][self._key_field]

        # loop one: fill for continuity
        for entry in self.scanners[0]:
            if entry[self._key_field] == last_key_value:
                self._resultset._groups[-1] += (entry, )
            else:
                group = [entry]
                last_key_value = entry[self._key_field]
                break

        for entry in self.scanners[0]:
            if entry[self._key_field] == last_key_value:
                group.append(entry)
            else:
                groups.append(group)
                group = [entry]
                last_key_value = entry[self._key_field]
        else:
            groups.append(group)

        self._resultset.extend(groups)

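# A minimal usage sketch for Cluster (no test for it appears in this section):
# records are regrouped every time the key field's value changes. The expected
# grouping in the comments is an assumption read off transform() above, not a
# verified result.
from ligature.recordset import RecordSet

srs = RecordSet(recordType='ab')
srs.append([(1, 0), (2, 0), (3, 1), (4, 1), (5, 0)])

clustered = Cluster(srs, 'b')
# Expected groups (assumption):
#   [(1, 0), (2, 0)]  # b == 0
#   [(3, 1), (4, 1)]  # b == 1
#   [(5, 0)]          # b == 0 again, so a new group starts
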
def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    scanner = ReplayingRecordScanner(srs)
    # Replaying scanners are like generators...
    self.assertEqual([r._tuple for r in scanner],
                     [(1, 0), (2, 1), (3, 0), (4, 1), (5, 0), (6, 1), (7, 0),
                      (8, 1), (9, 0)])
    # ... but will NOT exhaust when fully consumed
    self.assertEqual([r._tuple for r in scanner],
                     [(1, 0), (2, 1), (3, 0), (4, 1), (5, 0), (6, 1), (7, 0),
                      (8, 1), (9, 0)])
    # partial iteration...
    for i, r in enumerate(scanner):
        if i >= 3:
            scanner.anchor()  # anchor in the iteration
            break
    # ... and an anchor...
    scanner.anchor()
    # ... resumes iteration from the anchor
    self.assertEqual([r._tuple for r in scanner],
                     [(5, 0), (6, 1), (7, 0), (8, 1), (9, 0)])
    # Anchoring after iteration stops emitting
    scanner.anchor()
    self.assertEqual([r._tuple for r in scanner], [])
    # But if the source adds more data...
    srs.extend(simpleAddition)
    # ... means the scanner consumes the new data
    self.assertEqual([r._tuple for r in scanner],
                     [(11, 1), (12, 0), (13, 1), (14, 0), (15, 1), (16, 0)])
    # resetting the scanner means it will replay the whole dataset
    scanner.reset()
    self.assertEqual([r._tuple for r in scanner],
                     [(1, 0), (2, 1), (3, 0), (4, 1), (5, 0), (6, 1), (7, 0),
                      (8, 1), (9, 0), (11, 1), (12, 0), (13, 1), (14, 0),
                      (15, 1), (16, 0)])

def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    scanner = ReplayingChunkScanner(srs, 'a')
    # Replaying scanners are like generators...
    self.assertEqual([v for v in scanner], [(1, 2, 3, 4), (5, 6), (7, 8, 9)])
    # ... but will NOT exhaust when fully consumed
    self.assertEqual([v for v in scanner], [(1, 2, 3, 4), (5, 6), (7, 8, 9)])
    # Anchor after next(iter)
    self.assertEqual(next(scanner), (1, 2, 3, 4))
    scanner.anchor()
    self.assertEqual([v for v in scanner], [(5, 6), (7, 8, 9)])
    scanner.reset()
    # partial iteration with an anchor
    for i, v in enumerate(scanner):
        if i < 2:
            scanner.anchor()
            break
    self.assertEqual([v for v in scanner], [(5, 6), (7, 8, 9)])
    # Anchoring after iteration stops emitting
    scanner.anchor()
    self.assertEqual([v for v in scanner], [])
    # But if the source adds more data...
    srs.extend(simpleAddition)
    # ... means the scanner consumes the new data
    self.assertEqual([v for v in scanner], [(11, 12, 13), (14, 15, 16)])
    # resetting the scanner means it will replay the whole dataset
    scanner.reset()
    self.assertEqual([v for v in scanner],
                     [(1, 2, 3, 4), (5, 6), (7, 8, 9), (11, 12, 13),
                      (14, 15, 16)])

def test_misalignment_2(self):
    # source
    rsa = [(1, 2, 3, 4)]
    rsb = [(0, 1, 0, 1)]
    rss = RecordSet(recordType='ab')
    for g in zip(rsa, rsb):
        rss.append(v for v in zip(*g))

    # target - longer
    rsa = [(1, 2, 3), (4, ), (5, 6, 7), (8, 9, 10)]
    rsb = [(0, 1, 0), (1, ), (0, 1, 0), (1, 0, 10)]
    rst = RecordSet(recordType='ef')
    for g in zip(rsa, rsb):
        rst.append(v for v in zip(*g))

    regroup = Regroup(rss, rst)

    # verify it has the same columns as the target
    self.assertEqual(regroup.results._RecordType._fields, ('e', 'f'))

    # Source only has one group, so that alone gets mapped
    self.assertEqual(
        [[record._tuple for record in group] for group in regroup],
        [[(1, 0), (2, 1), (3, 0), (4, 1)]])

    # adding two more groups...
    rss.extend([
        ((5, 0), (6, 1)),
        ((7, 0), (8, 1), (9, 0)),
    ])
    # ... allows two more groups to be added.
    # Again, note that the last target record is omitted, though,
    # since the source doesn't have a group to map to it
    self.assertEqual(
        [[record._tuple for record in group] for group in regroup],
        [[(1, 0), (2, 1), (3, 0), (4, 1)],
         [(5, 0), (6, 1)],
         [(7, 0), (8, 1), (9, 0)]])

def test_misalignment_1(self):
    # source
    rsa = [(1, 2, 3, 4), (5, 6), (7, 8, 9)]
    rsb = [(0, 1, 0, 1), (0, 1), (0, 1, 0)]
    rss = RecordSet(recordType='ab')
    for g in zip(rsa, rsb):
        rss.append(v for v in zip(*g))

    # target - shorter
    rsa = [(1, 2, 3), (4, ), (5, 6, 7)]
    rsb = [(0, 1, 0), (1, ), (0, 1, 0)]
    rst = RecordSet(recordType='ef')
    for g in zip(rsa, rsb):
        rst.append(v for v in zip(*g))

    regroup = Regroup(rss, rst)

    # verify it has the same columns as the target
    self.assertEqual(regroup.results._RecordType._fields, ('e', 'f'))

    # Up to 7 records can be grouped. The final source group
    # must be incomplete, and is omitted
    self.assertEqual(
        [[record._tuple for record in group] for group in regroup],
        [[(1, 0), (2, 1), (3, 0), (4, 1)],
         [(5, 0), (6, 1)]])

    # adding two more records means the target has enough to complete
    # the final source group; any extra target records beyond the source's
    # grouping would still be omitted, since the source has no group to map
    # them to
    rst.append([(8, 1), (9, 0)])
    self.assertEqual(
        [[record._tuple for record in group] for group in regroup],
        [[(1, 0), (2, 1), (3, 0), (4, 1)],
         [(5, 0), (6, 1)],
         [(7, 0), (8, 1), (9, 0)]])

def __init__(self, sources, renamed_fields, key_fields=tuple(),
             *args, **kwargs):
    super(Feed, self).__init__(*args, **kwargs)

    self._resultset = RecordSet(recordType=tuple(renamed_fields) +
                                tuple(key_fields))
    self.sources = tuple()
    self.scanners = tuple()
    self._key_fields = key_fields
    self._source_keys = tuple()

    if sources:
        for source in sources:
            self.add_source(source)

class Pivot(Transform):
    """Rotate groups of records into a record of lists.

    [({a:4,b:3},{a:6,b:5},{a:8,b:7}),({a:10,b:9},{a:12,b:11})]
    becomes
    [({a:(4,6,8),b:(3,5,7)}),({a:(10,12),b:(9,11)})]
    """
    ScanClass = GroupScanner

    def __init__(self, source, *args, **kwargs):
        # Initialize mixins
        super(Pivot, self).__init__(*args, **kwargs)

        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(self.sources[0]), )

    def transform(self):
        for group in self.scanners[0]:
            self._resultset.append(
                # cast to the record early so the tuples are not misunderstood
                self._resultset.coerceRecordType(tuple(zip(*group))))

def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    pivot = Pivot(srs)
    # Pivot converts a group of records into one record per group
    self.assertEqual([[record._tuple for record in group] for group in pivot],
                     [[((1, 2, 3, 4), (0, 1, 0, 1))],
                      [((5, 6), (0, 1))],
                      [((7, 8, 9), (0, 1, 0))]])
    srs.extend(simpleAddition)
    # adding data means the transform consumes the new data when checked
    self.assertEqual([[record._tuple for record in group] for group in pivot],
                     [[((1, 2, 3, 4), (0, 1, 0, 1))],
                      [((5, 6), (0, 1))],
                      [((7, 8, 9), (0, 1, 0))],
                      [((11, 12, 13), (1, 0, 1))],
                      [((14, 15, 16), (0, 1, 0))]])

def test_basic(self):
    srs = RecordSet(simpleRecordSet)
    scanner = ElementScanner(srs, 'a')
    # Scanners are like generators...
    self.assertEqual([v for v in scanner], [1, 2, 3, 4, 5, 6, 7, 8, 9])
    # ... and will exhaust when fully consumed
    self.assertEqual([v for v in scanner], [])
    srs.extend(simpleAddition)
    # adding data means the scanner consumes the new data
    self.assertEqual([v for v in scanner], [11, 12, 13, 14, 15, 16])
    # resetting the scanner means it will replay the whole dataset
    scanner.reset()
    self.assertEqual([v for v in scanner],
                     [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16])

def __init__(self, sources, function, outputLabels, mapInputs={},
             *args, **kwargs):
    # Initialize mixins
    super(Calculation, self).__init__(*args, **kwargs)

    self._resultset = RecordSet(recordType=genRecordType(outputLabels))
    self.subscribe(self._resultset)
    self.sources = tuple(sources)

    if isinstance(function, (str, unicode)):
        self.function = Expression(function)
    else:
        self.function = function

    self._mapInputs = mapInputs
    self._resolveSources()

from ligature.recordset import RecordSet


def genData(columns, rows, start=0):
    if not isinstance(columns, int):
        columns = len(columns)
    if columns == 1:
        return (i for i in range(start, start + rows * columns, columns))
    else:
        return (tuple(range(i, i + columns))
                for i in range(start, start + rows * columns, columns))


a1 = [(1, 2, 3, 4), (5, 6), (7, 8, 9)]
b1 = [(0, 1, 0, 1), (0, 1), (0, 1, 0)]

a2 = [(11, 12, 13), (14, 15, 16)]
b2 = [(1, 0, 1), (0, 1, 0)]

simpleRecordSet = RecordSet(recordType='ab')
for g in zip(a1, b1):
    simpleRecordSet.append(v for v in zip(*g))

simpleAddition = RecordSet(recordType='ab')
for g in zip(a2, b2):
    simpleAddition.append(v for v in zip(*g))

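# A small illustration of genData above (not part of the original fixture
# module): with one column it yields scalars, otherwise tuples of consecutive
# integers starting at `start`.
assert list(genData(1, 3)) == [0, 1, 2]
assert list(genData(2, 3)) == [(0, 1), (2, 3), (4, 5)]
assert list(genData('ab', 2, start=10)) == [(10, 11), (12, 13)]
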
class Collation(Transform):
    """Combine the source recordsets into one recordset.

    The new record type will have all the target fields along with the key
    field (and, optionally, the collation field).
    """
    __slots__ = (
        '_key_field',
        '_collation_field',
        '_target_fields',
        '_scanner_coverage',
    )
    ScanClass = RecordScanner

    def __init__(self, sources, key_field, target_fields=None,
                 collation_field=None, *args, **kwargs):
        super(Collation, self).__init__(*args, **kwargs)

        self.sources = tuple(sources)
        self._key_field = key_field
        self._collation_field = collation_field
        self._target_fields = target_fields
        self._resolveSources()

    def _resolveSources(self):
        rawSources = [
            source.results if isinstance(source, Composable) else source
            for source in self.sources
        ]

        scanner_coverage = {}
        scanners = []
        preset_targets = set(self._target_fields or [])
        covered_fields = set()

        # Gather all the fields
        for source in rawSources:
            covered_fields = set()
            for field in source._RecordType._fields:
                if field in (self._key_field, self._collation_field, ):
                    continue
                if not preset_targets or field in preset_targets:
                    covered_fields.add(field)
            # skip nops
            if not covered_fields:
                continue
            scanner = RecordScanner(source)
            scanner_coverage[scanner] = covered_fields
            scanners.append(scanner)

        all_covered_fields = set()
        for covered_fields in scanner_coverage.values():
            all_covered_fields.update(covered_fields)

        if preset_targets:
            assert preset_targets == all_covered_fields, (
                'Sources do not cover the target fields: given: %r -- covered: %r'
                % (preset_targets, all_covered_fields))
            target_fields = tuple(self._target_fields)
        else:
            target_fields = tuple(field for field in all_covered_fields)

        self._target_fields = target_fields
        self._scanner_coverage = scanner_coverage
        self.scanners = tuple(scanners)
        self._resultset = RecordSet(
            recordType=((self._key_field, ) + self._target_fields +
                        ((self._collation_field, )
                         if self._collation_field else tuple())))

    def transform(self):
        scanners = set(self.scanners)

        def get_next(scanner, remaining=scanners):
            try:
                entry = next(scanner)
                return entry
            except StopIteration:
                remaining.remove(scanner)
                return None

        # initial conditions
        if self._resultset:
            cursor_values = dict(
                (field, value)
                for field, value in zip(self._resultset._RecordType._fields,
                                        self._resultset._groups[-1][-1]))
        else:
            # initialize to None to ensure _some_ value for all
            # non-key/group fields
            cursor_values = dict(
                (field, None)
                for field in self._resultset._RecordType._fields)

        cursor_value_heap = []
        for scanner in frozenset(scanners):
            entry = get_next(scanner)
            if entry is not None:
                # include scanner for replacement lookup later
                cursor_value_heap.append(
                    (entry[self._key_field], entry, scanner))
        heapify(cursor_value_heap)

        # generate results
        merged = []
        while cursor_value_heap:
            key_value, entry, scanner = heappop(cursor_value_heap)

            for field in self._scanner_coverage[scanner]:
                cursor_values[field] = entry[field]
            cursor_values[self._key_field] = entry[self._key_field]

            if self._collation_field:
                group_value = entry[self._collation_field]
                # when grouping for merge, assume the group's final value is
                # the most recent by key sort value
                if (cursor_values[self._collation_field] == group_value
                        and group_value is not None):
                    merged[-1] = self._resultset._RecordType(cursor_values)
                else:
                    cursor_values[self._collation_field] = group_value
                    merged.append(self._resultset._RecordType(cursor_values))
            else:
                merged.append(self._resultset._RecordType(cursor_values))

            entry = get_next(scanner)
            if entry is not None:
                heappush(cursor_value_heap,
                         (entry[self._key_field], entry, scanner))

        if merged:
            self._resultset.extend([[v for v in merged]])

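# A minimal usage sketch for Collation (no test for it appears in this
# section): two sources are merged on a shared key field, carrying forward the
# most recent value seen for each target field. The expected records in the
# comments are an assumption read off transform() above, not a verified
# result; note that the order of the target columns follows set iteration, so
# it may vary.
from ligature.recordset import RecordSet

prices = RecordSet(recordType=('t', 'price'))
prices.append([(1, 100), (3, 105)])
volumes = RecordSet(recordType=('t', 'volume'))
volumes.append([(2, 7), (4, 9)])

collated = Collation([prices, volumes], key_field='t')
# Expected records (assumption), ordered by 't' via the heap, with fields
# (t, price, volume):
#   (1, 100, None)  # no volume seen yet
#   (2, 100, 7)     # price carried forward from t == 1
#   (3, 105, 7)     # volume carried forward from t == 2
#   (4, 105, 9)
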