Ejemplo n.º 1
0
class Regroup(Transform):
    """Take the first recordset's grouping and enforce it on another's.

    NOTE: This assumes this can work! No validation (yet) is done.
    Rather, this is a way to flag that two recordsets are aligned.
    """
    ScanClass = (ReplayingGroupScanner, ReplayingRecordScanner)

    def __init__(self, source, target, *args, **kwargs):
        # Initialize mixins first so Transform state is ready
        super(Regroup, self).__init__(*args, **kwargs)

        self.sources = (source, target)
        # Results carry the target's columns, regrouped like the source
        self._resultset = RecordSet(recordType=target._RecordType)
        self._generateScanners()

    def _generateScanners(self):
        # Pair each scanner class with its corresponding source recordset:
        # a group scanner for the grouping source, a record scanner
        # for the target whose records get regrouped.
        self.scanners = tuple(
            scanClass(recordset)
            for scanClass, recordset in zip(self.ScanClass, self.sources))

    def transform(self):
        groupScanner, recordScanner = self.scanners

        for group in groupScanner:
            # Pull exactly one target record per source record in the group
            regrouped = tuple(
                record for _, record in zip(group, recordScanner))
            if len(regrouped) != len(group):
                # Target ran dry mid-group: leave it for a later pass
                break
            self._resultset.extend((regrouped, ))
            groupScanner.anchor()
            recordScanner.anchor()
Ejemplo n.º 2
0
    def __init__(self, source, target, *args, **kwargs):
        """Bind the grouping `source` and the record `target` recordsets."""
        # Initialize mixins
        super(Regroup, self).__init__(*args, **kwargs)

        self.sources = (source, target)
        # Results use the target's record type; only the grouping changes
        self._resultset = RecordSet(recordType=target._RecordType)
        self._generateScanners()
Ejemplo n.º 3
0
    def test_basic(self):
        """Aggregate lazily reduces each source into one value per group."""

        function = lambda a, b: sum(a) - sum(b)

        srs = RecordSet(simpleRecordSet)

        c = Aggregate([srs], function, 'c')

        # Calculations are lazily evaluated
        self.assertEqual(c._resultset._groups, [])

        # When evaluated, we get the following
        self.assertEqual([[v.c for v in group] for group in c.results.groups],
                         [[41]])

        srs.extend(simpleAddition)

        # adding data from a source doesn't immediately update
        self.assertEqual(len(c._resultset._groups), 1)

        # but upon evaluation we see an update has been applied
        # note that results are always one group
        self.assertEqual([[v.c for v in group] for group in c.results.groups],
                         [[119]])

        # Demonstrate slicing for columns works as expected
        self.assertEqual([tuple(group) for group in c.results['c', :]],
                         [(119, )])
Ejemplo n.º 4
0
    def __init__(self, source, *args, **kwargs):
        """Wrap a single source with a group scanner feeding the results."""
        # Initialize mixins
        super(Pivot, self).__init__(*args, **kwargs)

        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(self.sources[0]), )
Ejemplo n.º 5
0
    def test_basic(self):
        """GroupScanner yields the source's groups and exhausts when consumed."""

        srs = RecordSet(simpleRecordSet)

        scanner = GroupScanner(srs, 'a')

        # Scanners are like generators...
        self.assertEqual([[record._tuple for record in group]
                          for group in scanner],
                         [[(1, 0), (2, 1), (3, 0),
                           (4, 1)], [(5, 0), (6, 1)], [(7, 0), (8, 1),
                                                       (9, 0)]])

        # ... and will exhaust when fully consumed
        self.assertEqual([group for group in scanner], [])

        srs.extend(simpleAddition)

        # adding data means the scanner consumes the new data
        self.assertEqual([[record._tuple for record in group]
                          for group in scanner],
                         [[(11, 1), (12, 0),
                           (13, 1)], [(14, 0), (15, 1), (16, 0)]])

        # resetting the scanner means it will replay the whole dataset
        scanner.reset()

        self.assertEqual([len(group) for group in scanner], [4, 2, 3, 3, 3])
Ejemplo n.º 6
0
    def test_basic(self):
        """ReplayingElementScanner replays values until anchored past them."""

        srs = RecordSet(simpleRecordSet)

        scanner = ReplayingElementScanner(srs, 'a')

        # Replaying scanners are like generators...
        self.assertEqual([v for v in scanner], [1, 2, 3, 4, 5, 6, 7, 8, 9])

        # ... but will NOT exhaust when fully consumed
        self.assertEqual([v for v in scanner], [1, 2, 3, 4, 5, 6, 7, 8, 9])

        # partial iteration...
        self.assertEqual([v for v in sentinel(scanner, 4)], [1, 2, 3])
        # ... and an anchor...
        scanner.anchor()
        # ... resumes iteration from the anchor
        self.assertEqual([v for v in scanner], [5, 6, 7, 8, 9])

        # Anchoring after iteration stops emitting
        scanner.anchor()
        self.assertEqual([v for v in scanner], [])

        # But if the source adds more data...
        srs.extend(simpleAddition)

        # ... means the scanner consumes the new data
        self.assertEqual([v for v in scanner], [11, 12, 13, 14, 15, 16])

        # resetting the scanner means it will replay the whole dataset
        scanner.reset()

        self.assertEqual([v for v in scanner],
                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16])
Ejemplo n.º 7
0
    def _resolveSources(self):
        """Sources may overlap: if so, only take the latter."""
        # Composable sources contribute their results recordset
        rawSources = [
            source.results if isinstance(source, Composable) else source
            for source in self.sources
        ]

        allFields = []
        # Gather all the fields
        for source in rawSources:
            for field in source._RecordType._fields:
                allFields.append(field)

        scanners = []
        sourceFields = set(allFields)
        # Walk sources back-to-front so a later source claims any
        # field that several sources share
        for source in reversed(rawSources):
            for field in source._RecordType._fields:
                if field in sourceFields:
                    sourceFields.remove(field)
                    scanners.append((field, self.ScanClass(source, field)))
                if not sourceFields:
                    break
            if not sourceFields:
                break

        # While we want to prioritize later sources, the fields should
        #   likely keep the same order, starting with the earlier sources.
        # see https://stackoverflow.com/a/12814719/1943640
        # (list.index returns the FIRST occurrence, i.e. the earliest source)
        scanners.sort(key=lambda entry: allFields.index(entry[0]))

        self.scanners = tuple(scanner for field, scanner in scanners)
        self._resultset = RecordSet(recordType=genRecordType(
            field for field, scanner in scanners))
Ejemplo n.º 8
0
    def test_basic(self):
        """Sweep applies the function pairwise along adjacent elements."""

        function = lambda a, b: a + b

        srs = RecordSet(simpleRecordSet)

        c = Sweep([srs], function, 'c')

        # Calculations are lazily evaluated
        self.assertEqual(c._resultset._groups, [])

        # When evaluated, we get the following
        self.assertEqual([[v.c for v in group] for group in c.results.groups],
                         [[1, 3, 3, 5, 5, 7, 7, 9, 9]])

        srs.extend(simpleAddition)

        # adding data from a source doesn't immediately update
        self.assertEqual(len(c._resultset._groups), 1)

        # but upon evaluation we see an update has been applied
        # note that this is ONE update - groups are not maintained
        self.assertEqual(
            [[v.c for v in group] for group in c.results.groups],
            [[1, 3, 3, 5, 5, 7, 7, 9, 9], [12, 12, 14, 14, 16, 16]])

        # Demonstrate slicing for columns works as expected
        self.assertEqual([tuple(group) for group in c.results['c', :]],
                         [(1, 3, 3, 5, 5, 7, 7, 9, 9),
                          (12, 12, 14, 14, 16, 16)])
Ejemplo n.º 9
0
    def test_basic(self):
        """Window reduces each source group to one value per group."""

        function = lambda a, b: sum(a) - sum(b)

        srs = RecordSet(simpleRecordSet)

        c = Window([srs], function, 'c')

        # Calculations are lazily evaluated
        self.assertEqual(c._resultset._groups, [])

        # When evaluated, we get the following
        self.assertEqual([[v.c for v in group] for group in c.results.groups],
                         [[8, 10, 23]])

        srs.extend(simpleAddition)

        # adding data from a source doesn't immediately update
        self.assertEqual(len(c._resultset._groups), 1)

        # but upon evaluation we see an update has been applied
        # note that this addition is ONE update - groups are not maintained
        self.assertEqual([[v.c for v in group] for group in c.results.groups],
                         [[8, 10, 23], [34, 44]])

        # Demonstrate slicing for columns works as expected
        self.assertEqual([tuple(group) for group in c.results['c', :]],
                         [(8, 10, 23), (34, 44)])
Ejemplo n.º 10
0
class Feed(Transform):
    """Concatenate records from several sources into one recordset,
    appending per-source key columns to every record.
    """

    __slots__ = ('_key_fields', '_source_keys')

    ScanClass = RecordScanner

    def __init__(self,
                 sources,
                 renamed_fields,
                 key_fields=tuple(),
                 *args,
                 **kwargs):
        """`renamed_fields` are the data columns; `key_fields` are
        appended to the record type and filled per source.
        """
        super(Feed, self).__init__(*args, **kwargs)

        self._resultset = RecordSet(recordType=tuple(renamed_fields) +
                                    tuple(key_fields))

        # sources/scanners start empty; add_source() populates them
        self.sources = tuple()
        self.scanners = tuple()
        self._key_fields = key_fields
        self._source_keys = tuple()

        if sources:
            for source in sources:
                self.add_source(source)

    def add_source(self, source, key=None):
        """Attach a source.

        With no `key`, the key columns are read off each record; with a
        `key` sequence, those values are emitted verbatim for every record.
        """
        if isinstance(source, Composable):
            source = source.results
        self._add_source(source)
        self.scanners += (self.ScanClass(source), )
        if not key:
            # BUGFIX: the getters must read their bound field `k`, not the
            # loop variable `key` -- late binding made every getter read
            # the LAST key field after the generator finished.
            self._source_keys += (tuple(lambda record, k=field: record[k]
                                        for field in self._key_fields), )
        else:
            # Constant key values: bind each at definition time
            self._source_keys += (tuple(lambda record, k=key_value: k
                                        for key_value in key), )

    def del_source(self, source):
        """Detach a source along with its scanner and key getters."""
        if isinstance(source, Composable):
            source = source.results
        self._del_source(source)
        ix_to_remove = set()
        for ix, (scanner, key_functions) in enumerate(
                zip(self.scanners, self._source_keys)):
            if scanner.source is source:
                # BUGFIX: sets have no append(); add() is the correct call
                ix_to_remove.add(ix)

        self.scanners = tuple(s for ix, s in enumerate(self.scanners)
                              if ix not in ix_to_remove)
        self._source_keys = tuple(k for ix, k in enumerate(self._source_keys)
                                  if ix not in ix_to_remove)

    def transform(self):
        """Append any new records from every scanner, key columns attached."""
        self._resultset.extend(
            # use a generator to avoid adding empty entries
            (tuple(record) + tuple(getter(record) for getter in source_keys)
             for record in scanner)
            for scanner, source_keys in zip(self.scanners, self._source_keys))
Ejemplo n.º 11
0
class LagBucket(Transform):
    """Creates a new recordset of tuples of each of the given records,
    with the first being from `lag` records back.
    """
    __slots__ = ('_lag', '_lagRecords')
    ScanClass = RecordScanner

    def __init__(self, source, lag=1, *args, **kwargs):
        # Initialize mixins
        super(LagBucket, self).__init__(*args, **kwargs)

        self._lag = lag  # how many records back the paired value comes from
        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(source), )
        self._lagRecords = []  # sliding buffer of the last `lag` records

    def transform(self):
        """Pair each new record with the one `lag` records earlier."""
        scanner = self.scanners[0]
        # Prime the buffer; bail out gracefully while the source is still
        # shorter than the requested lag (the original let next() raise
        # an uncaught StopIteration here).
        while len(self._lagRecords) < self._lag:
            try:
                self._lagRecords.append(next(scanner))
            except StopIteration:
                return
        for record in scanner:
            prev = self._lagRecords.pop(0)
            # cast to the record early so the tuples are not misunderstood
            self._resultset.append(
                tuple((last, this)
                      for last, this in zip(prev._tuple, record._tuple)))
            self._lagRecords.append(record)
Ejemplo n.º 12
0
    def __init__(self, source, key_field, *args, **kwargs):
        """Cluster `source` records on the value of `key_field`."""
        # Initialize mixins
        super(Cluster, self).__init__(*args, **kwargs)

        self._key_field = key_field
        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(source), )
Ejemplo n.º 13
0
    def test_basic(self):
        """ReplayingGroupScanner replays whole groups until anchored."""

        srs = RecordSet(simpleRecordSet)

        scanner = ReplayingGroupScanner(srs)

        # Replaying scanners are like generators...
        self.assertEqual([[r._tuple for r in g] for g in scanner],
                         [[(1, 0), (2, 1), (3, 0),
                           (4, 1)], [(5, 0), (6, 1)], [(7, 0), (8, 1),
                                                       (9, 0)]])

        # ... but will NOT exhaust when fully consumed
        self.assertEqual([[r._tuple for r in g] for g in scanner],
                         [[(1, 0), (2, 1), (3, 0),
                           (4, 1)], [(5, 0), (6, 1)], [(7, 0), (8, 1),
                                                       (9, 0)]])

        # Anchor after next(iter)
        self.assertEqual([r._tuple for r in next(scanner)], [(1, 0), (2, 1),
                                                             (3, 0), (4, 1)])
        scanner.anchor()

        self.assertEqual([[r._tuple for r in g] for g in scanner],
                         [[(5, 0), (6, 1)], [(7, 0), (8, 1), (9, 0)]])

        scanner.reset()

        # partial iteration with an anchor
        # NOTE(review): `i < 2` is true on the very first group, so this
        # anchors past exactly one group -- presumably intentional
        for i, r in enumerate(scanner):
            if i < 2:
                scanner.anchor()
                break

        self.assertEqual([[r._tuple for r in g] for g in scanner],
                         [[(5, 0), (6, 1)], [(7, 0), (8, 1), (9, 0)]])

        # Anchoring after iteration stops emitting
        scanner.anchor()
        self.assertEqual([v for v in scanner], [])

        # But if the source adds more data...
        srs.extend(simpleAddition)

        # ... means the scanner consumes the new data
        self.assertEqual([[r._tuple for r in g] for g in scanner],
                         [[(11, 1), (12, 0),
                           (13, 1)], [(14, 0), (15, 1), (16, 0)]])

        # resetting the scanner means it will replay the whole dataset
        scanner.reset()

        self.assertEqual([[r._tuple for r in g] for g in scanner],
                         [[(1, 0), (2, 1), (3, 0),
                           (4, 1)], [(5, 0), (6, 1)], [(7, 0), (8, 1), (9, 0)],
                          [(11, 1), (12, 0),
                           (13, 1)], [(14, 0), (15, 1), (16, 0)]])
Ejemplo n.º 14
0
 def __init__(self, source, lag=1, *args, **kwargs):
     """Record `lag` and wire up the source scanner and result set."""
     #Initialize mixins
     super(LagBucket, self).__init__(*args, **kwargs)

     self._lag = lag
     self.sources = (source,)
     self._resultset = RecordSet(recordType=source._RecordType)
     self.scanners = (self.ScanClass(source),)
     self._lagRecords = []
Ejemplo n.º 15
0
class Merge(Transform):
    """Combine the source recordsets into one recordset.
    The new record type will have all the source columns,
      with the caveat that later sources win for overlaps.
    """
    ScanClass = ElementScanner

    def __init__(self, sources, *args, **kwargs):
        # Initialize mixins
        super(Merge, self).__init__(*args, **kwargs)
        self.sources = tuple(sources)
        self._resolveSources()

    def _resolveSources(self):
        """Sources may overlap: if so, only take the latter."""
        resolved = [
            src.results if isinstance(src, Composable) else src
            for src in self.sources
        ]

        # Every field in first-seen order (duplicates included)
        fieldOrder = []
        for src in resolved:
            fieldOrder.extend(src._RecordType._fields)

        # Walk the sources back-to-front so later sources claim
        # overlapping fields first
        pending = set(fieldOrder)
        fieldScanners = []
        for src in reversed(resolved):
            for fieldName in src._RecordType._fields:
                if fieldName in pending:
                    pending.discard(fieldName)
                    fieldScanners.append(
                        (fieldName, self.ScanClass(src, fieldName)))
                if not pending:
                    break
            if not pending:
                break

        # While we want to prioritize later sources, the fields should
        #   likely keep the same order, starting with the earlier sources.
        # see https://stackoverflow.com/a/12814719/1943640
        fieldScanners.sort(key=lambda pair: fieldOrder.index(pair[0]))

        self.scanners = tuple(scanner for _, scanner in fieldScanners)
        self._resultset = RecordSet(recordType=genRecordType(
            fieldName for fieldName, _ in fieldScanners))

    def transform(self):
        """Simply scan down the sources, generating new records."""
        self._resultset.append(
            tuple(
                self._resultset.coerceRecordType(values)
                for values in zip(*self.scanners)))
Ejemplo n.º 16
0
    def _resolveSources(self):
        """Build one scanner per useful source and the result record type.

        Each source covers its fields minus the key/collation bookkeeping
        columns; with preset target fields, only those count as coverage.
        """
        rawSources = [
            source.results if isinstance(source, Composable) else source
            for source in self.sources
        ]

        scanner_coverage = {}
        scanners = []
        preset_targets = set(self._target_fields or [])

        # Gather the fields each source can supply
        for source in rawSources:
            covered_fields = set()
            for field in source._RecordType._fields:
                # key/collation columns are bookkeeping, never data targets
                if field in (
                        self._key_field,
                        self._collation_field,
                ):
                    continue

                if not preset_targets or field in preset_targets:
                    covered_fields.add(field)

            # skip nops
            if not covered_fields:
                continue

            scanner = RecordScanner(source)
            scanner_coverage[scanner] = covered_fields
            scanners.append(scanner)

        all_covered_fields = set()
        for covered_fields in scanner_coverage.values():
            all_covered_fields.update(covered_fields)

        if preset_targets:
            # BUGFIX: report the overall coverage, not the last source's
            # leftover `covered_fields` loop variable
            assert preset_targets == all_covered_fields, (
                'Sources do not cover the target fields: given: %r -- covered: %r'
                % (preset_targets, all_covered_fields))
            target_fields = tuple(self._target_fields)
        else:
            target_fields = tuple(all_covered_fields)

        self._target_fields = target_fields

        self._scanner_coverage = scanner_coverage

        self.scanners = tuple(scanners)
        # BUGFIX: `(x, ) or tuple()` was always truthy (a 1-tuple is never
        # falsy), so a None collation field leaked into the record type.
        collation_part = ((self._collation_field, )
                          if self._collation_field else tuple())
        self._resultset = RecordSet(
            recordType=((self._key_field, ) + self._target_fields +
                        collation_part))
Ejemplo n.º 17
0
	def test_basic(self):
		"""Regroup gives the target's columns the source's grouping."""

		rsa = [(1,2,3,4),(5,6),(7,8,9)]
		rsb = [(0,1,0,1),(0,1),(0,1,0)]
		rsc = [(1,2,3),(4,),(5,6,7,8,9)]
		rsd = [(0,1,0),(1,),(0,1,0,1,0)]

		# source
		rss = RecordSet(recordType='ab')
		for g in zip(rsa,rsb):
		    rss.append(v for v in zip(*g))

		# target
		rst = RecordSet(recordType='ef')
		for g in zip(rsc,rsd):
		    rst.append(v for v in zip(*g))

		regroup = Regroup(rss, rst)

		# verify it has the same columns as the target
		self.assertEqual(
			regroup.results._RecordType._fields,
			('e', 'f')
			)

		self.assertEqual(
			[[record._tuple for record in group] for group in regroup],
			[[(1, 0), (2, 1), (3, 0), (4, 1)], 
			 [(5, 0), (6, 1)], 
			 [(7, 0), (8, 1), (9, 0)]] 
			)
Ejemplo n.º 18
0
    def test_basic(self):
        """Merge combines columns across sources; later sources win 'b'."""

        rsa = [(1, 2, 3, 4), (5, 6), (7, 8, 9)]
        rsb = [(0, 1, 0, 1), (0, 1), (0, 1, 0)]
        rsc = [(9, 8, 7, 6, 5), (4, 3, 2), (1, )]
        rsd = [(1, 0, 1, 0, 1), (0, 1, 0), (1, )]

        rs1 = RecordSet(recordType='ab')
        for g in zip(rsa, rsb):
            rs1.append(v for v in zip(*g))

        rs2 = RecordSet(recordType='cb')
        for g in zip(rsc, rsd):
            rs2.append(v for v in zip(*g))

        merge = Merge([rs1, rs2])

        # fields keep first-seen order; overlapping 'b' comes from rs2
        self.assertEqual(merge.results._RecordType._fields, ('a', 'b', 'c'))

        self.assertEqual(merge.results._groups[0][0]._tuple, (1, 1, 9))

        self.assertEqual([[record._tuple for record in group]
                          for group in merge],
                         [[(1, 1, 9), (2, 0, 8), (3, 1, 7), (4, 0, 6),
                           (5, 1, 5), (6, 0, 4), (7, 1, 3), (8, 0, 2),
                           (9, 1, 1)]])
Ejemplo n.º 19
0
class Cluster(Transform):
    """Group records by a key_field value. Useful for bunching data together for aggregation.
    """

    __slots__ = ('_key_field', )

    ScanClass = RecordScanner

    def __init__(self, source, key_field, *args, **kwargs):
        # Initialize mixins
        super(Cluster, self).__init__(*args, **kwargs)

        self._key_field = key_field  # column whose value defines a cluster
        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(source), )

    def transform(self):
        """Consume new records, extending the last group on a key match."""
        last_key_value = None

        groups = []
        group = []

        if self._resultset._groups:
            last_key_value = self._resultset._groups[-1][-1][self._key_field]

            # loop one: fill for continuity
            for entry in self.scanners[0]:
                if entry[self._key_field] == last_key_value:
                    self._resultset._groups[-1] += (entry, )
                else:
                    group = [entry]
                    last_key_value = entry[self._key_field]
                    break

        for entry in self.scanners[0]:
            if entry[self._key_field] == last_key_value:
                group.append(entry)
            else:
                # BUGFIX: guard against appending the initial empty list
                # (the original pushed an empty leading group on first run)
                if group:
                    groups.append(group)
                group = [entry]
                last_key_value = entry[self._key_field]

        # BUGFIX: the original for/else appended `group` unconditionally,
        # accumulating an empty trailing group whenever no data was pending
        if group:
            groups.append(group)

        self._resultset.extend(groups)
Ejemplo n.º 20
0
    def test_basic(self):
        """ReplayingRecordScanner replays records until anchored past them."""

        srs = RecordSet(simpleRecordSet)

        scanner = ReplayingRecordScanner(srs)

        # Replaying scanners are like generators...
        self.assertEqual([r._tuple for r in scanner], [(1, 0), (2, 1), (3, 0),
                                                       (4, 1), (5, 0), (6, 1),
                                                       (7, 0), (8, 1), (9, 0)])

        # ... but will NOT exhaust when fully consumed
        self.assertEqual([r._tuple for r in scanner], [(1, 0), (2, 1), (3, 0),
                                                       (4, 1), (5, 0), (6, 1),
                                                       (7, 0), (8, 1), (9, 0)])

        # partial iteration...
        for i, r in enumerate(scanner):
            if i >= 3:
                scanner.anchor()  # anchor in the iteration
                break
        # ... and an anchor...
        scanner.anchor()
        # ... resumes iteration from the anchor
        self.assertEqual([r._tuple for r in scanner], [(5, 0), (6, 1), (7, 0),
                                                       (8, 1), (9, 0)])

        # Anchoring after iteration stops emitting
        scanner.anchor()
        self.assertEqual([r._tuple for r in scanner], [])

        # But if the source adds more data...
        srs.extend(simpleAddition)

        # ... means the scanner consumes the new data
        self.assertEqual([r._tuple for r in scanner],
                         [(11, 1), (12, 0), (13, 1), (14, 0), (15, 1),
                          (16, 0)])

        # resetting the scanner means it will replay the whole dataset
        scanner.reset()

        self.assertEqual([r._tuple for r in scanner],
                         [(1, 0), (2, 1), (3, 0), (4, 1), (5, 0), (6, 1),
                          (7, 0), (8, 1), (9, 0), (11, 1), (12, 0), (13, 1),
                          (14, 0), (15, 1), (16, 0)])
Ejemplo n.º 21
0
    def test_basic(self):
        """ReplayingChunkScanner yields per-group value tuples, replayable."""

        srs = RecordSet(simpleRecordSet)

        scanner = ReplayingChunkScanner(srs, 'a')

        # Replaying scanners are like generators...
        self.assertEqual([v for v in scanner], [(1, 2, 3, 4), (5, 6),
                                                (7, 8, 9)])

        # ... but will NOT exhaust when fully consumed
        self.assertEqual([v for v in scanner], [(1, 2, 3, 4), (5, 6),
                                                (7, 8, 9)])

        # Anchor after next(iter)
        self.assertEqual(next(scanner), (1, 2, 3, 4))
        scanner.anchor()

        self.assertEqual([v for v in scanner], [(5, 6), (7, 8, 9)])

        scanner.reset()

        # partial iteration with an anchor
        for i, v in enumerate(scanner):
            if i < 2:
                scanner.anchor()
                break

        self.assertEqual([v for v in scanner], [(5, 6), (7, 8, 9)])

        # Anchoring after iteration stops emitting
        scanner.anchor()
        self.assertEqual([v for v in scanner], [])

        # But if the source adds more data...
        srs.extend(simpleAddition)

        # ... means the scanner consumes the new data
        self.assertEqual([v for v in scanner], [(11, 12, 13), (14, 15, 16)])

        # resetting the scanner means it will replay the whole dataset
        scanner.reset()

        self.assertEqual([v for v in scanner], [(1, 2, 3, 4), (5, 6),
                                                (7, 8, 9), (11, 12, 13),
                                                (14, 15, 16)])
Ejemplo n.º 22
0
	def test_misalignment_2(self):
		"""A longer target only regroups as far as the source has groups."""

		# source
		rsa = [(1,2,3,4)]
		rsb = [(0,1,0,1)]
		rss = RecordSet(recordType='ab')
		for g in zip(rsa,rsb):
		    rss.append(v for v in zip(*g))

		# target - longer
		rsa = [(1,2,3),(4,),(5,6,7),(8,9,10)]
		rsb = [(0,1,0),(1,),(0,1,0),(1,0,10)]
		rst = RecordSet(recordType='ef')
		for g in zip(rsa,rsb):
		    rst.append(v for v in zip(*g))

		regroup = Regroup(rss, rst)


		# verify it has the same columns as the target
		self.assertEqual(
			regroup.results._RecordType._fields,
			('e', 'f')
			)

		# Source only has one group, so that alone gets mapped
		self.assertEqual(
			[[record._tuple for record in group] for group in regroup],
			[[(1, 0), (2, 1), (3, 0), (4, 1)]]
			)

		# adding two more groups...
		rss.extend( [
			 ((5,0),(6,1)),
             ((7,0),(8,1),(9,0))
            ] )

		# ... allows two more groups to be added.
		# Again, note that the last target record is omitted, though, 
		#   since the source doesn't have a group to map to it
		self.assertEqual(
			[[record._tuple for record in group] for group in regroup],
			[[(1, 0), (2, 1), (3, 0), (4, 1)], 
			 [(5, 0), (6, 1)], 
			 [(7, 0), (8, 1), (9, 0)]]
			)		
Ejemplo n.º 23
0
	def test_misalignment_1(self):
		"""A shorter target leaves the incomplete final group unmapped."""

		# source
		rsa = [(1,2,3,4),(5,6),(7,8,9)]
		rsb = [(0,1,0,1),(0,1),(0,1,0)]
		rss = RecordSet(recordType='ab')
		for g in zip(rsa,rsb):
		    rss.append(v for v in zip(*g))

		# target - shorter
		rsa = [(1,2,3),(4,),(5,6,7)]
		rsb = [(0,1,0),(1,),(0,1,0)]
		rst = RecordSet(recordType='ef')
		for g in zip(rsa,rsb):
		    rst.append(v for v in zip(*g))

		regroup = Regroup(rss, rst)


		# verify it has the same columns as the target
		self.assertEqual(
			regroup.results._RecordType._fields,
			('e', 'f')
			)

		# Up to 7 records can be grouped. The final source group
		#   must be incomplete, and is omitted
		self.assertEqual(
			[[record._tuple for record in group] for group in regroup],
			[[(1, 0), (2, 1), (3, 0), (4, 1)], 
			 [(5, 0), (6, 1)]]
			)

		# adding three more means the target has at least enough to complete
		rst.append( [(8,1),(9,0)] )

		# note that the last is omitted, though, 
		#   since the source doesn't have a group to map to it
		self.assertEqual(
			[[record._tuple for record in group] for group in regroup],
			[[(1, 0), (2, 1), (3, 0), (4, 1)], 
			 [(5, 0), (6, 1)], 
			 [(7, 0), (8, 1), (9, 0)]]
			)		
Ejemplo n.º 24
0
    def __init__(self,
                 sources,
                 renamed_fields,
                 key_fields=tuple(),
                 *args,
                 **kwargs):
        """Result columns are `renamed_fields` plus appended `key_fields`."""
        super(Feed, self).__init__(*args, **kwargs)

        self._resultset = RecordSet(recordType=tuple(renamed_fields) +
                                    tuple(key_fields))

        # sources/scanners start empty; add_source() populates them
        self.sources = tuple()
        self.scanners = tuple()
        self._key_fields = key_fields
        self._source_keys = tuple()

        if sources:
            for source in sources:
                self.add_source(source)
Ejemplo n.º 25
0
class Pivot(Transform):
    """Rotate groups of records into a record of lists.
    [({a:4,b:3},{a:6,b:5},{a:8,b:7}),({a:10,b:9},{a:12,b:11})]
    becomes
    [({a:(4,6,8),b:(3,5,7)}),({a:(10,12),b:(9,11)})]
    """
    ScanClass = GroupScanner

    def __init__(self, source, *args, **kwargs):
        # Initialize mixins
        super(Pivot, self).__init__(*args, **kwargs)

        self.sources = (source, )
        self._resultset = RecordSet(recordType=source._RecordType)
        self.scanners = (self.ScanClass(self.sources[0]), )

    def transform(self):
        (scanner, ) = self.scanners
        for group in scanner:
            # zip(*group) turns N records of M columns into M column tuples;
            # coerce immediately so the tuples are not misunderstood
            pivoted = self._resultset.coerceRecordType(tuple(zip(*group)))
            self._resultset.append(pivoted)
Ejemplo n.º 26
0
    def test_basic(self):
        """Pivot turns each group of records into one record of tuples."""

        srs = RecordSet(simpleRecordSet)

        pivot = Pivot(srs)

        # Pivot converts a group or records into one record per group
        self.assertEqual([[record._tuple for record in group]
                          for group in pivot],
                         [[((1, 2, 3, 4),
                            (0, 1, 0, 1))], [((5, 6),
                                              (0, 1))], [((7, 8, 9),
                                                          (0, 1, 0))]])

        srs.extend(simpleAddition)

        # adding data means the transform consumes the new data when checked
        self.assertEqual(
            [[record._tuple for record in group] for group in pivot],
            [[((1, 2, 3, 4),
               (0, 1, 0, 1))], [((5, 6), (0, 1))], [((7, 8, 9), (0, 1, 0))],
             [((11, 12, 13), (1, 0, 1))], [((14, 15, 16), (0, 1, 0))]])
Ejemplo n.º 27
0
    def test_basic(self):
        """ElementScanner yields single column values and exhausts."""

        srs = RecordSet(simpleRecordSet)

        scanner = ElementScanner(srs, 'a')

        # Scanners are like generators...
        self.assertEqual([v for v in scanner], [1, 2, 3, 4, 5, 6, 7, 8, 9])

        # ... and will exhaust when fully consumed
        self.assertEqual([v for v in scanner], [])

        srs.extend(simpleAddition)

        # adding data means the scanner consumes the new data
        self.assertEqual([v for v in scanner], [11, 12, 13, 14, 15, 16])

        # resetting the scanner means it will replay the whole dataset
        scanner.reset()

        self.assertEqual([v for v in scanner],
                         [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16])
Ejemplo n.º 28
0
    def __init__(self,
                 sources,
                 function,
                 outputLabels,
                 mapInputs=None,
                 *args,
                 **kwargs):
        """Wire `sources` into `function`, producing `outputLabels` columns.

        `function` may be a callable or an expression string (compiled via
        Expression). `mapInputs` optionally remaps source fields to
        function parameters.
        """
        # Initialize mixins
        super(Calculation, self).__init__(*args, **kwargs)

        self._resultset = RecordSet(recordType=genRecordType(outputLabels))
        self.subscribe(self._resultset)
        self.sources = tuple(sources)

        # Compile string expressions; callables pass straight through
        if isinstance(function, (str, unicode)):
            self.function = Expression(function)
        else:
            self.function = function

        # BUGFIX: `mapInputs={}` was a shared mutable default -- every
        # instance that mutated its mapping affected all others. Default
        # to None and build a fresh dict per instance instead.
        self._mapInputs = mapInputs if mapInputs is not None else {}
        self._resolveSources()
Ejemplo n.º 29
0
from ligature.recordset import RecordSet


def genData(columns, rows, start=0):
    """Generate *rows* sequential records of *columns* values each.

    *columns* may be an int or a sized collection (its length is used).
    Values count upward from *start*; with a single column the generator
    yields bare ints, otherwise it yields tuples of consecutive ints.
    """
    width = columns if isinstance(columns, int) else len(columns)
    stop = start + rows * width
    if width == 1:
        # Single-column data is yielded as scalars, not 1-tuples.
        return iter(range(start, stop))
    return (tuple(range(i, i + width)) for i in range(start, stop, width))


# Column data for the 'a' and 'b' fields, grouped into record groups.
a1 = [(1, 2, 3, 4), (5, 6), (7, 8, 9)]
b1 = [(0, 1, 0, 1), (0, 1), (0, 1, 0)]

a2 = [(11, 12, 13), (14, 15, 16)]
b2 = [(1, 0, 1), (0, 1, 0)]


def _make_recordset(col_a, col_b):
    """Zip two parallel column-group lists into an 'ab' RecordSet."""
    recordset = RecordSet(recordType='ab')
    for pair in zip(col_a, col_b):
        recordset.append(v for v in zip(*pair))
    return recordset


simpleRecordSet = _make_recordset(a1, b1)
simpleAddition = _make_recordset(a2, b2)
Ejemplo n.º 30
0
class Collation(Transform):
    """Combine the source recordsets into one recordset.

    The new record type will have all the target fields along with the
    key field (and, optionally, a collation/coalescing field).

    Each source contributes the target fields it covers; records are
    merged across sources in key order by ``transform``.
    """

    # _key_field:        field whose values order the merge
    # _collation_field:  optional field used to coalesce adjacent records
    # _target_fields:    output fields gathered from the sources
    # _scanner_coverage: maps each scanner to the fields it supplies
    __slots__ = (
        '_key_field',
        '_collation_field',
        '_target_fields',
        '_scanner_coverage',
    )

    # Scanner type used to iterate each source's records.
    ScanClass = RecordScanner

    def __init__(self,
                 sources,
                 key_field,
                 target_fields=None,
                 collation_field=None,
                 *args,
                 **kwargs):
        """Record the merge configuration and resolve the sources.

        Args:
            sources: iterable of recordsets (or Composables) to merge.
            key_field: field whose values order the merged output.
            target_fields: optional explicit output fields; when None they
                are discovered from the sources.
            collation_field: optional field used to coalesce records.
        """
        super(Collation, self).__init__(*args, **kwargs)

        self._key_field = key_field
        self._collation_field = collation_field
        self._target_fields = target_fields
        self.sources = tuple(sources)

        self._resolveSources()

    def _resolveSources(self):
        """Build one scanner per contributing source and derive the output
        record type.

        Unwraps Composable sources to their result recordsets, determines
        which target fields each source covers (excluding the key and
        collation fields), and constructs the result RecordSet whose
        record type is the key field, the target fields, and — only when
        configured — the collation field.

        Raises:
            AssertionError: if explicit target fields are given and the
                sources do not cover exactly that set.
        """
        rawSources = [
            source.results if isinstance(source, Composable) else source
            for source in self.sources
        ]

        scanner_coverage = {}
        scanners = []
        preset_targets = set(self._target_fields or [])

        # Gather the fields each source contributes.
        for source in rawSources:
            covered_fields = set()
            for field in source._RecordType._fields:
                # Key/collation fields are handled separately, not covered.
                if field in (
                        self._key_field,
                        self._collation_field,
                ):
                    continue

                if not preset_targets or field in preset_targets:
                    covered_fields.add(field)

            # Sources contributing nothing get no scanner (skip nops).
            if not covered_fields:
                continue

            scanner = RecordScanner(source)
            scanner_coverage[scanner] = covered_fields
            scanners.append(scanner)

        all_covered_fields = set()
        for fields in scanner_coverage.values():
            all_covered_fields.update(fields)

        if preset_targets:
            # BUGFIX: the failure message previously interpolated the
            # loop-leftover `covered_fields` (last source only); report the
            # union of everything covered instead.
            assert preset_targets == all_covered_fields, 'Sources do not cover the target fields: given: %r -- covered: %r' % (
                preset_targets, all_covered_fields)
            target_fields = tuple(self._target_fields)
        else:
            target_fields = tuple(field for field in all_covered_fields)

        self._target_fields = target_fields

        self._scanner_coverage = scanner_coverage

        self.scanners = tuple(scanners)

        # BUGFIX: `((x,) or tuple())` is always truthy — a 1-tuple is never
        # falsy — so a None collation field leaked into the record type.
        # Include the collation field only when one is configured.
        collation_part = ((self._collation_field, )
                          if self._collation_field else tuple())
        self._resultset = RecordSet(
            recordType=((self._key_field, ) + self._target_fields +
                        collation_part))

    def transform(self):
        """Merge records from all scanners in key order into the resultset.

        Performs a k-way heap merge keyed on ``self._key_field``: each
        scanner contributes its next record, the smallest key wins, and the
        winning record's covered fields update a rolling ``cursor_values``
        dict from which each output record is built.  When a collation
        field is configured, consecutive records sharing the same collation
        value replace the previous output record instead of appending.
        """

        scanners = set(self.scanners)

        # Pull the next record from a scanner; drop exhausted scanners from
        # the shared `remaining` set and signal exhaustion with None.
        def get_next(scanner, remaining=scanners):
            try:
                entry = next(scanner)
                return entry
            except StopIteration:
                remaining.remove(scanner)
                return None

        # initial conditions: resume from the last emitted record if any,
        # otherwise seed every field with None so each output record has
        # _some_ value even for fields its source hasn't supplied yet.
        if self._resultset:
            cursor_values = dict(
                (field, value)
                for field, value in zip(self._resultset._RecordType._fields,
                                        self._resultset._groups[-1][-1]))
        else:
            cursor_values = dict(
                (field, None) for field in self._resultset._RecordType._fields
            )  # initialize to None to ensure _some_ value for all non-key/group fields

        # Prime the heap with one (key, record, scanner) entry per scanner.
        cursor_value_heap = []
        for scanner in frozenset(scanners):
            entry = get_next(scanner)
            if entry is not None:
                cursor_value_heap.append(
                    (entry[self._key_field], entry,
                     scanner))  # include scanner for replacement lookup later
        heapify(cursor_value_heap)

        # generate results
        merged = []

        while cursor_value_heap:

            key_value, entry, scanner = heappop(cursor_value_heap)

            # Overlay only the fields this scanner's source covers.
            for field in self._scanner_coverage[scanner]:
                cursor_values[field] = entry[field]

            cursor_values[self._key_field] = entry[self._key_field]

            if self._collation_field:
                group_value = entry[self._collation_field]

                # when grouping for merge, assume group final value is most recent by key sort value
                # NOTE(review): if the first popped record matches the
                # resumed cursor's collation value, `merged` is empty and
                # `merged[-1]` raises IndexError — confirm intended.
                if cursor_values[
                        self.
                        _collation_field] == group_value and group_value is not None:
                    merged[-1] = self._resultset._RecordType(cursor_values)
                else:
                    cursor_values[self._collation_field] = group_value
                    merged.append(self._resultset._RecordType(cursor_values))
            else:
                merged.append(self._resultset._RecordType(cursor_values))

            # Refill the heap from the scanner we just consumed.
            entry = get_next(scanner)
            if entry is not None:
                heappush(cursor_value_heap,
                         (entry[self._key_field], entry, scanner))

        # Emit all merged records as a single group.
        if merged:
            self._resultset.extend([[v for v in merged]])