def itermultiaggregate(source, key, aggregation):
    """Yield an output header then one aggregated row per group of rows
    sharing the same *key* value."""
    # work on a copy so the caller's mapping is not modified in place
    aggregation = OrderedDict(aggregation.items())
    it = iter(source)
    hdr = next(it)
    # push the header back so the source is iterated only once
    it = itertools.chain([hdr], it)

    # normalise each aggregator spec to a (source field, function) pair
    for outfld in aggregation:
        spec = aggregation[outfld]
        if callable(spec):
            aggregation[outfld] = None, spec
        elif isinstance(spec, string_types):
            aggregation[outfld] = spec, list  # list is default
        elif len(spec) == 1 and isinstance(spec[0], string_types):
            aggregation[outfld] = spec[0], list  # list is default
        elif len(spec) == 1 and callable(spec[0]):
            aggregation[outfld] = None, spec[0]  # aggregate whole rows
        elif len(spec) == 2:
            pass  # already normalised
        else:
            raise ArgumentError('invalid aggregation: %r, %r'
                                % (outfld, spec))

    # determine output header
    if isinstance(key, (list, tuple)):
        outhdr = list(key)
    elif callable(key):
        outhdr = ['key']
    else:
        outhdr = [key]
    outhdr.extend(aggregation)
    yield tuple(outhdr)

    # generate data
    for k, group in rowgroupby(it, key):
        group = list(group)  # may need to iterate over these more than once
        # handle compound key
        outrow = list(k) if isinstance(key, (list, tuple)) else [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                # aggregate whole rows
                outrow.append(aggfun(group))
            elif isinstance(srcfld, (list, tuple)):
                # aggregate tuples of values drawn from several fields
                idxs = [hdr.index(f) for f in srcfld]
                valgetter = operator.itemgetter(*idxs)
                outrow.append(aggfun(valgetter(row) for row in group))
            else:
                # aggregate values from a single field, lazily extracted
                idx = hdr.index(srcfld)
                outrow.append(aggfun(row[idx] for row in group))
        yield tuple(outrow)
def __init__(self, source, key, aggregation=None, presorted=False,
             buffersize=None, tempdir=None, cache=True):
    """Capture the source table (sorting it by *key* unless the caller
    declares it already sorted) and normalise *aggregation* to an
    OrderedDict of output field -> aggregator spec."""
    if presorted:
        self.source = source
    else:
        self.source = sort(source, key, buffersize=buffersize,
                           tempdir=tempdir, cache=cache)
    self.key = key
    if aggregation is None:
        self.aggregation = OrderedDict()
    elif isinstance(aggregation, (list, tuple)):
        # sequence of tuples: first item is the output field name, the
        # remainder is the aggregator spec
        self.aggregation = OrderedDict((t[0], t[1:]) for t in aggregation)
    elif isinstance(aggregation, dict):
        self.aggregation = aggregation
    else:
        raise ArgumentError(
            'expected aggregation is None, list, tuple or dict, found %r'
            % aggregation)
def __init__(self, numrows=100, fields=None, wait=0, seed=None):
    """Configure a dummy table of *numrows* rows with the given field ->
    value-generator mapping."""
    self.numrows = numrows
    self.wait = wait
    # copy the mapping, preserving field order
    self.fields = OrderedDict() if fields is None else OrderedDict(fields)
    # default to a time-based seed so each instance gets fresh data
    self.seed = datetime.datetime.now() if seed is None else seed
def test_fromdicts_ordered():
    from petl.compat import OrderedDict
    # N.B., fields should come out in order of first appearance
    data = [
        OrderedDict([('foo', 'a'), ('bar', 1)]),
        OrderedDict([('foo', 'b')]),
        OrderedDict([('foo', 'c'), ('bar', 2), ('baz', True)]),
    ]
    expect = (('foo', 'bar', 'baz'),
              ('a', 1, None),
              ('b', None, None),
              ('c', 2, True))
    ieq(expect, fromdicts(data))
def test_rangeaggregate_multifield():
    # range aggregation with several output fields, specified three ways
    table1 = (('foo', 'bar'),
              ('a', 3), ('a', 7),
              ('b', 2), ('b', 1), ('b', 9),
              ('c', 4), ('d', 3))
    expect = (('bar', 'foocount', 'foojoin', 'foolist'),
              ((1, 3), 2, 'bb', ['b', 'b']),
              ((3, 5), 3, 'adc', ['a', 'd', 'c']),
              ((5, 7), 0, '', []),
              ((7, 9), 1, 'a', ['a']),
              ((9, 11), 1, 'b', ['b']))

    # dict argument
    aggregators = OrderedDict([
        ('foocount', len),
        ('foojoin', ('foo', strjoin(''))),
        ('foolist', 'foo'),  # default aggregation is list
    ])
    ieq(expect, rangeaggregate(table1, 'bar', 2, aggregators))

    # suffix notation
    table3 = rangeaggregate(table1, 'bar', 2)
    table3['foocount'] = len
    table3['foojoin'] = 'foo', strjoin('')
    table3['foolist'] = 'foo'  # default aggregation is list
    ieq(expect, table3)

    # list argument
    aggregators = [('foocount', len),
                   ('foojoin', 'foo', strjoin('')),
                   ('foolist', 'foo', list)]
    ieq(expect, rangeaggregate(table1, 'bar', 2, aggregators))
def test_aggregate_more():
    # several aggregators over the same field, via dict and suffix notation
    table1 = (('foo', 'bar'),
              ('aa', 3), ('aa', 7),
              ('bb', 2), ('bb', 1), ('bb', 9),
              ('cc', 4), ('dd', 3))
    expect = (('foo', 'minbar', 'maxbar', 'sumbar', 'listbar', 'bars'),
              ('aa', 3, 7, 10, [3, 7], '3, 7'),
              ('bb', 1, 9, 12, [2, 1, 9], '2, 1, 9'),
              ('cc', 4, 4, 4, [4], '4'),
              ('dd', 3, 3, 3, [3], '3'))

    # dict argument
    aggregators = OrderedDict([
        ('minbar', ('bar', min)),
        ('maxbar', ('bar', max)),
        ('sumbar', ('bar', sum)),
        ('listbar', 'bar'),  # default aggregation is list
        ('bars', ('bar', strjoin(', '))),
    ])
    table2 = aggregate(table1, 'foo', aggregators)
    ieq(expect, table2)
    ieq(expect, table2)  # check can iterate twice

    # suffix notation
    table3 = aggregate(table1, 'foo')
    table3['minbar'] = 'bar', min
    table3['maxbar'] = 'bar', max
    table3['sumbar'] = 'bar', sum
    table3['listbar'] = 'bar'  # default aggregation is list
    table3['bars'] = 'bar', strjoin(', ')
    ieq(expect, table3)
def columns(table, missing=None):
    """
    Construct a :class:`dict` mapping field names to lists of values. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2], ['b', 3]]
        >>> cols = etl.columns(table)
        >>> cols['foo']
        ['a', 'b', 'b']
        >>> cols['bar']
        [1, 2, 3]

    See also :func:`petl.util.materialise.facetcolumns`.

    """
    it = iter(table)
    hdr = next(it)
    flds = [text_type(f) for f in hdr]
    # one list per field, in header order
    cols = OrderedDict((f, []) for f in flds)
    for row in it:
        # short rows are padded with *missing*; values beyond the header
        # are paired with a padded field name and therefore dropped
        for f, v in izip_longest(flds, row, fillvalue=missing):
            if f in cols:
                cols[f].append(v)
    return cols
def test_fieldmap_empty():
    # a table with a header but no data rows maps to just the output header
    mappings = OrderedDict([
        ('foo', 'foo'),
        ('baz', ('bar', lambda v: v * 2)),
    ])
    actual = fieldmap((('foo', 'bar'),), mappings)
    ieq((('foo', 'baz'),), actual)
def itermultirangeaggregate(source, key, width, aggregation, minv, maxv):
    """Yield an output header then one aggregated row per bin of *key*
    values of the given *width*, between *minv* and *maxv*."""
    aggregation = OrderedDict(aggregation.items())  # take a copy
    it = iter(source)
    # use the builtin next() rather than the Python-2-only it.next()
    srcflds = next(it)
    # push back header to ensure we iterate only once
    it = itertools.chain([srcflds], it)

    # normalise each aggregator spec to a (source field, function) pair
    # NOTE(review): basestring is Python-2-only; switch to
    # petl.compat.string_types when modernising this module
    for outfld in aggregation:
        agg = aggregation[outfld]
        if callable(agg):
            aggregation[outfld] = None, agg
        elif isinstance(agg, basestring):
            aggregation[outfld] = agg, list  # list is default
        elif len(agg) == 1 and isinstance(agg[0], basestring):
            aggregation[outfld] = agg[0], list  # list is default
        elif len(agg) == 1 and callable(agg[0]):
            aggregation[outfld] = None, agg[0]  # aggregate whole rows
        elif len(agg) == 2:
            pass  # no need to normalise
        else:
            raise Exception('invalid aggregation: %r, %r' % (outfld, agg))

    # determine output header
    outflds = [key]
    for outfld in aggregation:
        outflds.append(outfld)
    yield tuple(outflds)

    # generate data
    for k, rows in rowgroupbybin(it, key, width, minv=minv, maxv=maxv):
        outrow = [k]
        for outfld in aggregation:
            srcfld, aggfun = aggregation[outfld]
            if srcfld is None:
                # aggregate whole rows
                aggval = aggfun(rows)
                outrow.append(aggval)
            else:
                idx = srcflds.index(srcfld)
                # try using generator comprehension
                vals = (row[idx] for row in rows)
                aggval = aggfun(vals)
                outrow.append(aggval)
        yield tuple(outrow)
def test_aggregate_empty():
    # aggregating a table with no data rows yields only the output header
    aggregators = OrderedDict([
        ('minbar', ('bar', min)),
        ('maxbar', ('bar', max)),
        ('sumbar', ('bar', sum)),
    ])
    expect = (('foo', 'minbar', 'maxbar', 'sumbar'),)
    ieq(expect, aggregate((('foo', 'bar'),), 'foo', aggregators))
def __init__(self, source, mappings=None, failonerror=False,
             errorvalue=None):
    """Capture the source table and field mapping configuration."""
    self.source = source
    # default to an empty ordered mapping so fields can be added later
    self.mappings = OrderedDict() if mappings is None else mappings
    self.failonerror = failonerror
    self.errorvalue = errorvalue
def test_fieldmap():
    # exercises mappings given as a field name, a (field, dict) pair, a
    # (field, function) pair, a record function and an expression string
    table = (('id', 'sex', 'age', 'height', 'weight'),
             (1, 'male', 16, 1.45, 62.0),
             (2, 'female', 19, 1.34, 55.4),
             (3, 'female', 17, 1.78, 74.4),
             (4, 'male', 21, 1.33, 45.2),
             (5, '-', 25, 1.65, 51.9))
    expect = (('subject_id', 'gender', 'age_months', 'bmi'),
              (1, 'M', 16*12, 62.0/1.45**2),
              (2, 'F', 19*12, 55.4/1.34**2),
              (3, 'F', 17*12, 74.4/1.78**2),
              (4, 'M', 21*12, 45.2/1.33**2),
              (5, '-', 25*12, 51.9/1.65**2))

    mappings = OrderedDict()
    mappings['subject_id'] = 'id'
    mappings['gender'] = 'sex', {'male': 'M', 'female': 'F'}
    mappings['age_months'] = 'age', lambda v: v * 12
    mappings['bmi'] = lambda rec: rec['weight'] / rec['height']**2
    actual = fieldmap(table, mappings)
    ieq(expect, actual)
    ieq(expect, actual)  # can iterate twice?

    # do it with suffix notation
    actual = fieldmap(table)
    actual['subject_id'] = 'id'
    actual['gender'] = 'sex', {'male': 'M', 'female': 'F'}
    actual['age_months'] = 'age', lambda v: v * 12
    actual['bmi'] = '{weight} / {height}**2'
    ieq(expect, actual)

    # test short rows: the missing weight should give a bmi of None
    table2 = (('id', 'sex', 'age', 'height', 'weight'),
              (1, 'male', 16, 1.45, 62.0),
              (2, 'female', 19, 1.34, 55.4),
              (3, 'female', 17, 1.78, 74.4),
              (4, 'male', 21, 1.33, 45.2),
              (5, '-', 25, 1.65))
    expect = (('subject_id', 'gender', 'age_months', 'bmi'),
              (1, 'M', 16*12, 62.0/1.45**2),
              (2, 'F', 19*12, 55.4/1.34**2),
              (3, 'F', 17*12, 74.4/1.78**2),
              (4, 'M', 21*12, 45.2/1.33**2),
              (5, '-', 25*12, None))
    ieq(expect, fieldmap(table2, mappings))
def test_aggregate_multifield():
    # multiple aggregators, specified as dict, via suffix notation, and as
    # a list of tuples
    table1 = (('foo', 'bar'),
              ('a', 3), ('a', 7),
              ('b', 2), ('b', 1), ('b', 9),
              ('c', 4))
    expect = (('foo', 'count', 'minbar', 'maxbar', 'sumbar', 'listbar',
               'bars'),
              ('a', 2, 3, 7, 10, [3, 7], '3, 7'),
              ('b', 3, 1, 9, 12, [2, 1, 9], '2, 1, 9'),
              ('c', 1, 4, 4, 4, [4], '4'))

    # dict argument
    aggregators = OrderedDict([
        ('count', len),
        ('minbar', ('bar', min)),
        ('maxbar', ('bar', max)),
        ('sumbar', ('bar', sum)),
        ('listbar', ('bar', list)),
        ('bars', ('bar', strjoin(', '))),
    ])
    table2 = aggregate(table1, 'foo', aggregators)
    ieq(expect, table2)
    ieq(expect, table2)  # check can iterate twice

    # use suffix notation
    table3 = aggregate(table1, 'foo')
    table3['count'] = len
    table3['minbar'] = 'bar', min
    table3['maxbar'] = 'bar', max
    table3['sumbar'] = 'bar', sum
    table3['listbar'] = 'bar'  # default aggregation is list
    table3['bars'] = 'bar', strjoin(', ')
    ieq(expect, table3)

    # list argument
    aggregators = [('count', len),
                   ('minbar', 'bar', min),
                   ('maxbar', 'bar', max),
                   ('sumbar', 'bar', sum),
                   ('listbar', 'bar', list),
                   ('bars', 'bar', strjoin(', '))]
    table4 = aggregate(table1, 'foo', aggregators)
    ieq(expect, table4)
    ieq(expect, table4)  # check can iterate twice
def test_fieldmap_record_access():
    # like test_fieldmap, but the bmi mapping reads fields as record
    # attributes rather than via item access
    table = (('id', 'sex', 'age', 'height', 'weight'),
             (1, 'male', 16, 1.45, 62.0),
             (2, 'female', 19, 1.34, 55.4),
             (3, 'female', 17, 1.78, 74.4),
             (4, 'male', 21, 1.33, 45.2),
             (5, '-', 25, 1.65, 51.9))
    expect = (('subject_id', 'gender', 'age_months', 'bmi'),
              (1, 'M', 16 * 12, 62.0 / 1.45**2),
              (2, 'F', 19 * 12, 55.4 / 1.34**2),
              (3, 'F', 17 * 12, 74.4 / 1.78**2),
              (4, 'M', 21 * 12, 45.2 / 1.33**2),
              (5, '-', 25 * 12, 51.9 / 1.65**2))
    mappings = OrderedDict()
    mappings['subject_id'] = 'id'
    mappings['gender'] = 'sex', {'male': 'M', 'female': 'F'}
    mappings['age_months'] = 'age', lambda v: v * 12
    mappings['bmi'] = lambda rec: rec.weight / rec.height**2
    actual = fieldmap(table, mappings)
    ieq(expect, actual)
    ieq(expect, actual)  # can iterate twice?
class DummyTable(Table):
    """A table of generated data, driven by the field -> value-generator
    mapping in *fields*; the same data is produced on every iteration
    because the PRNG is re-seeded from *seed* each time."""

    def __init__(self, numrows=100, fields=None, wait=0, seed=None):
        self.numrows = numrows
        self.wait = wait
        self.fields = (OrderedDict() if fields is None
                       else OrderedDict(fields))
        # default to a time-based seed so each instance differs
        self.seed = datetime.datetime.now() if seed is None else seed

    def __setitem__(self, item, value):
        # allow value generators to be attached via subscript notation
        self.fields[str(item)] = value

    def __iter__(self):
        nr = self.numrows
        fields = self.fields.copy()
        # N.B., seed the PRNG so the same data comes out every time
        random.seed(self.seed)
        # header row
        yield tuple(str(f) for f in fields.keys())
        # data rows
        for _ in xrange(nr):
            if self.wait:
                # artificial delay
                time.sleep(self.wait)
            yield tuple(fields[f]() for f in fields)

    def reseed(self):
        # pick a fresh seed so subsequent iterations yield new data
        self.seed = datetime.datetime.now()
class DummyTable(Table):
    """A table of generated data, driven by the field -> value-generator
    mapping in *fields*; the same data is produced on every iteration
    because the PRNG is re-seeded from *seed* each time."""

    def __init__(self, numrows=100, fields=None, wait=0, seed=None):
        self.numrows = numrows
        self.wait = wait
        self.fields = (OrderedDict() if fields is None
                       else OrderedDict(fields))
        # default to a time-based seed so each instance differs
        self.seed = datetime.datetime.now() if seed is None else seed

    def __setitem__(self, item, value):
        # allow value generators to be attached via subscript notation
        self.fields[text_type(item)] = value

    def __iter__(self):
        nr = self.numrows
        fields = self.fields.copy()
        # N.B., seed the PRNG so the same data comes out every time
        random.seed(self.seed)
        # header row
        yield tuple(text_type(f) for f in fields.keys())
        # data rows
        for _ in xrange(nr):
            if self.wait:
                # artificial delay
                time.sleep(self.wait)
            yield tuple(fields[f]() for f in fields)

    def reseed(self):
        # pick a fresh seed so subsequent iterations yield new data
        self.seed = datetime.datetime.now()
def ordereddict(self):
    """Return an :class:`OrderedDict` built from this object's items."""
    result = OrderedDict(self)
    return result
def rangefacet(table, field, width, minv=None, maxv=None, presorted=False,
               buffersize=None, tempdir=None, cache=True):
    """
    Return a dictionary mapping ranges to tables. E.g.::

        >>> from petl import rangefacet, look
        >>> look(table1)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | 3     |
        +-------+-------+
        | 'a'   | 7     |
        +-------+-------+
        | 'b'   | 2     |
        +-------+-------+
        | 'b'   | 1     |
        +-------+-------+
        | 'b'   | 9     |
        +-------+-------+
        | 'c'   | 4     |
        +-------+-------+
        | 'd'   | 3     |
        +-------+-------+

        >>> rf = rangefacet(table1, 'bar', 2)
        >>> rf.keys()
        [(1, 3), (3, 5), (5, 7), (7, 9)]
        >>> look(rf[(1, 3)])
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'b'   | 2     |
        +-------+-------+
        | 'b'   | 1     |
        +-------+-------+

        >>> look(rf[(7, 9)])
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | 7     |
        +-------+-------+
        | 'b'   | 9     |
        +-------+-------+

    Note that the last bin includes both edges.

    """

    # determine minimum and maximum values
    if minv is None and maxv is None:
        minv, maxv = limits(table, field)
    elif minv is None:
        minv = min(itervalues(table, field))
    elif maxv is None:
        # BUG FIX: this branch previously tested ``max is None`` — the
        # builtin ``max`` is never None, so a caller-supplied minv without
        # maxv left maxv as None and the xrange below failed
        maxv = max(itervalues(table, field))

    fct = OrderedDict()
    for binminv in xrange(minv, maxv, width):
        binmaxv = binminv + width
        if binmaxv >= maxv:  # final bin
            binmaxv = maxv
            # final bin includes right edge
            fct[(binminv, binmaxv)] = selectrangeopen(table, field,
                                                      binminv, binmaxv)
        else:
            fct[(binminv, binmaxv)] = selectrangeopenleft(table, field,
                                                          binminv, binmaxv)
    return fct