def test_cut():
    """Exercise `cut` with name, index, mixed and tuple field selections."""
    # the source deliberately contains one over-long row and one short row
    source = (('foo', 'bar', 'baz'),
              ('A', 1, 2),
              ('B', '2', '3.4'),
              (u'B', u'3', u'7.8', True),
              ('D', 'xyz', 9.0),
              ('E', None))

    # selecting 'foo' and 'baz' gives the same result however they are named
    foo_and_baz = (('foo', 'baz'),
                   ('A', 2),
                   ('B', '3.4'),
                   (u'B', u'7.8'),
                   ('D', 9.0),
                   ('E', None))

    # a single field, by name
    only_foo = (('foo',), ('A',), ('B',), (u'B',), ('D',), ('E',))
    ieq(only_foo, cut(source, 'foo'))

    # two fields, by name
    ieq(foo_and_baz, cut(source, 'foo', 'baz'))

    # two fields, by position
    ieq(foo_and_baz, cut(source, 0, 2))

    # a name mixed with a position, in reversed field order
    bar_then_foo = (('bar', 'foo'),
                    (1, 'A'),
                    ('2', 'B'),
                    (u'3', u'B'),
                    ('xyz', 'D'),
                    (None, 'E'))
    ieq(bar_then_foo, cut(source, 'bar', 0))

    # field names supplied as a single tuple argument
    ieq(foo_and_baz, cut(source, ('foo', 'baz')))
def test_cut():
    """Run `cut` over several field selections and compare with expectations."""
    # input rows include one with an extra value and one with a missing value
    data = (('foo', 'bar', 'baz'),
            ('A', 1, 2),
            ('B', '2', '3.4'),
            (u'B', u'3', u'7.8', True),
            ('D', 'xyz', 9.0),
            ('E', None))

    expect_foo = (('foo',), ('A',), ('B',), (u'B',), ('D',), ('E',))
    expect_foo_baz = (('foo', 'baz'),
                      ('A', 2),
                      ('B', '3.4'),
                      (u'B', u'7.8'),
                      ('D', 9.0),
                      ('E', None))
    expect_bar_foo = (('bar', 'foo'),
                      (1, 'A'),
                      ('2', 'B'),
                      (u'3', u'B'),
                      ('xyz', 'D'),
                      (None, 'E'))

    # (positional arguments passed to cut after the table, expected result)
    scenarios = (
        (('foo',), expect_foo),                  # one field by name
        (('foo', 'baz'), expect_foo_baz),        # two fields by name
        ((0, 2), expect_foo_baz),                # two fields by index
        (('bar', 0), expect_bar_foo),            # name and index, reordered
        ((('foo', 'baz'),), expect_foo_baz),     # names given as one tuple
    )
    for selection, expected in scenarios:
        ieq(expected, cut(data, *selection))
def groupcountdistinctvalues(table, key, value):
    """Group by the `key` field then count the number of distinct values in
    the `value` field."""
    # restrict to the two fields of interest and drop repeated rows, so that
    # each (key, value) combination is seen once by the aggregation
    unique_pairs = distinct(cut(table, key, value))
    # aggregate per key using `len` as the aggregation function
    return aggregate(unique_pairs, key, len)
def recordcomplement(a, b, buffersize=None, tempdir=None, cache=True,
                     strict=False):
    """
    Find records in `a` that are not in `b`. E.g.::

        >>> import petl as etl
        >>> a = [['foo', 'bar', 'baz'],
        ...      ['A', 1, True],
        ...      ['C', 7, False],
        ...      ['B', 2, False],
        ...      ['C', 9, True]]
        >>> b = [['bar', 'foo', 'baz'],
        ...      [2, 'B', False],
        ...      [9, 'A', False],
        ...      [3, 'B', True],
        ...      [9, 'C', True]]
        >>> aminusb = etl.recordcomplement(a, b)
        >>> aminusb
        +-----+-----+-------+
        | foo | bar | baz   |
        +=====+=====+=======+
        | 'A' |   1 | True  |
        +-----+-----+-------+
        | 'C' |   7 | False |
        +-----+-----+-------+

        >>> bminusa = etl.recordcomplement(b, a)
        >>> bminusa
        +-----+-----+-------+
        | bar | foo | baz   |
        +=====+=====+=======+
        |   3 | 'B' | True  |
        +-----+-----+-------+
        |   9 | 'A' | False |
        +-----+-----+-------+

    Note that both tables must have the same set of fields, but that the order
    of the fields does not matter. See also the
    :func:`petl.transform.setops.complement` function.

    See also the discussion of the `buffersize`, `tempdir` and `cache`
    arguments under the :func:`petl.transform.sorts.sort` function.

    The `buffersize`, `tempdir`, `cache` and `strict` arguments are passed
    through unchanged to :func:`petl.transform.setops.complement`.

    Raises an :class:`AssertionError` if the two tables do not have the same
    set of fields.

    """

    # TODO possible with only one pass?

    ha = header(a)
    hb = header(b)
    # Raise explicitly rather than via an ``assert`` statement: assert
    # statements are removed when Python runs with -O, which would silently
    # disable this check. The exception type and message are unchanged.
    if set(ha) != set(hb):
        raise AssertionError('both tables must have the same set of fields')
    # make sure fields are in the same order
    bv = cut(b, *ha)
    return complement(a, bv, buffersize=buffersize, tempdir=tempdir,
                      cache=cache, strict=strict)
def test_cut_empty():
    """`cut` on a table with a header but no data rows keeps only the header
    of the selected field."""
    header_only = (('foo', 'bar'),)
    wanted = (('bar',),)
    ieq(wanted, cut(header_only, 'bar'))
def test_cut_empty():
    """Cutting a header-only table should yield a header-only result."""
    # input consists of the header row alone
    source = (('foo', 'bar'),)
    result = cut(source, 'bar')
    # only the selected field's header is expected back
    ieq((('bar',),), result)
def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False,
           buffersize=None, tempdir=None, cache=True):
    """
    Split a table into two tables by reversing an inner join. E.g., if the
    join key is present in the table::

        >>> from petl import look, unjoin
        >>> look(table1)
        +-------+-------+----------+
        | 'foo' | 'bar' | 'baz'    |
        +=======+=======+==========+
        | 'A'   | 1     | 'apple'  |
        +-------+-------+----------+
        | 'B'   | 1     | 'apple'  |
        +-------+-------+----------+
        | 'C'   | 2     | 'orange' |
        +-------+-------+----------+

        >>> table2, table3 = unjoin(table1, 'baz', key='bar')
        >>> look(table2)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'A'   | 1     |
        +-------+-------+
        | 'B'   | 1     |
        +-------+-------+
        | 'C'   | 2     |
        +-------+-------+

        >>> look(table3)
        +-------+----------+
        | 'bar' | 'baz'    |
        +=======+==========+
        | 1     | 'apple'  |
        +-------+----------+
        | 2     | 'orange' |
        +-------+----------+

    An integer join key can also be reconstructed, e.g.::

        >>> look(table4)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 'A'   | 'apple'  |
        +-------+----------+
        | 'B'   | 'apple'  |
        +-------+----------+
        | 'C'   | 'orange' |
        +-------+----------+

        >>> table5, table6 = unjoin(table4, 'bar')
        >>> look(table5)
        +-------+----------+
        | 'foo' | 'bar_id' |
        +=======+==========+
        | 'A'   | 1        |
        +-------+----------+
        | 'B'   | 1        |
        +-------+----------+
        | 'C'   | 2        |
        +-------+----------+

        >>> look(table6)
        +------+----------+
        | 'id' | 'bar'    |
        +======+==========+
        | 1    | 'apple'  |
        +------+----------+
        | 2    | 'orange' |
        +------+----------+

    .. versionadded:: 0.12

    """

    if key is not None:
        # the join key already exists in the table: the left table is the
        # distinct rows with the value field removed ...
        without_value = distinct(cutout(table, value))
        # ... and the right table is the distinct (key, value) rows
        key_and_value = distinct(cut(table, key, value))
        return without_value, key_and_value

    # no join key given, so one is built from the value field; both views
    # below are given the table sorted by that field unless the caller says
    # it is already sorted
    ordered = table if presorted else sort(table, value,
                                           buffersize=buffersize,
                                           tempdir=tempdir, cache=cache)

    # left: the (sorted) table with the value field handed to the
    # incrementing-counter view
    counter_side = ConvertToIncrementingCounterView(ordered, value,
                                                    autoincrement)
    # right: the distinct-value enumeration view over the same field
    lookup_side = EnumerateDistinctView(ordered, value, autoincrement)
    return counter_side, lookup_side
def recordcomplement(a, b, buffersize=None, tempdir=None, cache=True):
    """
    Find records in `a` that are not in `b`. E.g.::

        >>> from petl import recordcomplement, look
        >>> look(a)
        +-------+-------+-------+
        | 'foo' | 'bar' | 'baz' |
        +=======+=======+=======+
        | 'A'   | 1     | True  |
        +-------+-------+-------+
        | 'C'   | 7     | False |
        +-------+-------+-------+
        | 'B'   | 2     | False |
        +-------+-------+-------+
        | 'C'   | 9     | True  |
        +-------+-------+-------+

        >>> look(b)
        +-------+-------+-------+
        | 'bar' | 'foo' | 'baz' |
        +=======+=======+=======+
        | 2     | 'B'   | False |
        +-------+-------+-------+
        | 9     | 'A'   | False |
        +-------+-------+-------+
        | 3     | 'B'   | True  |
        +-------+-------+-------+
        | 9     | 'C'   | True  |
        +-------+-------+-------+

        >>> aminusb = recordcomplement(a, b)
        >>> look(aminusb)
        +-------+-------+-------+
        | 'foo' | 'bar' | 'baz' |
        +=======+=======+=======+
        | 'A'   | 1     | True  |
        +-------+-------+-------+
        | 'C'   | 7     | False |
        +-------+-------+-------+

        >>> bminusa = recordcomplement(b, a)
        >>> look(bminusa)
        +-------+-------+-------+
        | 'bar' | 'foo' | 'baz' |
        +=======+=======+=======+
        | 3     | 'B'   | True  |
        +-------+-------+-------+
        | 9     | 'A'   | False |
        +-------+-------+-------+

    Note that both tables must have the same set of fields, but that the order
    of the fields does not matter. See also the :func:`complement` function.

    See also the discussion of the `buffersize`, `tempdir` and `cache`
    arguments under the :func:`sort` function.

    The `buffersize`, `tempdir` and `cache` arguments are passed through
    unchanged to :func:`complement`.

    Raises an :class:`AssertionError` if the two tables do not have the same
    set of fields.

    .. versionadded:: 0.3

    """

    ha = header(a)
    hb = header(b)
    # Raise explicitly rather than via an ``assert`` statement: assert
    # statements are removed when Python runs with -O, which would silently
    # disable this check. The exception type and message are unchanged.
    if set(ha) != set(hb):
        raise AssertionError('both tables must have the same set of fields')
    # make sure fields are in the same order
    bv = cut(b, *ha)
    return complement(a, bv, buffersize=buffersize, tempdir=tempdir,
                      cache=cache)
def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False,
           buffersize=None, tempdir=None, cache=True):
    """
    Split a table into two tables by reversing an inner join. E.g.::

        >>> import petl as etl
        >>> # join key is present in the table
        ... table1 = (('foo', 'bar', 'baz'),
        ...           ('A', 1, 'apple'),
        ...           ('B', 1, 'apple'),
        ...           ('C', 2, 'orange'))
        >>> table2, table3 = etl.unjoin(table1, 'baz', key='bar')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'A' |   1 |
        +-----+-----+
        | 'B' |   1 |
        +-----+-----+
        | 'C' |   2 |
        +-----+-----+

        >>> table3
        +-----+----------+
        | bar | baz      |
        +=====+==========+
        |   1 | 'apple'  |
        +-----+----------+
        |   2 | 'orange' |
        +-----+----------+

        >>> # an integer join key can also be reconstructed
        ... table4 = (('foo', 'bar'),
        ...           ('A', 'apple'),
        ...           ('B', 'apple'),
        ...           ('C', 'orange'))
        >>> table5, table6 = etl.unjoin(table4, 'bar')
        >>> table5
        +-----+--------+
        | foo | bar_id |
        +=====+========+
        | 'A' |      1 |
        +-----+--------+
        | 'B' |      1 |
        +-----+--------+
        | 'C' |      2 |
        +-----+--------+

        >>> table6
        +----+----------+
        | id | bar      |
        +====+==========+
        |  1 | 'apple'  |
        +----+----------+
        |  2 | 'orange' |
        +----+----------+

    The `autoincrement` parameter controls how an integer join key is
    reconstructed, and should be a tuple of (`start`, `step`).

    """

    if key is not None:
        # join key is present: left is the distinct rows of the original
        # table with the value field cut out
        left_rows = distinct(cutout(table, value))
        # right is the distinct rows of the key and value fields only
        right_rows = distinct(cut(table, key, value))
        return left_rows, right_rows

    # join key must be reconstructed: work from the table sorted by the value
    # field, trusting the caller if it is declared presorted
    if presorted:
        by_value = table
    else:
        by_value = sort(table, value, buffersize=buffersize,
                        tempdir=tempdir, cache=cache)

    # left: view over the sorted table for the value field, configured with
    # the autoincrement (start, step)
    left_view = ConvertToIncrementingCounterView(by_value, value,
                                                 autoincrement)
    # right: view enumerating the distinct values of the same field
    right_view = EnumerateDistinctView(by_value, value, autoincrement)
    return left_view, right_view