Esempio n. 1
0
def groupcountdistinctvalues(table, key, value):
    """Group by the `key` field then count the number of distinct values in the
    `value` field."""

    s1 = cut(table, key, value)
    s2 = distinct(s1)
    s3 = aggregate(s2, key, len)
    return s3
Esempio n. 2
0
def groupcountdistinctvalues(table, key, value):
    """Group by the `key` field then count the number of distinct values in the
    `value` field."""
    
    s1 = cut(table, key, value)
    s2 = distinct(s1)
    s3 = aggregate(s2, key, len)
    return s3
Esempio n. 3
0
def test_key_distinct_2():
    # test for https://github.com/alimanfoo/petl/issues/318

    tbl = (('a', 'b'), ('x', '1'), ('x', '3'), ('y', '1'), (None, None))

    result = distinct(tbl, key='b')
    expect = (('a', 'b'), (None, None), ('x', '1'), ('x', '3'))
    ieq(expect, result)
Esempio n. 4
0
def test_distinct_count():

    table = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'),
             ('B', '2', '3.4'), ('D', 4, 12.3))

    result = distinct(table, count='count')
    expect = (('foo', 'bar', 'baz', 'count'), ('A', 1, 2, 1),
              ('B', '2', '3.4', 2), ('D', 4, 12.3, 1))
    ieq(expect, result)
Esempio n. 5
0
def test_distinct():

    table = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'),
             ('B', '2', '3.4'), ('D', 4, 12.3))

    result = distinct(table)
    expect = (('foo', 'bar', 'baz'), ('A', 1, 2), ('B', '2', '3.4'), ('D', 4,
                                                                      12.3))
    ieq(expect, result)
Esempio n. 6
0
def test_key_distinct():

    table = (('foo', 'bar', 'baz'), (None, None, None), ('A', 1, 2),
             ('B', '2', '3.4'), ('B', '2', '5'), ('D', 4, 12.3))

    result = distinct(table, key='foo')
    expect = (('foo', 'bar', 'baz'), (None, None, None), ('A', 1, 2),
              ('B', '2', '3.4'), ('D', 4, 12.3))
    ieq(expect, result)
Esempio n. 7
0
def test_key_distinct_2():
    # test for https://github.com/alimanfoo/petl/issues/318

    tbl = (('a', 'b'),
           ('x', '1'),
           ('x', '3'),
           ('y', '1'))

    result = distinct(tbl, key='b')
    expect = (('a', 'b'),
              ('x', '1'),
              ('x', '3'))
    ieq(expect, result)
Esempio n. 8
0
def test_key_distinct_count():

    table = (('foo', 'bar', 'baz'),
             ('A', 1, 2),
             ('B', '2', '3.4'),
             ('B', '2', '5'),
             ('D', 4, 12.3))

    result = distinct(table, key='foo', count='count')
    expect = (('foo', 'bar', 'baz', 'count'),
              ('A', 1, 2, 1),
              ('B', '2', '3.4', 2),
              ('D', 4, 12.3, 1))
    ieq(expect, result)
Esempio n. 9
0
def test_distinct():

    table = (('foo', 'bar', 'baz'),
             ('A', 1, 2),
             ('B', '2', '3.4'),
             ('B', '2', '3.4'),
             ('D', 4, 12.3))

    result = distinct(table)
    expect = (('foo', 'bar', 'baz'),
              ('A', 1, 2),
              ('B', '2', '3.4'),
              ('D', 4, 12.3))
    ieq(expect, result)
Esempio n. 10
0
def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False,
           buffersize=None, tempdir=None, cache=True):
    """
    Split a table into two tables by reversing an inner join.

    E.g., if the join key is present in the table::

        >>> from petl import look, unjoin
        >>> look(table1)
        +-------+-------+----------+
        | 'foo' | 'bar' | 'baz'    |
        +=======+=======+==========+
        | 'A'   | 1     | 'apple'  |
        +-------+-------+----------+
        | 'B'   | 1     | 'apple'  |
        +-------+-------+----------+
        | 'C'   | 2     | 'orange' |
        +-------+-------+----------+

        >>> table2, table3 = unjoin(table1, 'baz', key='bar')
        >>> look(table2)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'A'   | 1     |
        +-------+-------+
        | 'B'   | 1     |
        +-------+-------+
        | 'C'   | 2     |
        +-------+-------+

        >>> look(table3)
        +-------+----------+
        | 'bar' | 'baz'    |
        +=======+==========+
        | 1     | 'apple'  |
        +-------+----------+
        | 2     | 'orange' |
        +-------+----------+

    An integer join key can also be reconstructed, e.g.::

        >>> look(table4)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 'A'   | 'apple'  |
        +-------+----------+
        | 'B'   | 'apple'  |
        +-------+----------+
        | 'C'   | 'orange' |
        +-------+----------+

        >>> table5, table6 = unjoin(table4, 'bar')
        >>> look(table5)
        +-------+----------+
        | 'foo' | 'bar_id' |
        +=======+==========+
        | 'A'   | 1        |
        +-------+----------+
        | 'B'   | 1        |
        +-------+----------+
        | 'C'   | 2        |
        +-------+----------+

        >>> look(table6)
        +------+----------+
        | 'id' | 'bar'    |
        +======+==========+
        | 1    | 'apple'  |
        +------+----------+
        | 2    | 'orange' |
        +------+----------+

    .. versionadded:: 0.12

    """

    if key is None:
        # first sort the table by the value field
        if presorted:
            tbl_sorted = table
        else:
            tbl_sorted = sort(table, value, buffersize=buffersize,
                              tempdir=tempdir, cache=cache)
        # on the left, return the original table but with the value field
        # replaced by an incrementing integer
        left = ConvertToIncrementingCounterView(tbl_sorted, value,
                                                autoincrement)
        # on the right, return a new table with distinct values from the
        # given field
        right = EnumerateDistinctView(tbl_sorted, value, autoincrement)
    else:
        # on the left, return distinct rows from the original table
        # with the value field cut out
        left = distinct(cutout(table, value))
        # on the right, return distinct rows from the original table
        # with all fields but the key and value cut out
        right = distinct(cut(table, key, value))
    return left, right
Esempio n. 11
0
def unjoin(table,
           value,
           key=None,
           autoincrement=(1, 1),
           presorted=False,
           buffersize=None,
           tempdir=None,
           cache=True):
    """
    Split a table into two tables by reversing an inner join. E.g.::

        >>> import petl as etl
        >>> # join key is present in the table
        ... table1 = (('foo', 'bar', 'baz'),
        ...           ('A', 1, 'apple'),
        ...           ('B', 1, 'apple'),
        ...           ('C', 2, 'orange'))
        >>> table2, table3 = etl.unjoin(table1, 'baz', key='bar')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'A' |   1 |
        +-----+-----+
        | 'B' |   1 |
        +-----+-----+
        | 'C' |   2 |
        +-----+-----+

        >>> table3
        +-----+----------+
        | bar | baz      |
        +=====+==========+
        |   1 | 'apple'  |
        +-----+----------+
        |   2 | 'orange' |
        +-----+----------+

        >>> # an integer join key can also be reconstructed
        ... table4 = (('foo', 'bar'),
        ...           ('A', 'apple'),
        ...           ('B', 'apple'),
        ...           ('C', 'orange'))
        >>> table5, table6 = etl.unjoin(table4, 'bar')
        >>> table5
        +-----+--------+
        | foo | bar_id |
        +=====+========+
        | 'A' |      1 |
        +-----+--------+
        | 'B' |      1 |
        +-----+--------+
        | 'C' |      2 |
        +-----+--------+

        >>> table6
        +----+----------+
        | id | bar      |
        +====+==========+
        |  1 | 'apple'  |
        +----+----------+
        |  2 | 'orange' |
        +----+----------+

    The `autoincrement` parameter controls how an integer join key is
    reconstructed, and should be a tuple of (`start`, `step`).

    """

    if key is None:
        # first sort the table by the value field
        if presorted:
            tbl_sorted = table
        else:
            tbl_sorted = sort(table,
                              value,
                              buffersize=buffersize,
                              tempdir=tempdir,
                              cache=cache)
        # on the left, return the original table but with the value field
        # replaced by an incrementing integer
        left = ConvertToIncrementingCounterView(tbl_sorted, value,
                                                autoincrement)
        # on the right, return a new table with distinct values from the
        # given field
        right = EnumerateDistinctView(tbl_sorted, value, autoincrement)
    else:
        # on the left, return distinct rows from the original table
        # with the value field cut out
        left = distinct(cutout(table, value))
        # on the right, return distinct rows from the original table
        # with all fields but the key and value cut out
        right = distinct(cut(table, key, value))
    return left, right
Esempio n. 12
0
def unjoin(table, value, key=None, autoincrement=(1, 1), presorted=False,
           buffersize=None, tempdir=None, cache=True):
    """
    Split a table into two tables by reversing an inner join. E.g.::

        >>> import petl as etl
        >>> # join key is present in the table
        ... table1 = (('foo', 'bar', 'baz'),
        ...           ('A', 1, 'apple'),
        ...           ('B', 1, 'apple'),
        ...           ('C', 2, 'orange'))
        >>> table2, table3 = etl.unjoin(table1, 'baz', key='bar')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'A' |   1 |
        +-----+-----+
        | 'B' |   1 |
        +-----+-----+
        | 'C' |   2 |
        +-----+-----+

        >>> table3
        +-----+----------+
        | bar | baz      |
        +=====+==========+
        |   1 | 'apple'  |
        +-----+----------+
        |   2 | 'orange' |
        +-----+----------+

        >>> # an integer join key can also be reconstructed
        ... table4 = (('foo', 'bar'),
        ...           ('A', 'apple'),
        ...           ('B', 'apple'),
        ...           ('C', 'orange'))
        >>> table5, table6 = etl.unjoin(table4, 'bar')
        >>> table5
        +-----+--------+
        | foo | bar_id |
        +=====+========+
        | 'A' |      1 |
        +-----+--------+
        | 'B' |      1 |
        +-----+--------+
        | 'C' |      2 |
        +-----+--------+

        >>> table6
        +----+----------+
        | id | bar      |
        +====+==========+
        |  1 | 'apple'  |
        +----+----------+
        |  2 | 'orange' |
        +----+----------+

    The `autoincrement` parameter controls how an integer join key is
    reconstructed, and should be a tuple of (`start`, `step`).

    """

    if key is None:
        # first sort the table by the value field
        if presorted:
            tbl_sorted = table
        else:
            tbl_sorted = sort(table, value, buffersize=buffersize,
                              tempdir=tempdir, cache=cache)
        # on the left, return the original table but with the value field
        # replaced by an incrementing integer
        left = ConvertToIncrementingCounterView(tbl_sorted, value,
                                                autoincrement)
        # on the right, return a new table with distinct values from the
        # given field
        right = EnumerateDistinctView(tbl_sorted, value, autoincrement)
    else:
        # on the left, return distinct rows from the original table
        # with the value field cut out
        left = distinct(cutout(table, value))
        # on the right, return distinct rows from the original table
        # with all fields but the key and value cut out
        right = distinct(cut(table, key, value))
    return left, right