Example #1
def compute_up(expr, data, **kwargs):
    names = data.c.keys()
    assert names == expr._child.fields
    d = dict(zip(names, getattr(data, 'inner_columns', data.c)))
    return sa.select(
        d[col].label(new_col) if col != new_col else d[col]
        for col, new_col in zip(expr._child.fields, expr.fields)
    )
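The relabeling idiom above is easier to see against plain SQLAlchemy. A minimal sketch, assuming SQLAlchemy 1.4+ (where select() takes columns positionally; older versions take a list) and a hypothetical table t:

import sqlalchemy as sa

metadata = sa.MetaData()
t = sa.Table('t', metadata,
             sa.Column('a', sa.Integer),
             sa.Column('b', sa.Integer))

# Relabel 'b' as 'c' while passing 'a' through unchanged, mirroring the
# (col, new_col) pairs above.
sel = sa.select(t.c.a, t.c.b.label('c'))
print(sel)  # SELECT t.a, t.b AS c FROM t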
Example #2
def test_unzip():
    def _to_lists(seq, n=10):
        """iter of iters -> finite list of finite lists
        """
        def initial(s):
            return list(take(n, s))

        return initial(map(initial, seq))

    def _assert_initial_matches(a, b, n=10):
        assert list(take(n, a)) == list(take(n, b))

    # Unzips a simple list correctly
    assert _to_lists(unzip([('a', 1), ('b', 2), ('c', 3)])) \
        == [['a', 'b', 'c'], [1, 2, 3]]

    # Can handle a finite number of infinite iterators (the naive
    # `zip(*args)` implementation fails on this example).
    a, b, c = unzip(zip(count(1), repeat(0), repeat(1)))
    _assert_initial_matches(a, count(1))
    _assert_initial_matches(b, repeat(0))
    _assert_initial_matches(c, repeat(1))

    # Sensibly handles empty input
    assert list(unzip(zip([]))) == []
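For reference, one possible unzip that this test would accept; a sketch built on itertools only (toolz's actual implementation may differ):

import itertools
import operator

def unzip_sketch(seq):
    """Lazy inverse of zip: iterable of n-tuples -> tuple of n iterators."""
    it = iter(seq)
    try:
        first = next(it)
    except StopIteration:
        return ()
    # Re-attach the peeked tuple, then give each column its own tee'd copy,
    # so finitely many infinite iterators are handled lazily.
    stream = itertools.chain([first], it)
    copies = itertools.tee(stream, len(first))
    return tuple(map(operator.itemgetter(i), copy)
                 for i, copy in enumerate(copies))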
Example #3
def compute_up(expr, data, **kwargs):
    names = data.c.keys()
    assert names == expr._child.fields, (
        'names = %r\nexpr._child.fields = %r' % (names, expr._child.fields))
    d = dict(zip(names, getattr(data, 'inner_columns', data.c)))
    return reconstruct_select(
        (d[col].label(new_col) if col != new_col else d[col]
         for col, new_col in zip(expr._child.fields, expr.fields)),
        data,
    )
Example #4
def from_delayed(values):
    """ Create bag from many dask.delayed objects

    Parameters
    ----------
    values : list of Values
        An iterable of dask.delayed.Value objects, such as those produced
        by dask.do. These comprise the individual partitions of the
        resulting bag.

    Returns
    -------
    Bag

    Examples
    --------
    >>> b = from_delayed([x, y, z])  # doctest: +SKIP
    """
    from dask.delayed import Value
    if isinstance(values, Value):
        values = [values]
    dsk = merge(v.dask for v in values)

    name = 'bag-from-delayed-' + tokenize(*values)
    names = [(name, i) for i in range(len(values))]
    values = [v.key for v in values]
    dsk2 = dict(zip(names, values))

    return Bag(merge(dsk, dsk2), name, len(values))
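The dict(zip(names, values)) step simply aliases each new (name, i) partition key to the key of its delayed value. The pairing in isolation, with hypothetical keys:

name = 'bag-from-delayed-abc123'
value_keys = ['x-key', 'y-key', 'z-key']
names = [(name, i) for i in range(len(value_keys))]
dsk2 = dict(zip(names, value_keys))
# {('bag-from-delayed-abc123', 0): 'x-key',
#  ('bag-from-delayed-abc123', 1): 'y-key',
#  ('bag-from-delayed-abc123', 2): 'z-key'}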
Example #5
def compute_up(expr, data, **kwargs):
    if not valid_grouper(expr.grouper):
        raise TypeError("Grouper must have a non-nested record or one "
                        "dimensional collection datashape, "
                        "got %s of type %r with dshape %s" %
                        (expr.grouper, type(expr.grouper).__name__,
                         expr.grouper.dshape))
    grouper = get_inner_columns(
        compute(
            expr.grouper,
            data,
            post_compute=False,
            return_type='native',
        ),
    )
    app = expr.apply
    reductions = [
        compute(
            val,
            data,
            post_compute=False,
            return_type='native',
        ).label(name)
        for val, name in zip(app.values, app.fields)
    ]

    return sa.select(grouper + reductions).group_by(*grouper)
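Stripped of the blaze machinery, the last line builds an ordinary grouped select. A minimal sketch, assuming SQLAlchemy 1.4+ and a hypothetical table:

import sqlalchemy as sa

metadata = sa.MetaData()
payments = sa.Table('payments', metadata,
                    sa.Column('name', sa.String),
                    sa.Column('amount', sa.Integer))

grouper = [payments.c.name]
reductions = [sa.func.sum(payments.c.amount).label('total')]
# SELECT payments.name, sum(payments.amount) AS total
# FROM payments GROUP BY payments.name
sel = sa.select(*(grouper + reductions)).group_by(*grouper)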
Example #6
def compute_up(t, s, **kwargs):
    columns = [getattr(s.c, col).label(new_col)
               if col != new_col else
               getattr(s.c, col)
               for col, new_col in zip(t._child.fields, t.fields)]

    return select(columns)
Example #7
def compute_up(expr, data, scope=None, **kwargs):
    data = lower_column(data)
    grouper = compute(
        expr.grouper,
        scope,
        post_compute=False,
        return_type='native',
        **kwargs
    )

    app = expr.apply
    reductions = [
        compute(
            val,
            data,
            post_compute=None,
            return_type='native',
        ).label(name)
        for val, name in zip(app.values, app.fields)
    ]

    froms = list(unique(chain(get_all_froms(grouper),
                              concat(map(get_all_froms, reductions)))))
    inner_cols = list(getattr(grouper, 'inner_columns', [grouper]))
    grouper_cols = inner_cols[:]
    inner_cols.extend(concat(
        getattr(getattr(r, 'element', None), 'inner_columns', [r])
        for r in reductions
    ))
    wheres = unify_wheres([grouper] + reductions)
    sel = unify_froms(sa.select(inner_cols, whereclause=wheres), froms)
    return sel.group_by(*grouper_cols)
Example #8
def interpose(el, seq):
    """ Introduce element between each pair of elements in seq

    >>> list(interpose("a", [1, 2, 3]))
    [1, 'a', 2, 'a', 3]
    """
    combined = zip(itertools.repeat(el), seq)
    return drop(1, concat(combined))
Example #9
def compute_up(t, s, **kwargs):
    scope = {t._child: s}
    return sa.select(
        compute(
            value,
            scope,
            post_compute=None,
            return_type='native',
        ).label(name)
        for value, name in zip(t.values, t.fields)
    )
Example #10
def interpose(el, seq):
    """ Introduce element between each pair of elements in seq

    >>> list(interpose("a", [1, 2, 3]))
    [1, 'a', 2, 'a', 3]
    """
    inposed = concat(zip(itertools.repeat(el), seq))
    next(inposed)
    return inposed
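Both interpose variants rest on the same zip interleaving; they differ only in how the leading el is discarded (drop(1, ...) versus an explicit next). The shared idea, standalone:

import itertools

el, seq = 'a', [1, 2, 3]
pairs = zip(itertools.repeat(el), seq)       # ('a', 1), ('a', 2), ('a', 3)
flat = itertools.chain.from_iterable(pairs)  # 'a', 1, 'a', 2, 'a', 3
next(flat)                                   # discard the leading 'a'
assert list(flat) == [1, 'a', 2, 'a', 3]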
Example #11
def compute_up(expr, data, **kwargs):
    grouper = lower_column(data)
    app = expr.apply
    if isinstance(app, Reduction):
        reductions = [compute(app, data, post_compute=False)]
    elif isinstance(app, Summary):
        reductions = [compute(val, data, post_compute=None).label(name)
                      for val, name in zip(app.values, app.fields)]

    return sa.select([grouper] + reductions).group_by(grouper)
Example #12
def keymap(func, d):
    """ Apply function to keys of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> keymap(str.lower, bills)  # doctest: +SKIP
    {'alice': [20, 15, 30], 'bob': [10, 35]}

    See Also:
        valmap
    """
    return dict(zip(map(func, iterkeys(d)), itervalues(d)))
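The rebuild relies on iterkeys and itervalues walking the dict in matching order, which Python guarantees for an unmodified dict. The same idiom on a plain Python 3 dict:

bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
lowered = dict(zip(map(str.lower, bills.keys()), bills.values()))
assert lowered == {'alice': [20, 15, 30], 'bob': [10, 35]}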
Example #13
def valmap(func, d):
    """ Apply function to values of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> valmap(sum, bills)  # doctest: +SKIP
    {'Alice': 65, 'Bob': 45}

    See Also:
        keymap
    """
    return dict(zip(iterkeys(d), map(func, itervalues(d))))
Example #14
def compute_up(t, s, scope=None, **kwargs):
    d = dict((t._child[c], list(inner_columns(s))[i])
             for i, c in enumerate(t._child.fields))

    cols = [compute(val, toolz.merge(scope, d), post_compute=None).label(name)
            for name, val in zip(t.fields, t.values)]

    s = copy(s)
    for c in cols:
        s.append_column(c)

    return s.with_only_columns(cols)
Example #15
def keymap(func, d, factory=dict):
    """ Apply function to keys of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> keymap(str.lower, bills)  # doctest: +SKIP
    {'alice': [20, 15, 30], 'bob': [10, 35]}

    See Also:
        valmap
        itemmap
    """
    rv = factory()
    rv.update(zip(map(func, iterkeys(d)), itervalues(d)))
    return rv
Example #16
def valmap(func, d, factory=dict):
    """ Apply function to values of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> valmap(sum, bills)  # doctest: +SKIP
    {'Alice': 65, 'Bob': 45}

    See Also:
        keymap
        itemmap
    """
    rv = factory()
    rv.update(zip(iterkeys(d), map(func, itervalues(d))))
    return rv
Example #17
def sliding_window(n, seq):
    """ A sequence of overlapping subsequences

    >>> list(sliding_window(2, [1, 2, 3, 4]))
    [(1, 2), (2, 3), (3, 4)]

    This function creates a sliding window suitable for transformations like
    sliding means / smoothing

    >>> mean = lambda seq: float(sum(seq)) / len(seq)
    >>> list(map(mean, sliding_window(2, [1, 2, 3, 4])))
    [1.5, 2.5, 3.5]
    """
    return zip(*(collections.deque(itertools.islice(it, i), 0) or it
                 for i, it in enumerate(itertools.tee(seq, n))))
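The deque(islice(it, i), 0) or it expression is dense: a maxlen-0 deque consumes and discards the first i items of the i-th tee'd iterator, and because an empty deque is falsy, or it then yields the advanced iterator itself. A more explicit sketch of the same idea (hypothetical name):

import collections
import itertools

def sliding_window_explicit(n, seq):
    iterators = itertools.tee(seq, n)
    for i, it in enumerate(iterators):
        # Advance the i-th copy by i elements; maxlen=0 stores nothing.
        collections.deque(itertools.islice(it, i), maxlen=0)
    return zip(*iterators)

assert (list(sliding_window_explicit(2, [1, 2, 3, 4]))
        == [(1, 2), (2, 3), (3, 4)])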
Example #18
def load_castra_partition(castra, part, columns, index):
    import blosc
    # Due to serialization issues, blosc needs to be manually initialized in
    # each process.
    blosc.init()

    df = castra.load_partition(part, columns)
    if isinstance(columns, list):
        items = df.itertuples(index)
    else:
        items = df.iteritems() if index else iter(df)

    items = list(items)
    if items and isinstance(items[0], tuple) and type(items[0]) is not tuple:
        names = items[0]._fields
        items = [dict(zip(names, item)) for item in items]

    return items
Example #19
def diff(*seqs, **kwargs):
    """ Return those items that differ between sequences

    >>> list(diff([1, 2, 3], [1, 2, 10, 100]))
    [(3, 10)]

    Shorter sequences may be padded with a ``default`` value:

    >>> list(diff([1, 2, 3], [1, 2, 10, 100], default=None))
    [(3, 10), (None, 100)]

    A ``key`` function may also be applied to each item to use during
    comparisons:

    >>> list(diff(['apples', 'bananas'], ['Apples', 'Oranges'], key=str.lower))
    [('bananas', 'Oranges')]
    """
    N = len(seqs)
    if N == 1 and isinstance(seqs[0], list):
        seqs = seqs[0]
        N = len(seqs)
    if N < 2:
        raise TypeError('Too few sequences given (min 2 required)')
    default = kwargs.get('default', no_default)
    if default == no_default:
        iters = zip(*seqs)
    else:
        iters = zip_longest(*seqs, fillvalue=default)
    key = kwargs.get('key', None)
    if key is None:
        for items in iters:
            if items.count(items[0]) != N:
                yield items
    else:
        for items in iters:
            vals = tuple(map(key, items))
            if vals.count(vals[0]) != N:
                yield items
Example #20
def partition(n, seq, pad=no_pad):
    """ Partition sequence into tuples of length n

    >>> list(partition(2, [1, 2, 3, 4]))
    [(1, 2), (3, 4)]

    If the length of ``seq`` is not evenly divisible by ``n``, the final tuple
    is dropped if ``pad`` is not specified, or filled to length ``n`` by pad:

    >>> list(partition(2, [1, 2, 3, 4, 5]))
    [(1, 2), (3, 4)]

    >>> list(partition(2, [1, 2, 3, 4, 5], pad=None))
    [(1, 2), (3, 4), (5, None)]

    See Also:
        partition_all
    """
    args = [iter(seq)] * n
    if pad is no_pad:
        return zip(*args)
    else:
        return zip_longest(*args, fillvalue=pad)
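[iter(seq)] * n is the classic grouping trick: the list holds the same iterator object n times, so each output tuple pulls n consecutive items from it:

it = iter([1, 2, 3, 4, 5])
args = [it] * 2                                # two references, one iterator
assert list(zip(*args)) == [(1, 2), (3, 4)]    # the trailing 5 is dropped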
Example #21
def mean_aggregate(x):
    totals, counts = list(zip(*x))
    return 1.0 * sum(totals) / sum(counts)
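Here zip(*x) transposes a sequence of (total, count) pairs into one tuple of totals and one of counts, so each can be summed independently:

pairs = [(10, 2), (20, 3), (30, 5)]
totals, counts = list(zip(*pairs))
assert totals == (10, 20, 30) and counts == (2, 3, 5)
assert 1.0 * sum(totals) / sum(counts) == 6.0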
Example #22
def var_aggregate(x):
    squares, totals, counts = list(zip(*x))
    x2, x, n = float(sum(squares)), float(sum(totals)), sum(counts)
    result = (x2 / n) - (x / n)**2
    # ddof is a free variable supplied by the enclosing scope.
    return result * n / (n - ddof)
Example #23
def compute(t, seq):
    seq1, seq2 = itertools.tee(seq)
    parent = compute(t.parent, seq1)
    predicate = compute(t.predicate, seq2)
    return (x for x, tf in zip(parent, predicate) if tf)
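The tee/zip pairing filters one stream by a parallel stream of booleans while consuming the source only once. A standalone sketch with stand-ins for the recursive compute calls:

import itertools

seq = iter([1, 2, 3, 4])
seq1, seq2 = itertools.tee(seq)
parent = (x * 10 for x in seq1)           # stand-in for compute(t.parent, seq1)
predicate = (x % 2 == 0 for x in seq2)    # stand-in for compute(t.predicate, seq2)
assert [x for x, tf in zip(parent, predicate) if tf] == [20, 40]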