def test_EqualityHashKey_default_key():
    EqualityHashDefault = curry(EqualityHashKey, None)
    L1 = [1]
    L2 = [2]
    data1 = [L1, L1, L2, [], [], [1], [2], {}, ()]
    set1 = set(map(EqualityHashDefault, data1))
    set2 = set(map(EqualityHashDefault, [[], [1], [2], {}, ()]))
    assert set1 == set2
    assert len(set1) == 5

    # Test that ``EqualityHashDefault(item)`` is distinct from ``item``
    T0 = ()
    T1 = (1,)
    data2 = list(map(EqualityHashDefault, [T0, T0, T1, T1, (), (1,)]))
    data2.extend([T0, T1, (), (1,)])
    set3 = set(data2)
    assert set3 == set([(), (1,), EqualityHashDefault(()),
                        EqualityHashDefault((1,))])
    assert len(set3) == 4
    assert EqualityHashDefault(()) in set3
    assert EqualityHashDefault((1,)) in set3

    # Miscellaneous
    E1 = EqualityHashDefault(L1)
    E2 = EqualityHashDefault(L2)
    assert str(E1) == '=[1]='
    assert repr(E1) == '=[1]='
    assert E1 != E2
    assert not (E1 == E2)
    assert E1 == EqualityHashDefault(L1)
    assert not (E1 != EqualityHashDefault(L1))
    assert E1 != L1
    assert not (E1 == L1)
def pluck(ind, seqs, default=no_default):
    """ Plucks an element or several elements from each item in a sequence.

    ``pluck`` maps ``itertoolz.get`` over a sequence and returns one or more
    elements of each item in the sequence.

    This is equivalent to running ``map(curried.get(ind), seqs)``.

    ``ind`` can be either a single string/index or a sequence of
    strings/indices.
    ``seqs`` should be a sequence containing sequences or dicts.

    e.g.

    >>> data = [{'id': 1, 'name': 'Cheese'}, {'id': 2, 'name': 'Pies'}]
    >>> list(pluck('name', data))
    ['Cheese', 'Pies']

    >>> list(pluck([0, 1], [[1, 2, 3], [4, 5, 7]]))
    [(1, 2), (4, 5)]

    See Also:
        get
        map
    """
    if default is no_default:
        if isinstance(ind, list):
            return map(operator.itemgetter(*ind), seqs)
        return map(operator.itemgetter(ind), seqs)
    elif isinstance(ind, list):
        return (tuple(_get(item, seq, default) for item in ind)
                for seq in seqs)
    return (_get(ind, seq, default) for seq in seqs)
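# Usage sketch for ``pluck`` above: when a ``default`` is given, missing keys
# or indices yield that value instead of raising, assuming the toolz-style
# ``_get`` helper referenced in the function body.
assert list(pluck('name', [{'name': 'Cheese'}, {}], default=None)) == \
    ['Cheese', None]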
def fold(binop, seq, default=no_default, map=map, chunksize=128, combine=None):
    """
    Reduce without guarantee of ordered reduction.

    inputs:

    ``binop``     - associative operator. The associative property allows us
                    to leverage a parallel map to perform reductions in
                    parallel.
    ``seq``       - a sequence to be aggregated
    ``default``   - an identity element like 0 for ``add`` or 1 for ``mul``
    ``map``       - an implementation of ``map``. This may be parallel and
                    determines how work is distributed.
    ``chunksize`` - Number of elements of ``seq`` that should be handled
                    within a single function call
    ``combine``   - Binary operator to combine two intermediate results.
                    If ``binop`` is of type (total, item) -> total
                    then ``combine`` is of type (total, total) -> total
                    Defaults to ``binop`` for common case of operators
                    like add

    Fold chunks up the collection into blocks of size ``chunksize`` and then
    feeds each of these to calls to ``reduce``. This work is distributed with
    a call to ``map``, gathered back and then refolded to finish the
    computation. In this way ``fold`` specifies only how to chunk up data but
    leaves the distribution of this work to an externally provided ``map``
    function. This function can be sequential or rely on multithreading,
    multiprocessing, or even distributed solutions.

    If ``map`` intends to serialize functions it should be prepared to accept
    and serialize lambdas. Note that the standard ``pickle`` module fails
    here.

    Example
    -------

    >>> # Provide a parallel map to accomplish a parallel sum
    >>> from operator import add
    >>> fold(add, [1, 2, 3, 4], chunksize=2, map=map)
    10
    """
    if combine is None:
        combine = binop

    chunks = partition_all(chunksize, seq)

    # Evaluate sequence in chunks via map
    if default is no_default:
        results = map(lambda chunk: reduce(binop, chunk), chunks)
    else:
        results = map(lambda chunk: reduce(binop, chunk, default), chunks)

    results = list(results)  # TODO: Support complete laziness

    if len(results) == 1:    # Return completed result
        return results[0]
    else:                    # Recurse to reaggregate intermediate results
        return fold(combine, results, map=map, chunksize=chunksize)
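# Usage sketch for ``fold`` above: when the per-chunk operator and the
# operator that merges chunk totals differ, pass ``combine`` explicitly.
# Here each chunk is reduced to a count of its items and the counts are
# then summed.
from operator import add

n = fold(lambda acc, _: acc + 1, range(10), default=0, chunksize=4,
         combine=add)
assert n == 10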
def multihash(x):
    try:
        return hash(x)
    except TypeError:
        if isinstance(x, (list, tuple, set, frozenset)):
            return hash(tuple(map(multihash, x)))
        if type(x) is dict:
            return hash(frozenset(map(multihash, x.items())))
        if type(x) is slice:
            return hash((x.start, x.stop, x.step))
        raise TypeError("Hashing not covered for " + str(x))
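# Usage sketch for ``multihash`` above: nested containers that are not
# normally hashable can still be hashed by value, e.g. to key a memo table.
# Exact hash values are implementation-defined; only the fact that equal
# structures hash alike is relied on here.
memo = {}
memo[multihash({'a': [1, 2], 'b': {3}})] = 'cached'
assert memo[multihash({'a': [1, 2], 'b': {3}})] == 'cached'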
def test_load_from_dir_of_jsonlines(ctx):
    dfs = []
    dfc = df.copy()
    for i in range(3):
        dfc['id'] += i
        dfs.append(dfc.copy())
    expected = pd.concat(dfs, axis=0, ignore_index=True)
    with jslines() as d:
        result = odo(Directory(JSONLines)(d), ctx)
        assert (set(map(frozenset, odo(result, list))) ==
                set(map(frozenset, odo(expected, list))))
def _to_lists(seq, n=10):
    """iter of iters -> finite list of finite lists
    """
    def initial(s):
        return list(take(n, s))

    return initial(map(initial, seq))
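# Usage sketch for ``_to_lists`` above: possibly infinite iterables are
# truncated to at most ``n`` items at both levels (``take`` is assumed to be
# the toolz-style ``take``).
from itertools import count
assert _to_lists([count(), count()], n=3) == [[0, 1, 2], [0, 1, 2]]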
def merge_sorted(*iters, **kwargs):
    """ Merge and sort a collection of sorted collections

    >>> list(merge_sorted([1, 3, 5], [2, 4, 6]))
    [1, 2, 3, 4, 5, 6]

    >>> ''.join(merge_sorted('abc', 'abc', 'abc'))
    'aaabbbccc'
    """
    key = kwargs.get('key', identity)
    iters = map(iter, iters)
    pq = Queue.PriorityQueue()

    def inject_first_element(it, tiebreaker=None):
        try:
            item = next(it)
            pq.put((key(item), item, tiebreaker, it))
        except StopIteration:
            pass

    # Initial population
    for i, it in enumerate(iters):
        inject_first_element(it, i)

    # Repeatedly yield and then repopulate from the same iterator
    while not pq.empty():
        _, item, tb, it = pq.get()
        yield item
        inject_first_element(it, tb)
def attr_pluck(attr, objs, default=no_default):
    """ Plucks an attribute or several attributes from each object in a
    sequence.

    ``attr_pluck`` maps ``itertoolz.attr_get`` over a sequence and returns
    one or more attributes of each object in the sequence.

    This is equivalent to running ``map(curried.attr_get(attr), objs)``.

    ``attr`` can be either a single string or a list of strings.
    ``objs`` should be a sequence containing objects.

    e.g.

    >>> class A(object):
    ...     pass
    >>> a1 = A(); a1.id = 1; a1.name = "Cheese"
    >>> a2 = A(); a2.id = 2; a2.name = "Pies"
    >>> list(attr_pluck('name', [a1, a2]))
    ['Cheese', 'Pies']

    See Also:
        attr_get
        map
    """
    if default is no_default:
        get = attr_getter(attr)
        return map(get, objs)
    elif isinstance(attr, list):
        return (tuple(getattr(obj, item, default) for item in attr)
                for obj in objs)
    return (getattr(obj, attr, default) for obj in objs)
def mapcat(func, seqs):
    """ Apply func to each sequence in seqs, concatenating results.

    >>> list(mapcat(lambda s: [c.upper() for c in s],
    ...             [["a", "b"], ["c", "d", "e"]]))
    ['A', 'B', 'C', 'D', 'E']
    """
    return concat(map(func, seqs))
def merge_sorted(*seqs, **kwargs):
    """ Merge and sort a collection of sorted collections

    This works lazily and only keeps one value from each iterable in memory.

    >>> list(merge_sorted([1, 3, 5], [2, 4, 6]))
    [1, 2, 3, 4, 5, 6]

    >>> ''.join(merge_sorted('abc', 'abc', 'abc'))
    'aaabbbccc'

    The "key" function used to sort the input may be passed as a keyword.

    >>> list(merge_sorted([2, 3], [1, 3], key=lambda x: x // 3))
    [2, 1, 3, 3]
    """
    key = kwargs.get('key', None)
    if key is None:
        # heapq.merge does what we do below except by val instead of key(val)
        for item in heapq.merge(*seqs):
            yield item
    else:
        # The commented code below shows an alternative (slower)
        # implementation to apply a key function for sorting.
        #
        # mapper = lambda i, item: (key(item), i, item)
        # keyiters = [map(partial(mapper, i), itr) for i, itr in
        #             enumerate(seqs)]
        # return (item for (item_key, i, item) in heapq.merge(*keyiters))

        # binary heap as a priority queue
        pq = []

        # Initial population
        for itnum, it in enumerate(map(iter, seqs)):
            try:
                item = next(it)
                pq.append([key(item), itnum, item, it])
            except StopIteration:
                pass
        heapq.heapify(pq)

        # Repeatedly yield and then repopulate from the same iterator
        while True:
            try:
                while True:
                    # raises IndexError when pq is empty
                    _, itnum, item, it = s = pq[0]
                    yield item
                    item = next(it)  # raises StopIteration when exhausted
                    s[0] = key(item)
                    s[2] = item
                    heapq.heapreplace(pq, s)  # restore heap condition
            except StopIteration:
                heapq.heappop(pq)  # remove empty iterator
            except IndexError:
                return
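# Usage sketch for ``merge_sorted`` above: because the merge is lazy, it can
# combine unbounded sorted streams while holding only one pending item per
# input.
from itertools import count, islice
assert list(islice(merge_sorted(count(0, 2), count(1, 2)), 6)) == \
    [0, 1, 2, 3, 4, 5]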
def pprint(g):
    """ Pretty print a tree of goals """
    if callable(g) and hasattr(g, "__name__"):
        return g.__name__
    if isinstance(g, type):  # pragma: no cover
        return g.__name__
    if isinstance(g, tuple):
        return "(" + ", ".join(map(pprint, g)) + ")"
    return str(g)
def freeze(d):
    """ Freeze container to hashable form

    >>> freeze(1)
    1

    >>> freeze([1, 2])
    (1, 2)

    >>> freeze({1: 2})  # doctest: +SKIP
    frozenset([(1, 2)])
    """
    if isinstance(d, dict):
        return frozenset(map(freeze, d.items()))
    if isinstance(d, set):
        return frozenset(map(freeze, d))
    if isinstance(d, (tuple, list)):
        return tuple(map(freeze, d))
    return d
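# Usage sketch for ``freeze`` above: frozen values are hashable, so nested,
# mutable structures can serve as dictionary or set keys.
cache = {}
cache[freeze({'x': [1, 2], 'y': {3}})] = 'result'
assert cache[freeze({'x': [1, 2], 'y': {3}})] == 'result'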
def keymap(func, d):
    """ Apply function to keys of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> keymap(str.lower, bills)  # doctest: +SKIP
    {'alice': [20, 15, 30], 'bob': [10, 35]}

    See Also:
        valmap
    """
    return dict(zip(map(func, iterkeys(d)), itervalues(d)))
def valmap(func, d):
    """ Apply function to values of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> valmap(sum, bills)  # doctest: +SKIP
    {'Alice': 65, 'Bob': 45}

    See Also:
        keymap
    """
    return dict(zip(iterkeys(d), map(func, itervalues(d))))
def interleave(seqs, pass_exceptions=()):
    iters = map(iter, seqs)
    while iters:
        newiters = []
        for itr in iters:
            try:
                yield next(itr)
                newiters.append(itr)
            except (StopIteration,) + tuple(pass_exceptions):
                pass
        iters = newiters
def test_append_spark_df_to_json_lines(ctx):
    out = os.linesep.join(map(json.dumps, df.to_dict('records')))
    sdf = ctx.table('t')
    expected = (pd.concat([df, df])
                .sort('amount')
                .reset_index(drop=True)
                .sort_index(axis=1))
    with tmpfile('.json') as fn:
        with open(fn, mode='w') as f:
            f.write(out + os.linesep)

        uri = 'jsonlines://%s' % fn
        odo(sdf, uri)
        result = (odo(uri, pd.DataFrame)
                  .sort('amount')
                  .reset_index(drop=True)
                  .sort_index(axis=1))
        tm.assert_frame_equal(result, expected)
def itemmap(func, d):
    """ Apply function to items of dictionary

    >>> accountids = {"Alice": 10, "Bob": 20}
    >>> itemmap(reversed, accountids)  # doctest: +SKIP
    {10: "Alice", 20: "Bob"}

    See Also:
        keymap
        valmap
    """
    return dict(map(func, iteritems(d)))
def itemmap(func, d, factory=dict):
    """ Apply function to items of dictionary

    >>> accountids = {"Alice": 10, "Bob": 20}
    >>> itemmap(reversed, accountids)  # doctest: +SKIP
    {10: "Alice", 20: "Bob"}

    See Also:
        keymap
        valmap
    """
    rv = factory()
    rv.update(map(func, iteritems(d)))
    return rv
def valmap(func, d, factory=dict):
    """ Apply function to values of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> valmap(sum, bills)  # doctest: +SKIP
    {'Alice': 65, 'Bob': 45}

    See Also:
        keymap
        itemmap
    """
    rv = factory()
    rv.update(zip(iterkeys(d), map(func, itervalues(d))))
    return rv
def keymap(func, d, factory=dict):
    """ Apply function to keys of dictionary

    >>> bills = {"Alice": [20, 15, 30], "Bob": [10, 35]}
    >>> keymap(str.lower, bills)  # doctest: +SKIP
    {'alice': [20, 15, 30], 'bob': [10, 35]}

    See Also:
        valmap
        itemmap
    """
    rv = factory()
    rv.update(zip(map(func, iterkeys(d)), itervalues(d)))
    return rv
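# Usage sketch for the ``factory`` variants above: ``factory`` selects the
# mapping type of the result, e.g. an OrderedDict that preserves key order.
from collections import OrderedDict
bills = OrderedDict([("Alice", [20, 15, 30]), ("Bob", [10, 35])])
assert keymap(str.lower, bills, factory=OrderedDict) == \
    OrderedDict([("alice", [20, 15, 30]), ("bob", [10, 35])])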
def deep_transitive_get(key, d):
    """ Transitive get that propagates within tuples

    >>> from logpy.util import transitive_get, deep_transitive_get
    >>> d = {1: (2, 3), 2: 12, 3: 13}

    >>> transitive_get(1, d)
    (2, 3)

    >>> deep_transitive_get(1, d)
    (12, 13)
    """
    key = transitive_get(key, d)
    if isinstance(key, tuple):
        return tuple(map(lambda k: deep_transitive_get(k, d), key))
    else:
        return key
def _merge_sorted_key(seqs, key):
    # The commented code below shows an alternative (slower) implementation
    # to apply a key function for sorting.
    #
    # mapper = lambda i, item: (key(item), i, item)
    # keyiters = [map(partial(mapper, i), itr) for i, itr in
    #             enumerate(seqs)]
    # return (item for (item_key, i, item) in heapq.merge(*keyiters))

    # binary heap as a priority queue
    pq = []

    # Initial population
    for itnum, it in enumerate(map(iter, seqs)):
        try:
            item = next(it)
            pq.append([key(item), itnum, item, it])
        except StopIteration:
            pass
    heapq.heapify(pq)

    # Repeatedly yield and then repopulate from the same iterator
    heapreplace = heapq.heapreplace
    heappop = heapq.heappop
    while len(pq) > 1:
        try:
            while True:
                # raises IndexError when pq is empty
                _, itnum, item, it = s = pq[0]
                yield item
                item = next(it)  # raises StopIteration when exhausted
                s[0] = key(item)
                s[2] = item
                heapreplace(pq, s)  # restore heap condition
        except StopIteration:
            heappop(pq)  # remove empty iterator
    if pq:
        # Much faster when only a single iterable remains
        _, itnum, item, it = pq[0]
        yield item
        for item in it:
            yield item
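# Usage sketch for ``_merge_sorted_key`` above: it is the key-aware path
# behind ``merge_sorted(*seqs, key=...)`` and can be called directly with a
# list of sequences plus a key function.
assert list(_merge_sorted_key([[2, 3], [1, 3]], lambda x: x // 3)) == \
    [2, 1, 3, 3]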
def interleave(seqs):
    """ Interleave a sequence of sequences

    >>> list(interleave([[1, 2], [3, 4]]))
    [1, 3, 2, 4]

    >>> ''.join(interleave(('ABC', 'XY')))
    'AXBYC'

    Both the individual sequences and the sequence of sequences may be
    infinite

    Returns a lazy iterator
    """
    iters = itertools.cycle(map(iter, seqs))
    while True:
        try:
            for itr in iters:
                yield next(itr)
            return
        except StopIteration:
            predicate = partial(operator.is_not, itr)
            iters = itertools.cycle(itertools.takewhile(predicate, iters))
def diff(*seqs, **kwargs):
    """ Return those items that differ between sequences

    >>> list(diff([1, 2, 3], [1, 2, 10, 100]))
    [(3, 10)]

    Shorter sequences may be padded with a ``default`` value:

    >>> list(diff([1, 2, 3], [1, 2, 10, 100], default=None))
    [(3, 10), (None, 100)]

    A ``key`` function may also be applied to each item to use during
    comparisons:

    >>> list(diff(['apples', 'bananas'], ['Apples', 'Oranges'],
    ...           key=str.lower))
    [('bananas', 'Oranges')]
    """
    N = len(seqs)
    if N == 1 and isinstance(seqs[0], list):
        seqs = seqs[0]
        N = len(seqs)
    if N < 2:
        raise TypeError('Too few sequences given (min 2 required)')
    default = kwargs.get('default', no_default)
    if default == no_default:
        iters = zip(*seqs)
    else:
        iters = zip_longest(*seqs, fillvalue=default)
    key = kwargs.get('key', None)
    if key is None:
        for items in iters:
            if items.count(items[0]) != N:
                yield items
    else:
        for items in iters:
            vals = tuple(map(key, items))
            if vals.count(vals[0]) != N:
                yield items
def interleave(seqs, pass_exceptions=()):
    """ Interleave a sequence of sequences

    >>> list(interleave([[1, 2], [3, 4]]))
    [1, 3, 2, 4]

    >>> ''.join(interleave(('ABC', 'XY')))
    'AXBYC'

    Both the individual sequences and the sequence of sequences may be
    infinite

    Returns a lazy iterator
    """
    iters = map(iter, seqs)
    while iters:
        newiters = []
        for itr in iters:
            try:
                yield next(itr)
                newiters.append(itr)
            except (StopIteration,) + tuple(pass_exceptions):
                pass
        iters = newiters
def test_pyspark_to_sparksql(ctx, people):
    sdf = odo(data, ctx, dshape=discover(df))
    assert isinstance(sdf, (SparkDataFrame, SchemaRDD))
    assert (list(map(set, odo(people, list))) ==
            list(map(set, odo(sdf, list))))
def test_load_from_jsonlines(ctx):
    with tmpfile('.json') as fn:
        js = odo(df, 'jsonlines://%s' % fn)
        result = odo(js, ctx, name='r')
        assert (list(map(set, odo(result, list))) ==
                list(map(set, odo(df, list))))
def test_map_filter_are_lazy():
    def bad(x):
        raise Exception()
    map(bad, [1, 2, 3])
    filter(bad, [1, 2, 3])
def _reify(t, s):
    return map(partial(reify, s=s), t)
def test_reduction_to_scalar(ctx):
    result = odo(ctx.sql('select sum(amount) from t'), float)
    assert isinstance(result, float)
    assert result == sum(map(toolz.second, data))