def test_group_by_reducer_clones():
    """A reducer instance can be reused in several pipes independently."""
    data = [{"value": 2}, {"value": 3}]
    conv = c.aggregate(
        c.item("value").pipe(c.ReduceFuncs.Sum(c.this()).pipe(c.this() + 1))
    )
    assert conv.execute(data) == 6

    # the same DictSum reducer is piped from two different items;
    # each pipe must get its own clone
    reducer = c.ReduceFuncs.DictSum(c.item("k"), c.item("v"))
    reducer1 = c.item("item1").pipe(reducer)
    reducer2 = c.item("item2").pipe(reducer)
    assert c.aggregate(reducer1).execute([{"item1": {"k": 1, "v": 2}}]) == {
        1: 2
    }
    assert c.aggregate(reducer2).execute([{"item2": {"k": 2, "v": 3}}]) == {
        2: 3
    }
def test_multi_statement_reducers(dict_series):
    """Multi-statement reducers work and their misuse raises clear errors."""
    output = (
        c.group_by(c.item("name"))
        .aggregate(
            (
                c.item("name"),
                SumReducer1(c.item("value")),
                SumReducer2(c.item("value")),
                SumReducer3(c.item("value")),
                SumReducer4(c.item("value")),
                SumReducer5(c.item("value"), initial=5),
            )
        )
        .execute(dict_series, debug=False)
    )
    assert output == [("Nick", 3, 3, 3, 3, 8), ("John", 63, 63, 63, 63, 68)]

    with pytest.raises(AttributeError):

        class SumReducer(MultiStatementReducer):
            reduce = ("%(result)s = %(result)s + ({0} or 4)",)
            default = 0
            unconditional_init = True

        # prepare_first is not specified
        c.aggregate(SumReducer(c.item("value"))).gen_converter()

    with pytest.raises(ValueError):

        class SumReducer(MultiStatementReducer):
            reduce = ("%(result)s = %(result)s + ({0} or 4)",)
            unconditional_init = True

        # default is not provided
        SumReducer(c.item("value"))
def test_multiple_aggregations(dict_series):
    """Aggregations chained through pipes reduce step by step to one value."""
    result = (
        c.aggregate(c.ReduceFuncs.Array(c.item("name")))
        .pipe(
            c.aggregate(c.ReduceFuncs.ArrayDistinct(c.this())).pipe(
                c.aggregate(c.ReduceFuncs.Max(c.this()))
            )
        )
        .execute(dict_series, debug=False)
    )
    assert result == "Nick"
def test_base_reducer():
    """Low-level c.reduce forms, nested-reducer errors and DictArray."""
    assert c.aggregate(
        (
            c.reduce(lambda a, b: a + b, c.this, initial=0),
            c.reduce(c.naive(lambda a, b: a + b), c.this, initial=int),
            c.reduce(
                c.inline_expr("{0} + {1}"),
                c.this,
                initial=c.inline_expr("int()"),
                default=0,
            ),
            c.reduce(
                c.inline_expr("{0} + {1}"),
                c.this,
                initial=c(int),
                default=0,
            ),
            c.reduce(
                c.inline_expr("{0} + {1}"),
                c.this,
                initial=int,
                default=0,
            ),
        )
    ).filter(c.this > 5).gen_converter(debug=False)([1, 2, 3]) == [
        6,
        6,
        6,
        6,
        6,
    ]

    # a reducer may not be fed another reducer
    with pytest.raises(ValueError):
        c.aggregate(
            c.ReduceFuncs.Sum(c.reduce(c.ReduceFuncs.Count))
        ).gen_converter()
    with pytest.raises(ValueError):
        c.aggregate(
            c.ReduceFuncs.Sum(c.ReduceFuncs.Count() + 1)
        ).gen_converter()
    with pytest.raises(ValueError):
        c.aggregate(
            (c.ReduceFuncs.Count() + 2).pipe(
                c.ReduceFuncs.Sum(c.this) + 1
            )
        ).gen_converter()

    conv = c.aggregate(
        c.ReduceFuncs.DictArray(c.item(0), c.item(1))
    ).gen_converter(debug=False)
    data = [
        ("a", 1),
        ("a", 2),
        ("b", 3),
    ]
    result = {"a": [1, 2], "b": [3]}
    assert conv(data) == result
    assert conv([]) is None

    conv2 = c.aggregate(
        {"key": c.ReduceFuncs.DictArray(c.item(0), c.item(1))}
    ).gen_converter(debug=False)
    assert conv2([]) == {"key": None}
    assert conv2(data) == {"key": result}
def test_aggregate_no_init_loops():
    """Reducers with `where` conditions still see every input row."""
    converter = c.aggregate(
        {
            "first_a": c.ReduceFuncs.First(
                c.item("a"), where=c.item("b") > 0
            ),
            "list_b": c.ReduceFuncs.Array(
                c.item("b"), where=c.item("a") > 0
            ),
        }
    ).gen_converter(debug=False)
    rows = [
        {"a": 1, "b": 0},
        {"a": 2, "b": 1},
        {"a": 3, "b": 2},
        {"a": 4, "b": 3},
    ]
    assert converter(rows) == {
        "first_a": 2,
        "list_b": [0, 1, 2, 3],
    }
def test_aggregate_func():
    """Modern ReduceFuncs call-style aggregation over a dict stream."""
    input_data = [
        {"a": 5, "b": "foo"},
        {"a": 10, "b": "bar"},
        {"a": 10, "b": "bar"},
    ]
    conv = c.aggregate(
        {
            "a": c.ReduceFuncs.Array(c.item("a")),
            # reducers support arithmetic composition
            "ab_sum": c.ReduceFuncs.Sum(c.item("a")) + c.ReduceFuncs.Count(),
            "b": c.ReduceFuncs.ArrayDistinct(c.item("b")),
            "b_max_a": c.ReduceFuncs.MaxRow(c.item("a")).item(
                "b", default=None
            ),
        }
    ).gen_converter(debug=False)
    assert conv(input_data) == {
        "a": [5, 10, 10],
        "ab_sum": 28,
        "b": ["foo", "bar"],
        "b_max_a": "bar",
    }
def test_weighted_average(series):
    """Average with weights, incl. `where` filtering inside group_by."""
    assert eq(
        c.aggregate(c.ReduceFuncs.Average(c.item(0), c.item(1))).execute(
            series
        ),
        weighted_average(series),
    )

    result = (
        c.group_by(c.item(0) // 5)
        .aggregate(
            [
                c.item(0) // 5,
                c.ReduceFuncs.Average(c.item(1)),
                c.ReduceFuncs.Average(
                    c.item(1), where=c.item(0) > 10, default=-1
                ),
            ]
        )
        .execute(zip(range(10), range(10)), debug=False)
    )
    assert result == [
        [0, 2, -1],
        [1, 7, -1],
    ]

    result = (
        c.group_by(c.item(0) // 5)
        .aggregate(
            [
                c.item(0) // 5,
                c.ReduceFuncs.Average(c.item(1), c.item(2)),
                c.ReduceFuncs.Average(
                    c.item(1), c.item(2), where=c.item(0) > 10, default=-1
                ),
            ]
        )
        .execute(zip(range(10), range(10), cycle([1, 2])), debug=False)
    )
    assert result == [
        [0, 2, -1],
        [1, 7, -1],
    ]
def test_simple_label():
    """Labels: add_label, label_input/label_output and label lookups."""
    conv1 = (
        c.tuple(c.item(2).add_label("a"), c.this())
        .pipe(c.item(1).pipe(c.list_comp((c.this(), c.label("a")))))
        .gen_converter(debug=False)
    )
    assert conv1([1, 2, 3, 4]) == [(1, 3), (2, 3), (3, 3), (4, 3)]

    conv2 = (
        c.tuple(c.item(1).add_label("a"), c.this())
        .pipe(
            c.item(1),
            label_input={"aa": c.item(0), "bb": c.item(0)},
            label_output="collection1",
        )
        .pipe(
            c.label("collection1").pipe(
                c.aggregate(
                    c.ReduceFuncs.Sum(
                        c.this()
                        + c.label("a")
                        + c.label("aa")
                        + c.input_arg("x")
                        + c.label("collection1").item(0),
                    )
                )
            ),
            label_output="b",
        )
        .pipe(c.this() + c.label("b"))
        .gen_converter(debug=False)
    )
    assert conv2([1, 2, 3, 4], x=10) == 140

    conv3 = (
        c.tuple(c.item("default").add_label("default"), c.this())
        .pipe(c.item(1).pipe(c.item("abc", default=c.label("default"))))
        .gen_converter(debug=False)
    )
    assert conv3({"default": 1}) == 1

    with pytest.raises(c.ConversionException):
        c.this().pipe(c.this(), label_input=1)
def test_group_by_with_double_ended_pipes():
    """Pipes feeding into and out of a reducer behave correctly."""
    input_data = [{"value": 1}, {"value": 2}]
    # fmt: off
    conv = c.aggregate(
        c.item("value").pipe(c.ReduceFuncs.Sum(c.this())).pipe(
            c.this() * 2)).gen_converter()
    # fmt: on
    assert conv(input_data) == 6

    input_data = [
        {"k": "A", "v": 1},
        {"k": "A", "v": 2},
    ]
    # the same reducer used twice: once behind an input-arg pipe,
    # once directly — results must not interfere
    reducer = c.ReduceFuncs.Sum(c.item("v"))
    conv = (
        c.group_by(c.item("k"))
        .aggregate(
            {
                "v1": c.input_arg("test").pipe(reducer),
                "v2": reducer,
            }
        )
        .gen_converter()
    )
    assert conv(input_data, test={"v": 7}) == [{"v1": 14, "v2": 3}]
def test_weighted_average(series):
    """Weighted average reducer matches the reference implementation."""
    # NOTE(review): duplicates the name of an earlier test in this module;
    # only the last definition actually runs under pytest — consider renaming.
    assert eq(
        c.aggregate(c.ReduceFuncs.Average(c.item(0), c.item(1))).execute(
            series
        ),
        weighted_average(series),
    )
def test_nested_group_by():
    """Aggregations nested inside group_by reducers via pipes."""
    data = [
        [0, [1, 2, 3]],
        [0, [4, 5, 6]],
        [1, [2, 3, 4]],
    ]
    assert c.group_by(c.item(0)).aggregate(
        (
            c.item(0),
            c.ReduceFuncs.Sum(
                c.item(1).pipe(c.aggregate(c.ReduceFuncs.Sum(c.this())))
            ),
        )
    ).execute(data, debug=False) == [
        (0, 21),
        (1, 9),
    ]

    agg_conv = c.aggregate(c.ReduceFuncs.Sum(c.this()))
    assert c.group_by(c.item(0)).aggregate(
        (
            c.item(0),
            c.if_(c.item(1), c.item(1), c.item(1)).pipe(
                c.if_(c.this(), c.this(), c.this()).pipe(
                    c.ReduceFuncs.Sum(
                        c.if_(c.this(), c.this(), c.this())
                        .pipe((agg_conv, agg_conv))
                        .pipe(c.item(1))
                    ).pipe(c.if_(c.this(), c.this(), c.this())),
                )
            ),
        )
    ).execute(data, debug=True) == [
        (0, 21),
        (1, 9),
    ]
def test_pipe_conversion():
    """PipeConversion basics plus piping through aggregates and filters."""
    from convtools import conversion as c
    from convtools.base import PipeConversion

    assert PipeConversion(c.naive([1, 2, 3]), c.item(1)).execute(None) == 2
    assert (
        PipeConversion(c.item("key1"), c.item("key2")).execute(
            {"key1": {"key2": 3}}, debug=False
        )
        == 3
    )
    assert (
        c.this.pipe(c.list_comp(c.this + 1))
        .filter(c.this > 3)
        .execute([1, 2, 3, 4, 5, 6], debug=False)
    ) == [4, 5, 6, 7]

    # only checks that the converter can be generated
    c.aggregate(
        c.ReduceFuncs.Array(c.item("key"), default=list).pipe(
            c.if_(
                c.call_func(any, c.generator_comp(c.this.is_(None))),
                c.call_func(list),
                c.this,
            )
        )
    ).gen_converter(debug=False)
def test_legacy_dict_reduce_approach(dict_series):
    """Legacy c.reduce(DictSum, (k, v)) form works; bad arity raises."""
    output = c.aggregate(
        c.reduce(
            c.ReduceFuncs.DictSum,
            (c.item("name"), c.item("value")),
        )
    ).execute(dict_series)
    assert output == {
        "Nick": 3,
        "John": 63,
    }
    with pytest.raises(ValueError):
        c.ReduceFuncs.DictSum(c.this(), c.this(), c.this())
    with pytest.raises(ValueError):
        c.ReduceFuncs.DictSum({c.this(), c.this()})
def test_is_independent():
    """is_independent: True iff a conversion ignores its input data."""
    # independent of the input
    assert c(0).is_independent()
    assert c(int).is_independent()
    assert c(int).call().is_independent()
    assert c.label("a").is_independent()
    assert c.inline_expr("{}()").pass_args(int).is_independent()
    assert c.escaped_string("int()").is_independent()
    assert c({"a": c.input_arg("key")}).is_independent()

    # dependent on the input
    assert not c.iter({"a": 1}).is_independent()
    assert not c.this.is_independent()
    assert not c({"a": 1}).item("a").is_independent()
    assert not c({"a": 1}).item(c.item("a")).is_independent()
    assert not c.inline_expr("{}()").pass_args(c.this).is_independent()
    assert not c.aggregate({"a": 1}).is_independent()
    assert not c.this.add_label("a").is_independent()
    assert not c(int).call(c.item(0)).is_independent()
def test_join_with_complex_pipe():
    """A join whose both sides come from the same aggregated pipe."""

    def f(l):
        return l + [1, 3]

    pipeline = (
        c.aggregate(c.ReduceFuncs.Array(c.item("a")))
        .pipe(
            c.join(c.this(), c.call_func(f, c.this()), c.LEFT == c.RIGHT)
        )
        .iter(c.item(1))
        .as_type(list)
    )
    rows = [{"a": 1}, {"a": 2}, {"a": 3}]
    assert pipeline.execute(rows) == [1, 1, 2, 3, 3]
def test_reducer_inlining(dict_series):
    """A reducer default callable must be invoked exactly once."""

    def f():
        f.number_of_calls += 1
        if f.number_of_calls > f.max_number_of_calls:
            raise Exception
        return []

    f.max_number_of_calls = 1
    f.number_of_calls = 0

    converter = c.aggregate(
        c.ReduceFuncs.Array(
            c.item("name"), default=f, where=c.item("value") < 0
        ).pipe(
            c.if_(
                if_true=c.this(),
                if_false=c.this(),
            )
        )
    ).gen_converter(debug=False)
    assert converter(dict_series) == []
def test_aggregate():
    """Basic dict aggregation with legacy c.reduce style."""
    input_data = [
        {"a": 5, "b": "foo"},
        {"a": 10, "b": "bar"},
        {"a": 10, "b": "bar"},
    ]
    conv = c.aggregate(
        {
            "a": c.reduce(c.ReduceFuncs.Array, c.item("a")),
            "a_sum": c.reduce(c.ReduceFuncs.Sum, c.item("a")),
            "b": c.reduce(c.ReduceFuncs.ArrayDistinct, c.item("b")),
        }
    ).gen_converter(debug=True)
    assert conv(input_data) == {
        "a": [5, 10, 10],
        "a_sum": 25,
        "b": ["foo", "bar"],
    }
def test_top_k_invalid_input(k):
    """TopK with a non-integer ``k`` must raise TypeError.

    Fix: dropped the stray trailing comma which made the statement a
    pointless one-element tuple expression.
    """
    with pytest.raises(TypeError):
        c.aggregate(c.ReduceFuncs.TopK(k, c.this())).execute([1, 2])
def test_grouping():
    """group_by with custom reducers, input args, sorting and tuple keys."""
    data = [
        {"name": "John", "category": "Games", "debit": 10, "balance": 90},
        {"name": "John", "category": "Games", "debit": 200, "balance": -110},
        {"name": "John", "category": "Food", "debit": 30, "balance": -140},
        {"name": "John", "category": "Games", "debit": 300, "balance": 0},
        {"name": "Nick", "category": "Food", "debit": 7, "balance": 50},
        {"name": "Nick", "category": "Games", "debit": 18, "balance": 32},
        {"name": "Bill", "category": "Games", "debit": 18, "balance": 120},
    ]
    result = (
        c.group_by(c.item("name"))
        .aggregate(
            (
                c.item("name"),
                c.item("name").call_method("lower"),
                c.call_func(str.lower, c.item("name")),
                c.reduce(
                    lambda a, b: a + b,
                    c.item("debit"),
                    initial=c.input_arg("arg1"),
                    unconditional_init=True,
                ),
                c.reduce(
                    c.inline_expr("{0} + {1}"),
                    c.item("debit"),
                    initial=lambda: 100,
                    unconditional_init=True,
                ),
                c.reduce(
                    max,
                    c.item("debit"),
                    prepare_first=lambda a: a,
                    default=c.input_arg("arg1"),
                    where=c.call_func(lambda x: x < 0, c.item("balance")),
                ),
                c.call_func(
                    lambda max_debit, n: max_debit * n,
                    c.reduce(
                        max,
                        c.item("debit"),
                        prepare_first=lambda a: a,
                        default=0,
                        where=c.call_func(lambda x: x < 0, c.item("balance")),
                    ),
                    1000,
                ),
                c.call_func(
                    lambda max_debit, n: max_debit * n,
                    c.reduce(
                        c.ReduceFuncs.Max,
                        c.item("debit"),
                        default=1000,
                        where=c.inline_expr("{0} > {1}").pass_args(
                            c.item("balance"),
                            c.input_arg("arg2"),
                        ),
                    ),
                    -1,
                ),
                c.reduce(c.ReduceFuncs.MaxRow, c.item("debit")).item(
                    "balance"
                ),
                c.reduce(c.ReduceFuncs.MinRow, c.item("debit")).item(
                    "balance"
                ),
            )
        )
        .sort(key=lambda t: t[0].lower(), reverse=True)
        .execute(data, arg1=100, arg2=0, debug=False)
    )
    # fmt: off
    assert result == [
        ('Nick', 'nick', 'nick', 125, 125, 100, 0, -18, 32, 50),
        ('John', 'john', 'john', 640, 640, 200, 200000, -10, 0, 90),
        ('Bill', 'bill', 'bill', 118, 118, 100, 0, -18, 120, 120),
    ]
    # fmt: on

    with pytest.raises(c.ConversionException):
        # there's a single group by field, while we use separate items
        # of this tuple in aggregate
        result = (
            c.group_by(c.item("name"))
            .aggregate(
                (
                    c.item("category"),
                    c.reduce(c.ReduceFuncs.Sum, c.item("debit")),
                )
            )
            .execute(data, debug=False)
        )

    aggregation = {
        c.call_func(
            tuple,
            c.ReduceFuncs.Array(c.item("name"), default=None),
        ): c.item("category").call_method("lower"),
        "count": c.ReduceFuncs.Count(),
        "max": c.ReduceFuncs.Max(c.item("debit")),
        "min": c.ReduceFuncs.Min(c.item("debit")),
        "count_distinct": c.ReduceFuncs.CountDistinct(c.item("name")),
        "array_agg_distinct": c.ReduceFuncs.ArrayDistinct(c.item("name")),
        "dict": c.ReduceFuncs.Dict(c.item("debit"), c.item("name")),
    }
    result = (
        c.group_by(c.item("category"))
        .aggregate(aggregation)
        .execute(data, debug=False)
    )
    result2 = (
        c.group_by(c.item("category"))
        .aggregate(c.dict(*aggregation.items()))
        .execute(data, debug=False)
    )
    # fmt: off
    assert result == result2 == [
        {
            'array_agg_distinct': ['John', 'Nick', 'Bill'],
            'count': 5,
            'count_distinct': 3,
            'dict': {10: 'John', 18: 'Bill', 200: 'John', 300: 'John'},
            'max': 300,
            'min': 10,
            ('John', 'John', 'John', 'Nick', 'Bill'): 'games'
        },
        {
            'array_agg_distinct': ['John', 'Nick'],
            'count': 2,
            'count_distinct': 2,
            'dict': {7: 'Nick', 30: 'John'},
            'max': 30,
            'min': 7,
            ('John', 'Nick'): 'food'
        }
    ]
    # fmt: on

    result3 = (
        c.aggregate(c.ReduceFuncs.Sum(c.item("debit")))
        .pipe(c.inline_expr("{0} + {1}").pass_args(c.this(), c.this()))
        .execute(data, debug=False)
    )
    assert result3 == 583 * 2

    by = c.item("name"), c.item("category")
    result4 = (
        c.group_by(*by)
        .aggregate(by + (c.ReduceFuncs.Sum(c.item("debit")),))
        .execute(data, debug=False)
    )
    # fmt: off
    assert result4 == [('John', 'Games', 510),
                       ('John', 'Food', 30),
                       ('Nick', 'Food', 7),
                       ('Nick', 'Games', 18),
                       ('Bill', 'Games', 18)]
    # fmt: on

    result5 = (
        c.group_by()
        .aggregate(c.ReduceFuncs.Sum(c.item("debit")))
        .execute(data, debug=False)
    )
    assert result5 == 583

    with pytest.raises(c.ConversionException):
        # there's a single group by field, while we use separate items
        # of this tuple in aggregate
        (
            c.group_by(by)
            .aggregate(
                by + (c.reduce(c.ReduceFuncs.Sum, c.item("debit")),)
            )
            .execute(data, debug=False)
        )
def test_average(series):
    """Average reducer matches statistics.mean."""
    assert eq(
        c.aggregate(c.ReduceFuncs.Average(c.item(1))).execute(series),
        statistics.mean(x[1] for x in series),
    )
def test_doc__index_word_count():
    """Docs example: count words across all files and take top N."""
    # Let's say we need to count words across all files
    input_data = [
        "war-and-peace-1.txt",
        "war-and-peace-2.txt",
        "war-and-peace-3.txt",
        "war-and-peace-4.txt",
    ]

    # # iterate an input and read file lines
    # # def read_file(filename):
    # #     with open(filename) as f:
    # #         for line in f:
    # #             yield line
    # # extract_strings = c.generator_comp(
    # #     c.call_func(read_file, c.this()))

    # to simplify testing
    extract_strings = c.generator_comp(
        c.call_func(lambda filename: [filename], c.this())
    )

    # 1. make ``re`` pattern available to the code to be generated
    # 2. call ``finditer`` method of the pattern and pass the string
    #    as an argument
    # 3. pass the result to the next conversion
    # 4. iterate results, call ``.group()`` method of each re.Match
    #    and call ``.lower()`` on each result
    split_words = (
        c.naive(re.compile(r"\w+"))
        .call_method("finditer", c.this())
        .pipe(
            c.generator_comp(
                c.this().call_method("group", 0).call_method("lower")
            )
        )
    )

    # ``extract_strings`` is the generator of strings
    # so we iterate it and pass each item to ``split_words`` conversion
    vectorized_split_words = c.generator_comp(c.this().pipe(split_words))

    # flattening the result of ``vectorized_split_words``, which is
    # a generator of generators of strings
    flatten = c.call_func(
        chain.from_iterable,
        c.this(),
    )

    # aggregate the input, the result is a single dict
    # words are keys, values are count of words
    dict_word_to_count = c.aggregate(
        c.ReduceFuncs.DictCount(c.this(), c.this(), default=dict)
    )

    # take top N words by:
    #  - call ``.items()`` method of the dict (the result of the aggregate)
    #  - pass the result to ``sorted``
    #  - take the slice, using input argument named ``top_n``
    #  - cast to a dict
    take_top_n = (
        c.this()
        .call_method("items")
        .sort(key=lambda t: t[1], reverse=True)
        .pipe(c.this()[: c.input_arg("top_n")])
        .as_type(dict)
    )

    # the resulting pipeline is pretty self-descriptive, except the
    # ``c.if_`` part, which checks the condition (first argument),
    # and returns the 2nd if True OR the 3rd (input data by default)
    # otherwise
    pipeline = (
        extract_strings.pipe(flatten)
        .pipe(vectorized_split_words)
        .pipe(flatten)
        .pipe(dict_word_to_count)
        .pipe(
            c.if_(
                c.input_arg("top_n").is_not(None),
                c.this().pipe(take_top_n),
            )
        )
        # Define the resulting converter function signature.  In fact this
        # isn't necessary if you don't need to specify default values
    ).gen_converter(debug=True, signature="data_, top_n=None")

    assert pipeline(input_data, top_n=3) == {"war": 4, "and": 4, "peace": 4}
def test_grouping():
    """Legacy-style grouping: c.reduce(...) with .filter(...) conditions."""
    # NOTE(review): duplicates the name of the other test_grouping in this
    # module; only the last definition actually runs — consider renaming.
    data = [
        {"name": "John", "category": "Games", "debit": 10, "balance": 90},
        {"name": "John", "category": "Games", "debit": 200, "balance": -110},
        {"name": "John", "category": "Food", "debit": 30, "balance": -140},
        {"name": "John", "category": "Games", "debit": 300, "balance": 0},
        {"name": "Nick", "category": "Food", "debit": 7, "balance": 50},
        {"name": "Nick", "category": "Games", "debit": 18, "balance": 32},
        {"name": "Bill", "category": "Games", "debit": 18, "balance": 120},
    ]
    result = (
        c.group_by(c.item("name"))
        .aggregate(
            (
                c.item("name"),
                c.item("name").call_method("lower"),
                c.call_func(str.lower, c.item("name")),
                c.reduce(
                    lambda a, b: a + b,
                    c.item("debit"),
                    initial=c.input_arg("arg1"),
                ),
                c.reduce(
                    c.inline_expr("{0} + {1}"),
                    c.item("debit"),
                    initial=lambda: 100,
                ),
                c.reduce(
                    max, c.item("debit"), default=c.input_arg("arg1")
                ).filter(c.call_func(lambda x: x < 0, c.item("balance"))),
                c.call_func(
                    lambda max_debit, n: max_debit * n,
                    c.reduce(max, c.item("debit"), default=0).filter(
                        c.call_func(lambda x: x < 0, c.item("balance"))
                    ),
                    1000,
                ),
                c.call_func(
                    lambda max_debit, n: max_debit * n,
                    c.reduce(
                        c.ReduceFuncs.Max,
                        c.item("debit"),
                        default=1000,
                    ).filter(
                        c.inline_expr("{0} > 0").pass_args(
                            c.item("balance")
                        )
                    ),
                    -1,
                ),
                c.reduce(
                    c.ReduceFuncs.MaxRow,
                    c.item("debit"),
                ).item("balance"),
                c.reduce(
                    c.ReduceFuncs.MinRow,
                    c.item("debit"),
                ).item("balance"),
            )
        )
        .sort(key=lambda t: t[0].lower(), reverse=True)
        .execute(data, arg1=100, debug=False)
    )
    # fmt: off
    assert result == [
        ('Nick', 'nick', 'nick', 125, 125, 100, 0, -18, 32, 50),
        ('John', 'john', 'john', 640, 640, 200, 200000, -10, 0, 90),
        ('Bill', 'bill', 'bill', 118, 118, 100, 0, -18, 120, 120)
    ]
    # fmt: on

    aggregation = {
        c.call_func(
            tuple,
            c.reduce(c.ReduceFuncs.Array, c.item("name"), default=None),
        ): c.item("category").call_method("lower"),
        "count": c.reduce(c.ReduceFuncs.Count),
        "count_distinct": c.reduce(
            c.ReduceFuncs.CountDistinct, c.item("name")
        ),
        "array_agg_distinct": c.reduce(
            c.ReduceFuncs.ArrayDistinct,
            c.item("name"),
        ),
        "dict": c.reduce(
            c.ReduceFuncs.Dict, (c.item("debit"), c.item("name"))
        ),
    }
    result = (
        c.group_by(c.item("category"))
        .aggregate(aggregation)
        .execute(data, debug=False)
    )
    result2 = (
        c.group_by(c.item("category"))
        .aggregate(c.dict(*aggregation.items()))
        .execute(data, debug=False)
    )
    # fmt: off
    assert result == result2 == [
        {
            'array_agg_distinct': ['John', 'Nick', 'Bill'],
            'count': 5,
            'count_distinct': 3,
            'dict': {10: 'John', 18: 'Bill', 200: 'John', 300: 'John'},
            ('John', 'John', 'John', 'Nick', 'Bill'): 'games'
        },
        {
            'array_agg_distinct': ['John', 'Nick'],
            'count': 2,
            'count_distinct': 2,
            'dict': {7: 'Nick', 30: 'John'},
            ('John', 'Nick'): 'food'
        }
    ]
    # fmt: on

    result3 = (
        c.aggregate(c.reduce(c.ReduceFuncs.Sum, c.item("debit")))
        .pipe(c.inline_expr("{0} + {1}").pass_args(c.this(), c.this()))
        .execute(data, debug=False)
    )
    assert result3 == 583 * 2

    by = c.item("name"), c.item("category")
    result4 = (
        c.group_by(*by)
        .aggregate(by + (c.reduce(c.ReduceFuncs.Sum, c.item("debit")),))
        .execute(data, debug=False)
    )
    # fmt: off
    assert result4 == [('John', 'Games', 510),
                       ('John', 'Food', 30),
                       ('Nick', 'Food', 7),
                       ('Nick', 'Games', 18),
                       ('Bill', 'Games', 18)]
    # fmt: on

    result5 = (
        c.group_by()
        .aggregate(c.reduce(c.ReduceFuncs.Sum, c.item("debit")))
        .execute(data, debug=False)
    )
    assert result5 == 583
def test_base_reducer():
    """Internal _ReducerExpression/_ReducerStatements building blocks."""
    # NOTE(review): duplicates the name of the earlier test_base_reducer;
    # only the last definition actually runs — consider renaming.
    from convtools.aggregations import _ReducerExpression, _ReducerStatements

    assert c.aggregate(
        (
            c.reduce(
                _ReducerExpression(
                    lambda a, b: a + b, expr=c.this(), initial=0
                )
            ),
            c.reduce(
                _ReducerExpression(
                    c.naive(lambda a, b: a + b), expr=c.this(), initial=int
                )
            ),
            c.reduce(
                _ReducerExpression("{0} + {1}", expr=c.this(), default=0)
            ),
            c.reduce(
                _ReducerExpression(
                    "{0} + {1}",
                    expr=c.this(),
                    initial_from_first=int,
                    default=0,
                )
            ),
            c.reduce(
                _ReducerStatements(
                    reduce="%(result)s += ({1} or 0)",
                    initial_from_first="%(result)s = ({0} or 0)",
                    default=0,
                ),
                c.this(),
            ),
            c.reduce(
                _ReducerStatements(
                    reduce="%(result)s += ({1} or 0)",
                    default=c.naive(int),
                ),
                c.this(),
            ),
            c.reduce(
                _ReducerStatements(
                    reduce="%(result)s = ({1} or 0)",
                    initial=0,
                ),
                c.this(),
            ),
        )
    ).filter(c.this() > 5, cast=tuple).gen_converter(debug=True)(
        [1, 2, 3]
    ) == (
        6,
        6,
        6,
        6,
        6,
        6,
    )

    with pytest.raises(AssertionError):
        c.aggregate(
            (
                c.reduce(
                    c.ReduceFuncs.Sum,
                    c.reduce(c.ReduceFuncs.Count),
                ),
            )
        ).gen_converter()

    conv = c.aggregate(
        c.reduce(c.ReduceFuncs.DictArray, (c.item(0), c.item(1)))
    ).gen_converter(debug=True)
    data = [
        ("a", 1),
        ("a", 2),
        ("b", 3),
    ]
    result = {"a": [1, 2], "b": [3]}
    assert conv(data) == result
    assert conv([]) is None

    conv2 = c.aggregate(
        {"key": c.reduce(c.ReduceFuncs.DictArray, (c.item(0), c.item(1)))}
    ).gen_converter(debug=True)
    assert conv2([]) == {"key": None}
    assert conv2(data) == {"key": result}
def test_average_of_empty_collection():
    """Average over no rows yields None (the implicit default)."""
    assert c.aggregate(c.ReduceFuncs.Average(c.item(1))).execute([]) is None
def test_doc__index_intro():
    """Docs example: group_by, aggregate and join in one walkthrough."""
    # ======== #
    # GROUP BY #
    # ======== #
    input_data = [
        {"a": 5, "b": "foo"},
        {"a": 10, "b": "foo"},
        {"a": 10, "b": "bar"},
        {"a": 10, "b": "bar"},
        {"a": 20, "b": "bar"},
    ]
    conv = (
        c.group_by(c.item("b"))
        .aggregate(
            {
                "b": c.item("b"),
                "a_first": c.ReduceFuncs.First(c.item("a")),
                "a_max": c.ReduceFuncs.Max(c.item("a")),
            }
        )
        .gen_converter(debug=True)
    )
    assert conv(input_data) == [
        {"b": "foo", "a_first": 5, "a_max": 10},
        {"b": "bar", "a_first": 10, "a_max": 20},
    ]

    # ========= #
    # AGGREGATE #
    # ========= #
    conv = c.aggregate(
        {
            # list of "a" values where "b" equals to "bar"
            "a": c.ReduceFuncs.Array(
                c.item("a"), where=c.item("b") == "bar"
            ),
            # "b" value of a row where "a" has Max value
            "b": c.ReduceFuncs.MaxRow(
                c.item("a"),
            ).item("b", default=None),
        }
    ).gen_converter(debug=True)
    assert conv(input_data) == {"a": [10, 10, 20], "b": "bar"}

    # ==== #
    # JOIN #
    # ==== #
    collection_1 = [
        {"id": 1, "name": "Nick"},
        {"id": 2, "name": "Joash"},
        {"id": 3, "name": "Bob"},
    ]
    collection_2 = [
        {"ID": "3", "age": 17, "country": "GB"},
        {"ID": "2", "age": 21, "country": "US"},
        {"ID": "1", "age": 18, "country": "CA"},
    ]
    input_data = (collection_1, collection_2)
    conv = (
        c.join(
            c.item(0),
            c.item(1),
            c.and_(
                c.LEFT.item("id") == c.RIGHT.item("ID").as_type(int),
                c.RIGHT.item("age") >= 18,
            ),
            how="left",
        )
        .pipe(
            c.list_comp(
                {
                    "id": c.item(0, "id"),
                    "name": c.item(0, "name"),
                    "age": c.item(1, "age", default=None),
                    "country": c.item(1, "country", default=None),
                }
            )
        )
        .gen_converter(debug=True)
    )
    assert conv(input_data) == [
        {"id": 1, "name": "Nick", "age": 18, "country": "CA"},
        {"id": 2, "name": "Joash", "age": 21, "country": "US"},
        {"id": 3, "name": "Bob", "age": None, "country": None},
    ]
def test_mode(series):
    """Mode reducer matches statistics.mode."""
    assert eq(
        c.aggregate(c.ReduceFuncs.Mode(c.item(0))).execute(series),
        statistics.mode(x[0] for x in series),
    )
def test_nested_group_by():
    """Nested aggregations, merged array aggregates and lazy First."""
    # NOTE(review): duplicates the name of the earlier test_nested_group_by;
    # only the last definition actually runs — consider renaming.
    data = [
        [0, [1, 2, 3]],
        [0, [4, 5, 6]],
        [1, [2, 3, 4]],
    ]
    assert c.group_by(c.item(0)).aggregate(
        (
            c.item(0),
            c.ReduceFuncs.Sum(
                c.item(1).pipe(c.aggregate(c.ReduceFuncs.Sum(c.this())))
            ),
        )
    ).execute(data, debug=False) == [
        (0, 21),
        (1, 9),
    ]

    agg_conv = c.aggregate(c.ReduceFuncs.Sum(c.this()))
    assert c.group_by(c.item(0)).aggregate(
        (
            c.item(0),
            c.if_(c.item(1), c.item(1), c.item(1)).pipe(
                c.if_(c.this(), c.this(), c.this()).pipe(
                    c.ReduceFuncs.Sum(
                        c.if_(c.this(), c.this(), c.this())
                        .pipe((agg_conv, agg_conv))
                        .pipe(c.item(1))
                    ).pipe(c.if_(c.this(), c.this(), c.this())),
                )
            ),
        )
    ).execute(data, debug=False) == [
        (0, 21),
        (1, 9),
    ]

    summer = c.aggregate(c.ReduceFuncs.Sum(c.this()))
    merger = c.aggregate(
        {
            "value1": c.ReduceFuncs.First(
                c.item("value1"), where=c("value1").in_(c.this())
            ),
            "value2": c.ReduceFuncs.First(
                c.item("value2"), where=c("value2").in_(c.this())
            ).pipe(c.if_(c.this(), c.this().pipe(summer))),
        }
    )
    converter = (
        c.group_by(c.item("id_"))
        .aggregate(
            {
                "id_": c.item("id_"),
                "data": c.ReduceFuncs.Array(c.this()).pipe(merger),
            }
        )
        .gen_converter(debug=False)
    )
    assert converter(
        [
            {"id_": 1, "value1": 2},
            {"id_": 2, "value1": 3},
            {"id_": 2, "value2": [1, 2, 3]},
        ]
    ) == [
        {"id_": 1, "data": {"value1": 2, "value2": None}},
        {"id_": 2, "data": {"value1": 3, "value2": 6}},
    ]

    def g():
        yield 1
        raise Exception

    # First must short-circuit: the generator raises on its 2nd item,
    # which the aggregation never consumes
    assert (
        c.aggregate(c.ReduceFuncs.First(c.this())).execute(g(), debug=False)
    ) == 1
def test_median(series):
    """Median reducer matches statistics.median."""
    assert eq(
        c.aggregate(c.ReduceFuncs.Median(c.item(1))).execute(series),
        statistics.median(x[1] for x in series),
    )
def test_top_k(series, k):
    """TopK reducer matches Counter.most_common."""
    assert eq(
        c.aggregate(c.ReduceFuncs.TopK(k, c.item(1))).execute(series),
        [x[1] for x in Counter(x[1] for x in series).most_common(k)],
    )
def test_top_k_non_positive_int(k):
    """TopK with a non-positive integer ``k`` must raise ValueError.

    Fix: dropped the stray trailing comma which made the statement a
    pointless one-element tuple expression.
    """
    with pytest.raises(ValueError):
        c.aggregate(c.ReduceFuncs.TopK(k, c.this())).execute([1, 2])