def test_weighted_average(series):
    """Average reducer: weighted form, plus ``where``/``default`` options.

    ``series`` is a fixture of (key, value) rows; ``weighted_average`` and
    ``eq`` are module-level test helpers.
    """
    # Weighted average over the whole input vs the hand-computed baseline.
    assert eq(
        c.aggregate(c.ReduceFuncs.Average(c.item(0), c.item(1))).execute(series),
        weighted_average(series),
    )
    # Plain average per bucket; the where-filtered variant never matches
    # (keys are 0..9, so item(0) > 10 is always false) and falls back to -1.
    result = (
        c.group_by(c.item(0) // 5).aggregate([
            c.item(0) // 5,
            c.ReduceFuncs.Average(c.item(1)),
            c.ReduceFuncs.Average(c.item(1), where=c.item(0) > 10, default=-1),
        ]).execute(zip(range(10), range(10)), debug=False)
    )
    assert result == [
        [0, 2, -1],
        [1, 7, -1],
    ]
    # Same checks for the weighted (value, weight) form of Average.
    result = (
        c.group_by(c.item(0) // 5).aggregate([
            c.item(0) // 5,
            c.ReduceFuncs.Average(c.item(1), c.item(2)),
            c.ReduceFuncs.Average(
                c.item(1), c.item(2), where=c.item(0) > 10, default=-1
            ),
        ]).execute(zip(range(10), range(10), cycle([1, 2])), debug=False)
    )
    assert result == [
        [0, 2, -1],
        [1, 7, -1],
    ]
def test_group_by_with_double_ended_pipes(): input_data = [ { "value": 1 }, { "value": 2 }, ] # fmt: off conv = c.aggregate( c.item("value").pipe(c.ReduceFuncs.Sum(c.this())).pipe( c.this() * 2)).gen_converter() # fmt: on result = conv(input_data) assert result == 6 input_data = [ { "k": "A", "v": 1 }, { "k": "A", "v": 2 }, ] reducer = c.ReduceFuncs.Sum(c.item("v")) conv = (c.group_by(c.item("k")).aggregate({ "v1": c.input_arg("test").pipe(reducer), "v2": reducer, }).gen_converter()) assert conv(input_data, test={"v": 7}) == [{"v1": 14, "v2": 3}]
def test_multi_statement_reducers(dict_series):
    """Custom MultiStatementReducer subclasses and their validation errors.

    NOTE(review): a function with this exact name is defined again later in
    this module; under normal import rules the later one shadows this one —
    confirm whether both versions are meant to be collected.
    """
    output = (
        c.group_by(c.item("name")).aggregate((
            c.item("name"),
            SumReducer1(c.item("value")),
            SumReducer2(c.item("value")),
            SumReducer3(c.item("value")),
            SumReducer4(c.item("value")),
            SumReducer5(c.item("value"), initial=5),
        )).execute(dict_series, debug=False)
    )
    assert output == [("Nick", 3, 3, 3, 3, 8), ("John", 63, 63, 63, 63, 68)]
    with pytest.raises(AttributeError):

        class SumReducer(MultiStatementReducer):
            reduce = ("%(result)s = %(result)s + ({0} or 4)", )
            default = 0
            unconditional_init = True

        # prepare_first is not specified
        c.aggregate(SumReducer(c.item("value"))).gen_converter()
    with pytest.raises(ValueError):

        class SumReducer(MultiStatementReducer):
            reduce = ("%(result)s = %(result)s + ({0} or 4)", )
            unconditional_init = True

        # default is not provided
        SumReducer(c.item("value"))
def test_multi_statement_reducers(dict_series):
    """Custom MultiStatementReducer subclasses; invalid ones raise ValueError.

    NOTE(review): this redefines a test of the same name earlier in the
    module, shadowing it — confirm whether that is intentional.
    """
    output = (
        c.group_by(c.item("name")).aggregate((
            c.item("name"),
            SumReducer1(c.item("value")),
            SumReducer2(c.item("value")),
            SumReducer3(c.item("value")),
            SumReducer4(c.item("value")),
            SumReducer5(c.item("value"), initial=5),
        )).execute(dict_series, debug=False)
    )
    assert output == [("Nick", 3, 3, 3, 3, 8), ("John", 63, 63, 63, 63, 68)]
    with pytest.raises(ValueError):

        class SumReducer(MultiStatementReducer):
            reduce = ("%(result)s = {0} + ({1} or 4)", )
            default = 0
            unconditional_init = True

        SumReducer(c.item("value"))
    with pytest.raises(ValueError):

        class SumReducer(MultiStatementReducer):
            reduce = ("%(result)s = {0} + ({1} or 4)", )
            unconditional_init = True

        SumReducer(c.item("value"))
def test_piped_group_by():
    """Pipe the output of one group_by aggregation into a second group_by."""
    input_data = [
        {"a": 5, "b": "foo", "amount": 1},
        {"a": 10, "b": "bar", "amount": 2},
        {"a": 10, "b": "bar", "amount": 3},
    ]
    # First level groups by (a, b) and sums amounts; second level regroups
    # the intermediate rows by "b" alone.
    assert c.group_by(c.item("a"), c.item("b")).aggregate({
        "a": c.item("a"),
        "b": c.item("b"),
        "amount": c.ReduceFuncs.Sum(c.item("amount")),
    }).pipe(
        c.group_by(c.item("b")).aggregate({
            "b": c.item("b"),
            "set_a": c.ReduceFuncs.ArrayDistinct(c.item("a")),
            "min_amount": c.ReduceFuncs.Min(c.item("amount")),
        })
    ).execute(input_data) == [
        {"b": "foo", "set_a": [5], "min_amount": 1},
        {"b": "bar", "set_a": [10], "min_amount": 5},
    ]
def test_nested_group_by():
    """Nested aggregations: inner c.aggregate pipelines feed outer reducers.

    NOTE(review): a function with this exact name is defined again later in
    this module and shadows this one — confirm whether both should run.
    """
    data = [
        [0, [1, 2, 3]],
        [0, [4, 5, 6]],
        [1, [2, 3, 4]],
    ]
    # Sum of inner-list sums, per group key.
    assert c.group_by(c.item(0)).aggregate(
        (
            c.item(0),
            c.ReduceFuncs.Sum(
                c.item(1).pipe(c.aggregate(c.ReduceFuncs.Sum(c.this())))
            ),
        )
    ).execute(data, debug=False) == [
        (0, 21),
        (1, 9),
    ]
    agg_conv = c.aggregate(c.ReduceFuncs.Sum(c.this()))
    # Same result through deliberately convoluted no-op if_/pipe wrappers,
    # exercising pipe handling around reducers and nested aggregates.
    assert c.group_by(c.item(0)).aggregate(
        (
            c.item(0),
            c.if_(c.item(1), c.item(1), c.item(1),).pipe(
                c.if_(c.this(), c.this(), c.this(),).pipe(
                    c.ReduceFuncs.Sum(
                        c.if_(
                            c.this(),
                            c.this(),
                            c.this(),
                        )
                        .pipe((agg_conv, agg_conv))
                        .pipe(c.item(1))
                    ).pipe(
                        c.if_(
                            c.this(),
                            c.this(),
                            c.this(),
                        )
                    ),
                )
            ),
        )
    ).execute(data, debug=True) == [
        (0, 21),
        (1, 9),
    ]
def test_weighted_average_with_group_by(series):
    """Per-group weighted Average must match a hand-computed baseline."""
    actual = (
        c.group_by(c.item(0))
        .aggregate(c.ReduceFuncs.Average(c.item(0), c.item(1)))
        .execute(series)
    )
    # Baseline: weighted average of each key's rows, in first-seen key order.
    expected = []
    for key in ordered_set(row[0] for row in series):
        group_rows = [row for row in series if row[0] == key]
        expected.append(weighted_average(group_rows))
    assert eq(actual, expected)
def test_median_with_group_by(series):
    """Median reducer per group vs the statistics.median baseline."""
    actual = (
        c.group_by(c.item(0))
        .aggregate(c.ReduceFuncs.Median(c.item(1)))
        .execute(series)
    )
    # statistics.median over each key's values, in first-seen key order.
    expected = [
        statistics.median(row[1] for row in series if row[0] == key)
        for key in ordered_set(row[0] for row in series)
    ]
    assert eq(actual, expected)
def test_top_k_with_group_by(series, k):
    """TopK reducer per group vs a Counter.most_common baseline."""
    actual = (
        c.group_by(c.item(0))
        .aggregate(c.ReduceFuncs.TopK(k, c.item(1)))
        .execute(series)
    )
    expected = []
    for key in ordered_set(row[0] for row in series):
        counts = Counter(row[1] for row in series if row[0] == key)
        # most_common yields (value, count) pairs; the original baseline
        # takes index 1 of each pair — preserved here verbatim.
        expected.append([pair[1] for pair in counts.most_common(k)])
    assert eq(actual, expected)
def test_mode_with_groupby():
    """Mode reducer per group vs the statistics.mode baseline."""
    series = [(0, 1), (0, 1), (0, 2), (1, 1), (1, 2), (1, 2)]
    actual = (
        c.group_by(c.item(0))
        .aggregate(c.ReduceFuncs.Mode(c.item(1)))
        .execute(series)
    )
    expected = [
        statistics.mode([row[1] for row in series if row[0] == key])
        for key in ordered_set(row[0] for row in series)
    ]
    assert eq(actual, expected)
def test_group_by_key_edge_case():
    """Labeling around reducers must raise; valid reducer pipes still work.

    NOTE(review): a function with this exact name is defined again later in
    this module and shadows this one — confirm whether both should run.
    """
    # Labels cannot be attached to conversions that wrap reducers.
    with pytest.raises(ValueError):
        c.this.add_label("row").pipe(c.ReduceFuncs.Count())
    with pytest.raises(ValueError):
        (c.this.add_label("row") + 1).pipe(c.ReduceFuncs.Count() + 1)
    with pytest.raises(ValueError):
        c.this.pipe(c.ReduceFuncs.Count(), label_input="row")
    data = [
        (0, 1),
        (1, 2),
    ]
    # Arithmetic on reducers inside a pipe: (sum / count) + 10 per group.
    assert c.group_by(c.item(0)).aggregate(
        c.if_(c.item(1), c.item(1), c.item(1)).pipe(
            (c.ReduceFuncs.Sum(c.this) / c.ReduceFuncs.Count(c.this)).pipe(
                c.this + 10
            )
        )
    ).gen_converter(debug=False)(data) == [11, 12]
    # label_output on a reducer pipe is allowed.
    assert c.group_by(c.item(0)).aggregate(
        c.item(1).pipe(
            c.ReduceFuncs.Sum(c.this), label_output="count"
        )
    ).gen_converter(debug=False)(data) == [1, 2]
def test_reducers():
    """Run every case from the reducers_in_out table through group_by."""
    for case in reducers_in_out:
        group_key = case["groupby"]
        converter = (
            c.group_by(group_key)
            .aggregate((group_key, case["reduce"]))
            .gen_converter(debug=case.get("debug", True))
        )
        expected_error = case["raises"]
        if expected_error:
            with pytest.raises(expected_error):
                converter(case["data"])
        else:
            assert converter(case["data"]) == case["output"]
def test_conditional_init_merges(): converter = (c.group_by(c.item(0)).aggregate([ c.ReduceFuncs.First(c.item(1)), c.ReduceFuncs.First(c.item(2)), c.ReduceFuncs.Min(c.item(1)), c.ReduceFuncs.Max(c.item(1)), c.ReduceFuncs.Min(c.item(2)), c.ReduceFuncs.Max(c.item(2)), c.ReduceFuncs.DictMin(c.item(0), c.item(1)), c.ReduceFuncs.DictMin(c.item(0), c.item(2)), c.ReduceFuncs.DictMax(c.item(0), c.item(1)), c.ReduceFuncs.DictMax(c.item(0), c.item(2)), c.ReduceFuncs.MaxRow(c.item(2)), c.ReduceFuncs.MaxRow(c.item(2), where=c.item(0) == 1).item(-1), ]).gen_converter(debug=True)) # fmt: off assert converter([ [1, 2, None], [1, 1, 4], [1, None, 3], ]) == [[2, None, 1, 2, 3, 4, { 1: 1 }, { 1: 3 }, { 1: 2 }, { 1: 4 }, [1, 1, 4], 4]] # fmt: on converter = (c.group_by(c.item(0)).aggregate([ c.ReduceFuncs.Min(c.item(1)), c.ReduceFuncs.Min(c.item(1)) + 1, ]).gen_converter(debug=True)) assert (sum( code_line.count("<") for code_line in next( conf["code_str"] for name, conf in converter._name_to_converter.items() if name.startswith("group_by"))) == 1)
def test_manually_defined_reducers():
    """c.reduce with a plain callable and an input_arg-selected field.

    NOTE(review): a function with this exact name is defined again later in
    this module and shadows this one — confirm whether both should run.
    """
    data = [
        {"name": "John", "category": "Games", "debit": 10, "balance": 90},
        {"name": "John", "category": "Games", "debit": 200, "balance": -110},
        {"name": "John", "category": "Food", "debit": 30, "balance": -140},
        {"name": "John", "category": "Games", "debit": 300, "balance": 0},
        {"name": "Nick", "category": "Food", "debit": 7, "balance": 50},
        {"name": "Nick", "category": "Games", "debit": 18, "balance": 32},
        {"name": "Bill", "category": "Games", "debit": 18, "balance": 120},
    ]
    # The summed field is chosen at call time via the "group_key" kwarg;
    # the trailing .filter keeps only sums > 20.
    grouper = (c.group_by(c.item("name")).aggregate(
        c.reduce(
            lambda a, b: a + b,
            c.item(c.input_arg("group_key")),
            initial=0,
        )
    ).filter(c.this() > 20).gen_converter(
        signature="data_, group_key='debit'"))
    assert grouper(data) == [540, 25]
    assert grouper(data, group_key="balance") == [82, 120]
def test_group_by_percentile():
    """Percentile reducer at several percentiles, grouped by key.

    Fix: the aggregate dict literal previously listed the key "min" twice;
    a Python dict keeps only the last duplicate, so the first (unfiltered)
    Percentile(0, ...) entry was silently discarded dead code. Only the
    surviving where-guarded variant is kept here — behavior is unchanged.
    """
    # For keys "a", "b", "c": values count down from 90, 91, 92 to 0.
    input_data = [
        {"key": key, "value": value}
        for index, key in enumerate("abc")
        for value in range(index + 90, -1, -1)
    ]
    c_round = c.call_func(round, c.this, 2)
    result = (
        c.group_by(c.item("key")).aggregate({
            "key": c.item("key"),
            # where=c.and_(default=True) is an always-true condition, so this
            # matches the plain Percentile(0, ...) while exercising "where".
            "min": c.ReduceFuncs.Percentile(
                0, c.item("value"), where=c.and_(default=True)
            ).pipe(c_round),
            "percentile_5": c.ReduceFuncs.Percentile(
                5, c.item("value")
            ).pipe(c_round),
            "median": c.ReduceFuncs.Percentile(
                50, c.item("value")
            ).pipe(c_round),
            "percentile_95": c.ReduceFuncs.Percentile(
                95, c.item("value")
            ).pipe(c_round),
            "max": c.ReduceFuncs.Percentile(
                100, c.item("value")
            ).pipe(c_round),
        }).execute(input_data)
    )
    assert result == [
        {
            "key": "a",
            "max": 90,
            "median": 45.0,
            "min": 0.0,
            "percentile_5": 4.5,
            "percentile_95": 85.5,
        },
        {
            "key": "b",
            "max": 91,
            "median": 45.5,
            "min": 0.0,
            "percentile_5": 4.55,
            "percentile_95": 86.45,
        },
        {
            "key": "c",
            "max": 92,
            "median": 46.0,
            "min": 0.0,
            "percentile_5": 4.6,
            "percentile_95": 87.4,
        },
    ]
def test_reducer_reuse(dict_series):
    """Two equivalent reduce() objects may be used in separate expressions."""
    add = lambda a, b: a + b
    first_sum = c.reduce(add, c.item("value"), initial=0)
    second_sum = c.reduce(add, c.item("value"), initial=0)
    output = (
        c.group_by(c.item("name"))
        .aggregate((c.item("name"), first_sum + 10, second_sum + 20))
        .execute(dict_series)
    )
    assert output == [
        ("Nick", 13, 23),
        ("John", 73, 83),
    ]
def test_group_by_key_edge_case():
    """Labeling around reducers must raise; valid reducer pipes still work.

    NOTE(review): this redefines a test of the same name earlier in the
    module, shadowing it — confirm whether that is intentional.
    """
    # Labels cannot be attached to conversions that wrap reducers.
    with pytest.raises(ValueError):
        c.this().add_label("row").pipe(c.ReduceFuncs.Count())
    with pytest.raises(ValueError):
        (c.this().add_label("row") + 1).pipe(c.ReduceFuncs.Count() + 1)
    with pytest.raises(ValueError):
        c.this().pipe(c.ReduceFuncs.Count(), label_input="row")
    data = [
        (0, 1),
        (1, 2),
    ]
    # TODO: try to test nested pipe (double overwrites)
    # TODO: reducer + label then pipe to somewhere
    # Arithmetic on reducers inside a pipe: (sum / count) + 10 per group.
    assert c.group_by(c.item(0)).aggregate(
        c.if_(c.item(1), c.item(1), c.item(1)).pipe(
            (c.ReduceFuncs.Sum(c.this()) / c.ReduceFuncs.Count(c.this())).pipe(
                c.this() + 10
            )
        )
    ).gen_converter(debug=False)(data) == [11, 12]
    # label_output on a reducer pipe is allowed.
    assert c.group_by(c.item(0)).aggregate(
        c.item(1).pipe(c.ReduceFuncs.Sum(c.this()), label_output="count")
    ).gen_converter(debug=False)(data) == [1, 2]
def test_iter_method():
    """iter()/filter()/as_type chains, standalone and inside group_by."""
    # Triple every element, drop falsy results (0 * 3 == 0).
    tripled = (
        c.this.iter(c.this * 3)
        .filter(c.this)
        .as_type(list)
        .execute(
            [1, 2, 3, 0, 1],
            debug=False,
        )
    )
    assert tripled == [3, 6, 9, 3]

    # iter() applied to the [key, max] pair produced per group.
    grouped = (
        c.group_by(c.item(0))
        .aggregate(
            c([
                c.item(0),
                c.item(1).pipe(c.ReduceFuncs.Max(c.this)),
            ])
            .iter(c.this * 100)
            .as_type(tuple)
        )
        .execute([(0, 1), (0, 2), (1, 7)], debug=False)
    )
    assert grouped == [
        (0, 200),
        (100, 700),
    ]
def test_iter_mut_method():
    """iter_mut(): in-place element mutations, chained and inside group_by."""
    assert c.iter(c.item(0)).as_type(list).execute([[1], [2]]) == [1, 2]
    # Mut.custom runs an arbitrary mutation against each element.
    assert c.iter_mut(c.Mut.custom(c.this.call_method("append", 7))).as_type(
        list
    ).execute([[1], [2]]) == [[1, 7], [2, 7]]
    # Chained iter_mut calls apply their set_item mutations in order.
    result = (
        c.this.iter({"a": c.this})
        .iter_mut(
            c.Mut.set_item("b", c.item("a") + 1),
            c.Mut.set_item("c", c.item("a") + 2),
        )
        .iter_mut(
            c.Mut.set_item("d", c.item("a") + 3),
        )
        .as_type(list)
        .execute([1, 2, 3], debug=False)
    )
    assert result == [
        {"a": 1, "b": 2, "c": 3, "d": 4},
        {"a": 2, "b": 3, "c": 4, "d": 5},
        {"a": 3, "b": 4, "c": 5, "d": 6},
    ]
    # iter_mut over the pair of dicts built per group; "x" mixes the dict's
    # values with an external input_arg.
    result = (
        c.group_by(c.item(0))
        .aggregate(
            c(
                [
                    {c.item(0): c.item(1).pipe(c.ReduceFuncs.Max(c.this))},
                    {c.item(1).pipe(c.ReduceFuncs.Max(c.this)): c.item(0)},
                ]
            )
            .iter_mut(
                c.Mut.set_item(
                    "x",
                    c.call_func(sum, c.this.call_method("values"))
                    + c.input_arg("base"),
                )
            )
            .as_type(tuple)
        )
        .execute([(0, 1), (0, 2), (1, 7)], base=100, debug=False)
    )
    assert result == [
        ({0: 2, "x": 102}, {2: 0, "x": 100}),
        ({1: 7, "x": 107}, {7: 1, "x": 101}),
    ]
def test_zip_in_aggregate():
    """c.zip over two Array reducers pairs their elements into dicts."""
    input_data = [
        ("kitchen", "size", 10),
        ("kitchen", "temperature", 40),
        ("living_room", "size", 12),
        ("living_room", "color", "white"),
    ]
    # Keyword form of c.zip yields {"room": ..., "value": ...} dicts.
    converter = (c.group_by(c.item(1)).aggregate({
        "prop": c.item(1),
        "values": c.zip(
            room=c.ReduceFuncs.Array(c.item(0)),
            value=c.ReduceFuncs.Array(c.item(2)),
        ).as_type(list),
    }).gen_converter())
    assert converter(input_data) == [
        {
            "prop": "size",
            "values": [
                {"room": "kitchen", "value": 10},
                {"room": "living_room", "value": 12},
            ],
        },
        {
            "prop": "temperature",
            "values": [{"room": "kitchen", "value": 40}],
        },
        {
            "prop": "color",
            "values": [{"room": "living_room", "value": "white"}],
        },
    ]
def test_grouping():
    """End-to-end group_by: callables, inline_exprs, where-filters, Max/MinRow.

    NOTE(review): a function with this exact name is defined again later in
    this module and shadows this one — confirm whether both should run.
    """
    data = [
        {"name": "John", "category": "Games", "debit": 10, "balance": 90},
        {"name": "John", "category": "Games", "debit": 200, "balance": -110},
        {"name": "John", "category": "Food", "debit": 30, "balance": -140},
        {"name": "John", "category": "Games", "debit": 300, "balance": 0},
        {"name": "Nick", "category": "Food", "debit": 7, "balance": 50},
        {"name": "Nick", "category": "Games", "debit": 18, "balance": 32},
        {"name": "Bill", "category": "Games", "debit": 18, "balance": 120},
    ]
    result = (c.group_by(c.item("name")).aggregate((
        c.item("name"),
        c.item("name").call_method("lower"),
        c.call_func(str.lower, c.item("name")),
        # plain callable reducer seeded from an input_arg
        c.reduce(
            lambda a, b: a + b,
            c.item("debit"),
            initial=c.input_arg("arg1"),
            unconditional_init=True,
        ),
        # inline_expr reducer with a callable initial
        c.reduce(
            c.inline_expr("{0} + {1}"),
            c.item("debit"),
            initial=lambda: 100,
            unconditional_init=True,
        ),
        # max debit among negative-balance rows, default from input_arg
        c.reduce(
            max,
            c.item("debit"),
            prepare_first=lambda a: a,
            default=c.input_arg("arg1"),
            where=c.call_func(lambda x: x < 0, c.item("balance")),
        ),
        c.call_func(
            lambda max_debit, n: max_debit * n,
            c.reduce(
                max,
                c.item("debit"),
                prepare_first=lambda a: a,
                default=0,
                where=c.call_func(lambda x: x < 0, c.item("balance")),
            ),
            1000,
        ),
        c.call_func(
            lambda max_debit, n: max_debit * n,
            c.reduce(
                c.ReduceFuncs.Max,
                c.item("debit"),
                default=1000,
                where=c.inline_expr("{0} > {1}").pass_args(
                    c.item("balance"),
                    c.input_arg("arg2"),
                ),
            ),
            -1,
        ),
        # whole-row reducers, then a field taken off the winning row
        c.reduce(c.ReduceFuncs.MaxRow, c.item("debit")).item("balance"),
        c.reduce(c.ReduceFuncs.MinRow, c.item("debit")).item("balance"),
    )).sort(key=lambda t: t[0].lower(), reverse=True).execute(
        data, arg1=100, arg2=0, debug=False))
    # fmt: off
    assert result == [
        ('Nick', 'nick', 'nick', 125, 125, 100, 0, -18, 32, 50),
        ('John', 'john', 'john', 640, 640, 200, 200000, -10, 0, 90),
        ('Bill', 'bill', 'bill', 118, 118, 100, 0, -18, 120, 120),
    ]
    # fmt: on

    with pytest.raises(c.ConversionException):
        # there's a single group by field, while we use separate items
        # of this tuple in aggregate
        result = (c.group_by(c.item("name")).aggregate((
            c.item("category"),
            c.reduce(c.ReduceFuncs.Sum, c.item("debit")),
        )).execute(data, debug=False))

    # A conversion (tuple of names) can itself be a dict key in aggregate.
    aggregation = {
        c.call_func(
            tuple,
            c.ReduceFuncs.Array(c.item("name"), default=None),
        ): c.item("category").call_method("lower"),
        "count": c.ReduceFuncs.Count(),
        "max": c.ReduceFuncs.Max(c.item("debit")),
        "min": c.ReduceFuncs.Min(c.item("debit")),
        "count_distinct": c.ReduceFuncs.CountDistinct(c.item("name")),
        "array_agg_distinct": c.ReduceFuncs.ArrayDistinct(c.item("name")),
        "dict": c.ReduceFuncs.Dict(c.item("debit"), c.item("name")),
    }
    result = (c.group_by(c.item("category")).aggregate(aggregation).execute(
        data, debug=False))
    result2 = (c.group_by(c.item("category")).aggregate(
        c.dict(*aggregation.items())).execute(data, debug=False))
    # fmt: off
    assert result == result2 == [
        {
            'array_agg_distinct': ['John', 'Nick', 'Bill'],
            'count': 5,
            'count_distinct': 3,
            'dict': {
                10: 'John',
                18: 'Bill',
                200: 'John',
                300: 'John'
            },
            'max': 300,
            'min': 10,
            ('John', 'John', 'John', 'Nick', 'Bill'): 'games'
        },
        {
            'array_agg_distinct': ['John', 'Nick'],
            'count': 2,
            'count_distinct': 2,
            'dict': {
                7: 'Nick',
                30: 'John'
            },
            'max': 30,
            'min': 7,
            ('John', 'Nick'): 'food'
        }
    ]
    # fmt: on

    # Aggregate result piped into an inline expression: sum + sum.
    result3 = (c.aggregate(c.ReduceFuncs.Sum(c.item("debit"))).pipe(
        c.inline_expr("{0} + {1}").pass_args(c.this(), c.this())).execute(
            data, debug=False))
    assert result3 == 583 * 2

    by = c.item("name"), c.item("category")
    result4 = (c.group_by(
        *by).aggregate(by + (c.ReduceFuncs.Sum(c.item("debit")), )).execute(
            data, debug=False))
    # fmt: off
    assert result4 == [('John', 'Games', 510),
                       ('John', 'Food', 30),
                       ('Nick', 'Food', 7),
                       ('Nick', 'Games', 18),
                       ('Bill', 'Games', 18)]
    # fmt: on

    # group_by() with no keys behaves like a plain aggregate.
    result5 = (c.group_by().aggregate(c.ReduceFuncs.Sum(
        c.item("debit"))).execute(data, debug=False))
    assert result5 == 583

    with pytest.raises(c.ConversionException):
        # there's a single group by field, while we use separate items
        # of this tuple in aggregate
        (c.group_by(by).aggregate(
            by + (c.reduce(c.ReduceFuncs.Sum, c.item("debit")), )).execute(
                data, debug=False))
def test_blank_aggregate(series):
    """group_by with a bare key expression yields just the distinct keys."""
    produced = c.group_by(c.item(0)).aggregate(c.item(0)).execute(series)
    # dict comprehension keeps first-seen order and deduplicates keys
    distinct_keys = list({row[0]: 1 for row in series})
    assert eq(produced, distinct_keys)
def test_group_by_with_pipes():
    """Pipes chained after reducers; reducer object reuse; slicing results."""
    # fmt: off
    input_data = [
        {"name": "John", "started_at": date(2020, 1, 1),
         "stopped_at": None, "product": "A"},
        {"name": "John", "started_at": date(2020, 1, 1),
         "stopped_at": date(2020, 1, 2), "product": "B"},
        {"name": "John", "started_at": date(2020, 1, 1),
         "stopped_at": None, "product": "C"},
        {"name": "Nick", "started_at": date(2020, 1, 1),
         "stopped_at": None, "product": "D"},
        {"name": "Nick", "started_at": date(2020, 2, 1),
         "stopped_at": None, "product": "D"},
        {"name": "Nick", "started_at": date(2020, 2, 1),
         "stopped_at": None, "product": "E"},
    ]
    # fmt: on
    # Distinct products of still-active rows, filtered, sorted, joined.
    output = (c.group_by(
        c.item("name"),
        c.item("started_at"),
    ).aggregate({
        "name": c.item("name"),
        "started_at": c.item("started_at"),
        "products": c.ReduceFuncs.ArrayDistinct(
            c.if_(
                c.item("stopped_at").is_(None),
                c.item("product"),
                None,
            ),
        ).pipe(c.filter(c.this())).pipe(
            c.call_func(sorted, c.this()).pipe(
                c(", ").call_method("join", c.this()))).pipe(c.this()),
    }).execute(input_data))
    # fmt: off
    assert output == [{
        'name': 'John',
        'products': 'A, C',
        'started_at': date(2020, 1, 1)
    }, {
        'name': 'Nick',
        'products': 'D',
        'started_at': date(2020, 1, 1)
    }, {
        'name': 'Nick',
        'products': 'D, E',
        'started_at': date(2020, 2, 1)
    }]
    # fmt: on

    # A reducer piped into and sliced: first three distinct products.
    reducer = c.ReduceFuncs.Array(c.this(), default=list)
    output = (c.group_by(
        c.this()["name"],
        c.this()["started_at"],
    ).aggregate({
        "name": c.this()["name"],
        "started_at": c.this()["started_at"],
        "products": c.this()["product"].pipe(reducer)[:3],
    }).execute(input_data))
    assert output == [
        {
            "name": "John",
            "products": ["A", "B", "C"],
            "started_at": date(2020, 1, 1),
        },
        {
            "name": "Nick",
            "products": ["D"],
            "started_at": date(2020, 1, 1),
        },
        {
            "name": "Nick",
            "products": ["D", "E"],
            "started_at": date(2020, 2, 1),
        },
    ]
def test_manually_defined_reducers():
    """c.reduce with a callable plus filter()/cast variants on the grouper.

    NOTE(review): this redefines a test of the same name earlier in the
    module, shadowing it — confirm whether that is intentional.
    """
    data = [
        {"name": "John", "category": "Games", "debit": 10, "balance": 90},
        {"name": "John", "category": "Games", "debit": 200, "balance": -110},
        {"name": "John", "category": "Food", "debit": 30, "balance": -140},
        {"name": "John", "category": "Games", "debit": 300, "balance": 0},
        {"name": "Nick", "category": "Food", "debit": 7, "balance": 50},
        {"name": "Nick", "category": "Games", "debit": 18, "balance": 32},
        {"name": "Bill", "category": "Games", "debit": 18, "balance": 120},
    ]
    # initial/default given as the int type itself (called to produce 0).
    grouper_base = c.group_by(c.item("name")).aggregate(
        c.reduce(
            lambda a, b: a + b,
            c.item(c.input_arg("group_key")),
            initial=int,
            default=int,
        ))
    grouper = grouper_base.filter(c.this > 20).gen_converter(
        signature="data_, group_key='debit'", debug=False)
    assert grouper(data) == [540, 25]
    # Without an explicit cast the filtered result is re-listed by the caller.
    assert list(grouper(data, group_key="balance")) == [82, 120]
    grouper = grouper_base.filter(
        (c.this > 20), cast=list).gen_converter(
            signature="data_, group_key='debit'", debug=False)
    assert grouper(data) == [540, 25]
    grouper = grouper_base.filter(
        (c.this > 20), cast=set).gen_converter(
            signature="data_, group_key='debit'", debug=False)
    assert grouper(data, group_key="balance") == {82, 120}
def test_nested_group_by():
    """Nested aggregations plus a two-stage merge of grouped arrays.

    NOTE(review): this redefines a test of the same name earlier in the
    module, shadowing it — confirm whether that is intentional.
    """
    data = [
        [0, [1, 2, 3]],
        [0, [4, 5, 6]],
        [1, [2, 3, 4]],
    ]
    # Sum of inner-list sums, per group key.
    assert c.group_by(c.item(0)).aggregate((
        c.item(0),
        c.ReduceFuncs.Sum(
            c.item(1).pipe(c.aggregate(c.ReduceFuncs.Sum(c.this())))),
    )).execute(data, debug=False) == [
        (0, 21),
        (1, 9),
    ]
    agg_conv = c.aggregate(c.ReduceFuncs.Sum(c.this()))
    # Same result through deliberately convoluted no-op if_/pipe wrappers.
    assert c.group_by(c.item(0)).aggregate((
        c.item(0),
        c.if_(
            c.item(1),
            c.item(1),
            c.item(1),
        ).pipe(
            c.if_(
                c.this(),
                c.this(),
                c.this(),
            ).pipe(
                c.ReduceFuncs.Sum(
                    c.if_(
                        c.this(),
                        c.this(),
                        c.this(),
                    ).pipe((agg_conv, agg_conv)).pipe(c.item(1))).pipe(
                        c.if_(
                            c.this(),
                            c.this(),
                            c.this(),
                        )),
            )),
    )).execute(data, debug=False) == [
        (0, 21),
        (1, 9),
    ]

    # Merge rows collected per id_ into one dict; "value2" lists get summed.
    summer = c.aggregate(c.ReduceFuncs.Sum(c.this()))
    merger = c.aggregate({
        "value1": c.ReduceFuncs.First(
            c.item("value1"), where=c("value1").in_(c.this())),
        "value2": c.ReduceFuncs.First(
            c.item("value2"), where=c("value2").in_(c.this())).pipe(
                c.if_(c.this(), c.this().pipe(summer))),
    })
    converter = (c.group_by(c.item("id_")).aggregate({
        "id_": c.item("id_"),
        "data": c.ReduceFuncs.Array(c.this()).pipe(merger),
    }).gen_converter(debug=False))
    assert converter([
        {"id_": 1, "value1": 2},
        {"id_": 2, "value1": 3},
        {"id_": 2, "value2": [1, 2, 3]},
    ]) == [
        {"id_": 1, "data": {"value1": 2, "value2": None}},
        {"id_": 2, "data": {"value1": 3, "value2": 6}},
    ]

    def g():
        yield 1
        raise Exception

    # The generator raises on its second item, so this passing implies the
    # aggregate stops consuming once First() has its value.
    assert (c.aggregate(c.ReduceFuncs.First(c.this())).execute(
        g(), debug=False)) == 1
def test_doc__index_intro():
    """Examples from the docs index page: group_by, aggregate and join."""
    # ======== #
    # GROUP BY #
    # ======== #
    input_data = [
        {"a": 5, "b": "foo"},
        {"a": 10, "b": "foo"},
        {"a": 10, "b": "bar"},
        {"a": 10, "b": "bar"},
        {"a": 20, "b": "bar"},
    ]
    conv = (c.group_by(c.item("b")).aggregate({
        "b": c.item("b"),
        "a_first": c.ReduceFuncs.First(c.item("a")),
        "a_max": c.ReduceFuncs.Max(c.item("a")),
    }).gen_converter(debug=True))
    assert conv(input_data) == [
        {"b": "foo", "a_first": 5, "a_max": 10},
        {"b": "bar", "a_first": 10, "a_max": 20},
    ]

    # ========= #
    # AGGREGATE #
    # ========= #
    conv = c.aggregate({
        # list of "a" values where "b" equals to "bar"
        "a": c.ReduceFuncs.Array(c.item("a"), where=c.item("b") == "bar"),
        # "b" value of a row where "a" has Max value
        "b": c.ReduceFuncs.MaxRow(c.item("a"), ).item("b", default=None),
    }).gen_converter(debug=True)
    assert conv(input_data) == {"a": [10, 10, 20], "b": "bar"}

    # ==== #
    # JOIN #
    # ==== #
    collection_1 = [
        {"id": 1, "name": "Nick"},
        {"id": 2, "name": "Joash"},
        {"id": 3, "name": "Bob"},
    ]
    collection_2 = [
        {"ID": "3", "age": 17, "country": "GB"},
        {"ID": "2", "age": 21, "country": "US"},
        {"ID": "1", "age": 18, "country": "CA"},
    ]
    input_data = (collection_1, collection_2)
    # Left join on id == int(ID) with an age predicate; unmatched rights
    # surface as None, hence the default=None item lookups below.
    conv = (c.join(
        c.item(0),
        c.item(1),
        c.and_(
            c.LEFT.item("id") == c.RIGHT.item("ID").as_type(int),
            c.RIGHT.item("age") >= 18,
        ),
        how="left",
    ).pipe(
        c.list_comp({
            "id": c.item(0, "id"),
            "name": c.item(0, "name"),
            "age": c.item(1, "age", default=None),
            "country": c.item(1, "country", default=None),
        })).gen_converter(debug=True))
    assert conv(input_data) == [
        {"id": 1, "name": "Nick", "age": 18, "country": "CA"},
        {"id": 2, "name": "Joash", "age": 21, "country": "US"},
        {"id": 3, "name": "Bob", "age": None, "country": None},
    ]
def test_grouping():
    """Legacy-API grouping: c.reduce(...) with .filter() on reducers.

    NOTE(review): same name as an earlier test in this module — this later
    definition shadows the earlier one; confirm whether that is intended.
    """
    data = [
        {"name": "John", "category": "Games", "debit": 10, "balance": 90},
        {"name": "John", "category": "Games", "debit": 200, "balance": -110},
        {"name": "John", "category": "Food", "debit": 30, "balance": -140},
        {"name": "John", "category": "Games", "debit": 300, "balance": 0},
        {"name": "Nick", "category": "Food", "debit": 7, "balance": 50},
        {"name": "Nick", "category": "Games", "debit": 18, "balance": 32},
        {"name": "Bill", "category": "Games", "debit": 18, "balance": 120},
    ]
    result = (c.group_by(c.item("name")).aggregate((
        c.item("name"),
        c.item("name").call_method("lower"),
        c.call_func(str.lower, c.item("name")),
        # plain callable reducer seeded from an input_arg
        c.reduce(
            lambda a, b: a + b,
            c.item("debit"),
            initial=c.input_arg("arg1"),
        ),
        c.reduce(
            c.inline_expr("{0} + {1}"),
            c.item("debit"),
            initial=lambda: 100,
        ),
        # max debit among negative-balance rows (filter instead of where=)
        c.reduce(max, c.item("debit"), default=c.input_arg("arg1")).filter(
            c.call_func(lambda x: x < 0, c.item("balance"))),
        c.call_func(
            lambda max_debit, n: max_debit * n,
            c.reduce(max, c.item("debit"), default=0).filter(
                c.call_func(lambda x: x < 0, c.item("balance"))),
            1000,
        ),
        c.call_func(
            lambda max_debit, n: max_debit * n,
            c.reduce(
                c.ReduceFuncs.Max,
                c.item("debit"),
                default=1000,
            ).filter(c.inline_expr("{0} > 0").pass_args(c.item("balance"))),
            -1,
        ),
        c.reduce(
            c.ReduceFuncs.MaxRow,
            c.item("debit"),
        ).item("balance"),
        c.reduce(
            c.ReduceFuncs.MinRow,
            c.item("debit"),
        ).item("balance"),
    )).sort(key=lambda t: t[0].lower(), reverse=True).execute(
        data, arg1=100, debug=False))
    # fmt: off
    assert result == [
        ('Nick', 'nick', 'nick', 125, 125, 100, 0, -18, 32, 50),
        ('John', 'john', 'john', 640, 640, 200, 200000, -10, 0, 90),
        ('Bill', 'bill', 'bill', 118, 118, 100, 0, -18, 120, 120)
    ]
    # fmt: on

    # A conversion (tuple of names) can itself be a dict key in aggregate.
    aggregation = {
        c.call_func(
            tuple,
            c.reduce(c.ReduceFuncs.Array, c.item("name"), default=None),
        ): c.item("category").call_method("lower"),
        "count": c.reduce(c.ReduceFuncs.Count),
        "count_distinct": c.reduce(
            c.ReduceFuncs.CountDistinct, c.item("name")),
        "array_agg_distinct": c.reduce(
            c.ReduceFuncs.ArrayDistinct,
            c.item("name"),
        ),
        "dict": c.reduce(
            c.ReduceFuncs.Dict, (c.item("debit"), c.item("name"))),
    }
    result = (c.group_by(c.item("category")).aggregate(aggregation).execute(
        data, debug=False))
    result2 = (c.group_by(c.item("category")).aggregate(
        c.dict(*aggregation.items())).execute(data, debug=False))
    # fmt: off
    assert result == result2 == [
        {
            'array_agg_distinct': ['John', 'Nick', 'Bill'],
            'count': 5,
            'count_distinct': 3,
            'dict': {
                10: 'John',
                18: 'Bill',
                200: 'John',
                300: 'John'
            },
            ('John', 'John', 'John', 'Nick', 'Bill'): 'games'
        },
        {
            'array_agg_distinct': ['John', 'Nick'],
            'count': 2,
            'count_distinct': 2,
            'dict': {
                7: 'Nick',
                30: 'John'
            },
            ('John', 'Nick'): 'food'
        }
    ]
    # fmt: on

    # Aggregate result piped into an inline expression: sum + sum.
    result3 = (c.aggregate(c.reduce(c.ReduceFuncs.Sum, c.item("debit"))).pipe(
        c.inline_expr("{0} + {1}").pass_args(c.this(), c.this())).execute(
            data, debug=False))
    assert result3 == 583 * 2

    by = c.item("name"), c.item("category")
    result4 = (c.group_by(*by).aggregate(by + (
        c.reduce(c.ReduceFuncs.Sum, c.item("debit")),
    )).execute(data, debug=False))
    # fmt: off
    assert result4 == [('John', 'Games', 510),
                       ('John', 'Food', 30),
                       ('Nick', 'Food', 7),
                       ('Nick', 'Games', 18),
                       ('Bill', 'Games', 18)]
    # fmt: on

    # group_by() with no keys behaves like a plain aggregate.
    result5 = (c.group_by().aggregate(
        c.reduce(c.ReduceFuncs.Sum, c.item("debit"))).execute(
            data, debug=False))
    assert result5 == 583
def test_doc__quickstart_aggregation(): input_data = [ { "company_name": "Facebrochure", "company_hq": "CA", "app_name": "Tardygram", "date": "2019-01-01", "country": "US", "sales": Decimal("45678.98"), }, { "company_name": "Facebrochure", "company_hq": "CA", "app_name": "Tardygram", "date": "2019-01-02", "country": "US", "sales": Decimal("86869.12"), }, { "company_name": "Facebrochure", "company_hq": "CA", "app_name": "Tardygram", "date": "2019-01-03", "country": "CA", "sales": Decimal("45000.35"), }, { "company_name": "BrainCorp", "company_hq": "NY", "app_name": "Learn FT", "date": "2019-01-01", "country": "US", "sales": Decimal("86869.12"), }, ] # we are going to reuse this reducer top_sales_day = c.ReduceFuncs.MaxRow(c.item("sales")) # so the result is going to be a list of dicts converter = ( c.group_by(c.item("company_name")) .aggregate( { "company_name": c.item("company_name").call_method("upper"), # this would work as well # c.item("company_name"): ..., "none_sensitive_sum": c.ReduceFuncs.SumOrNone(c.item("sales")), # as you can see, next two reduce objects do the same except taking # different fields after finding a row with max value. 
# but please check the generated code below, you'll see that it is # calculated just once AND then reused to take necessary fields "top_sales_app": top_sales_day.item("app_name"), "top_sales_day": ( top_sales_day.item("date") .pipe( datetime.strptime, "%Y-%m-%d", ) .call_method("date") ), "company_hq": c.ReduceFuncs.First(c.item("company_hq")), "app_name_to_countries": c.ReduceFuncs.DictArrayDistinct( c.item("app_name"), c.item("country") ), "app_name_to_sales": c.ReduceFuncs.DictSum( c.item("app_name"), c.item("sales") ), } ) .gen_converter(debug=True) ) assert converter(input_data) == [ { "app_name_to_countries": {"Tardygram": ["US", "CA"]}, "app_name_to_sales": {"Tardygram": Decimal("177548.45")}, "company_hq": "CA", "company_name": "FACEBROCHURE", "none_sensitive_sum": Decimal("177548.45"), "top_sales_app": "Tardygram", "top_sales_day": date(2019, 1, 2), }, { "app_name_to_countries": {"Learn FT": ["US"]}, "app_name_to_sales": {"Learn FT": Decimal("86869.12")}, "company_hq": "NY", "company_name": "BRAINCORP", "none_sensitive_sum": Decimal("86869.12"), "top_sales_app": "Learn FT", "top_sales_day": date(2019, 1, 1), }, ]