def test_aggregation_multiple_functions_success():
    """Aggregate 'sepalwidth' per 'class' with return_funcs('sepalwidth').

    The generated code is executed against the iris sample and its 'out'
    frame must equal an equivalent pandas named aggregation.
    """
    df = util.iris(['class', 'sepalwidth'], size=150)
    # Snapshot taken before the generated code runs on df.
    baseline = df.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['class'],
            'function': return_funcs('sepalwidth'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {'df': df})

    # alias -> (source column, aggregation); order fixes the column order.
    named_aggregations = {
        'sepal_avg': ('sepalwidth', 'mean'),
        'sepal_collect_list': ('sepalwidth', _collect_list),
        'sepal_collect_set': ('sepalwidth', _collect_set),
        'sepal_count': ('sepalwidth', 'count'),
        'sepal_first': ('sepalwidth', 'first'),
        'sepal_last': ('sepalwidth', 'last'),
        'sepal_max': ('sepalwidth', 'max'),
        'sepal_min': ('sepalwidth', 'min'),
        'sepal_sum': ('sepalwidth', 'sum'),
        'sepal_size': ('sepalwidth', 'size'),
    }
    grouped = baseline.groupby('class')
    expected = grouped.agg(**named_aggregations).reset_index()
    assert result['out'].equals(expected)
def test_aggregation_asterisk_success():
    """Use '*' as the function attribute together with 'count'.

    The expected frame counts 'class' within each 'class' group under the
    alias 'class_count'.
    """
    df = util.iris(['class'], size=150)
    baseline = df.copy()

    count_spec = {'attribute': '*', 'f': 'count', 'alias': 'class_count'}
    operation = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [count_spec]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {'df': df})

    grouped = baseline.groupby(['class'])
    expected = grouped.agg(class_count=('class', 'count')).reset_index()
    assert result['out'].equals(expected)
def test_aggregation_pivot_table_success():
    """Run the operation with a 'pivot' on 'class' and compare with pandas.

    The expected frame is built with pd.pivot_table and its multi-level
    column labels are flattened the same way the assertion expects.
    """
    df = util.iris(['class', 'sepalwidth', 'petalwidth'], size=150)
    baseline = df.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['petalwidth'],
            'function': [{'attribute': 'petalwidth', 'f': 'count'}],
            'pivot': ['class'],
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {'df': df})

    expected = pd.pivot_table(
        baseline,
        index=['petalwidth'],
        columns=['class'],
        aggfunc={"petalwidth": ['count']},
    )
    expected.reset_index(inplace=True)

    # Flatten each column tuple: keep only the first label when the second
    # one is empty, otherwise join the three labels with underscores.
    flat_names = []
    for label in expected.columns:
        if label[1] == '':
            flat_names.append(label[0])
        else:
            flat_names.append("%s_%s_%s" % (label[0], label[1], label[2]))
    expected.columns = flat_names

    assert result['out'].equals(expected)
def test_aggregation_non_numeric_attributes_success():
    """Aggregate the 'homedest' column with return_funcs(..., drop='avg').

    The generated code runs on the titanic sample and must match the pandas
    named aggregation below, which has no 'mean' entry.
    """
    df = util.titanic(['homedest'], size=150)
    baseline = df.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['homedest'],
            'function': return_funcs('homedest', drop='avg'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {'df': df})

    # alias -> (source column, aggregation); order fixes the column order.
    named_aggregations = {
        'home_collect_list': ('homedest', _collect_list),
        'home_collect_set': ('homedest', _collect_set),
        'home_count': ('homedest', 'count'),
        'home_first': ('homedest', 'first'),
        'home_last': ('homedest', 'last'),
        'home_max': ('homedest', 'max'),
        'home_min': ('homedest', 'min'),
        'home_sum': ('homedest', 'sum'),
        'home_size': ('homedest', 'size'),
    }
    grouped = baseline.groupby(['homedest'])
    expected = grouped.agg(**named_aggregations).reset_index()
    assert result['out'].equals(expected)
def test_aggregation_rows_minimal_params_success():
    """Compare the code generated for one AVG aggregation with a reference.

    Only the function and attributes parameters are supplied; generated and
    expected sources are compared through their ASTs.
    """
    avg_income = {'attribute': 'income', 'f': 'AVG', 'alias': 'avg_income'}
    params = {
        AggregationOperation.FUNCTION_PARAM: [avg_income],
        AggregationOperation.ATTRIBUTES_PARAM: ['country'],
    }
    operation = AggregationOperation(
        params,
        named_inputs={'input data': 'input_1'},
        named_outputs={'output data': 'output_1'},
    )
    code = operation.generate_code()

    expected_code = dedent("""
        def _collect_list(x):
            return x.tolist()
        def _merge_set(x):
            return set(x.tolist())

        columns = ['country']
        target = {'income': ['avg_income']}
        operations = {'income': ['AVG']}

        output_1 = input_1.groupby(columns).agg(operations)
        new_idx = []
        i = 0
        old = None
        for (n1, n2) in output_1.columns.ravel():
            if old != n1:
                old = n1
                i = 0
            new_idx.append(target[n1][i])
            i += 1

        output_1.columns = new_idx
        output_1 = output_1.reset_index()
        output_1.reset_index(drop=True, inplace=True)
        """)

    generated_tree = ast.parse(code)
    expected_tree = ast.parse(expected_code)
    result, msg = compare_ast(generated_tree, expected_tree)
    assert result, msg + format_code_comparison(code, expected_code)
def test_aggregation_missing_input_implies_no_code_success():
    """With empty named_inputs, generate_code() must return None."""
    count_spec = {'attribute': 'class', 'f': 'count', 'alias': 'class_count'}
    operation = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [count_spec]},
        named_inputs={},
        named_outputs={'output data': 'out'},
    )
    generated = operation.generate_code()
    assert generated is None
def test_aggregation_multiple_attributes_and_functions_success():
    """Pass several function dicts, each with its own attribute and alias.

    Every dict given in 'function' carries 'attribute', 'f' and 'alias'.
    Here 'sepalwidth' gets 'sum' and 'size', and 'petalwidth' gets 'min' and
    'max', each under its own alias.
    """
    df = util.iris(['sepalwidth', 'petalwidth', 'class'], size=150)
    baseline = df.copy()

    function_specs = [
        {'attribute': 'sepalwidth', 'f': 'sum', 'alias': 'sepal_sum'},
        {'attribute': 'sepalwidth', 'f': 'size', 'alias': 'sepal_size'},
        {'attribute': 'petalwidth', 'f': 'min', 'alias': 'petal_min'},
        {'attribute': 'petalwidth', 'f': 'max', 'alias': 'petal_max'},
    ]
    operation = AggregationOperation(
        parameters={'attributes': ['class'], 'function': function_specs},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {'df': df})

    named_aggregations = {
        'sepal_sum': ("sepalwidth", "sum"),
        'sepal_size': ("sepalwidth", "size"),
        'petal_min': ("petalwidth", "min"),
        'petal_max': ("petalwidth", "max"),
    }
    grouped = baseline.groupby(['class'])
    expected = grouped.agg(**named_aggregations).reset_index()
    assert result['out'].equals(expected)
def test_aggregation_non_numeric_attributes_fail():
    """Executing the full return_funcs('homedest') set must raise DataError.

    Unlike the success variant, no function is dropped from the set, and
    the error text must mention that there are no numeric types to
    aggregate.
    """
    df = util.titanic(['homedest'], size=150)
    operation = AggregationOperation(
        parameters={
            'attributes': ['homedest'],
            'function': return_funcs('homedest'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(pd.core.base.DataError) as data_err:
        util.execute(operation.generate_code(), {'df': df})

    error_text = str(data_err.value)
    assert "No numeric types to aggregate" in error_text
def test_aggregation_with_pivot_values_success():
    """Compare the code generated for a pivot with 'pivot_values' to a reference.

    The operation groups by 'sex', takes the max of 'fare', pivots on
    'class' and restricts the pivot to the values [1, 2]. Generated and
    expected sources are compared through their ASTs; on mismatch the
    failure message includes a rendering of both sources, as the other
    AST-comparison test in this module does.
    """
    params = {
        AggregationOperation.ATTRIBUTES_PARAM: ["sex"],
        AggregationOperation.FUNCTION_PARAM: [{
            "attribute": "fare",
            "f": "max",
            "alias": "sex"
        }],
        "pivot": ["class"],
        "pivot_values": [1, 2],
    }
    n_in = {'input data': 'input_1'}
    n_out = {'output data': 'output_1'}

    instance = AggregationOperation(params, named_inputs=n_in,
                                    named_outputs=n_out)
    code = instance.generate_code()

    # Reference for the generated source; it is only parsed, never executed,
    # so it is kept exactly as the generator is expected to emit it.
    expected_code = dedent("""
        def _collect_list(x):
            return x.tolist()
        def _merge_set(x):
            return set(x.tolist())
        values = [1, 2]
        input_1 = input_1[input_1['class'].isin(values)]
        aggfunc = {'fare': ['max']}
        output_1 = pd.pivot_table(input_1, index=['sex'], values=['fare'],
                                  columns=['class'], aggfunc=aggfunc)
        # rename columns and convert to DataFrame
        output_1.reset_index(inplace=True)
        new_idx = [n[0] if n[1] is '' else "%s_%s_%s" % (n[0],n[1], n[2])
                   for n in output_1.columns.ravel()]
        output_1 = pd.DataFrame(output_1.to_records())
        output_1.reset_index(drop=True, inplace=True)
        output_1 = output_1.drop(columns='index')
        output_1.columns = new_idx
        """)

    result, msg = compare_ast(ast.parse(code), ast.parse(expected_code))
    # Show both sources on failure instead of the bare comparison message.
    assert result, msg + format_code_comparison(code, expected_code)
def test_aggregation_success():
    """Run the operation with empty parameters on a 10-row iris slice.

    The 'out' frame must equal util.iris(size=10).
    """
    # NOTE(review): 'parameters' is empty here, while
    # test_aggregation_missing_function_param_fail expects a ValueError when
    # 'function' is absent - confirm this test is still expected to pass.
    slice_size = 10
    input_name = 'df'
    sample = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        slice_size)

    operation = AggregationOperation(
        parameters={},
        named_inputs={'input data': input_name},
        named_outputs={'output data': 'out'},
    )
    result = util.execute(operation.generate_code(), {input_name: sample})

    expected = util.iris(size=slice_size)
    assert result['out'].equals(expected)
def test_aggregation_missing_attribute_param_fail():
    """Without 'attributes', executing the generated code raises TypeError.

    The error text must contain the "supply one of 'by' and 'level'"
    message.
    """
    df = util.iris(['class'], size=150)
    count_spec = {'attribute': 'class', 'f': 'count', 'alias': 'class_count'}
    operation = AggregationOperation(
        parameters={'function': [count_spec]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(TypeError) as typ_err:
        util.execute(operation.generate_code(), {'df': df})

    error_text = str(typ_err.value)
    assert "You have to supply one of 'by' and 'level'" in error_text
def test_aggregation_invalid_function_param_alias_fail():
    """An empty alias must make executing the generated code a SyntaxError."""
    df = util.iris(['class'], size=150)
    empty_alias_spec = {'attribute': 'class', 'f': 'count', 'alias': ''}
    operation = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [empty_alias_spec]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(SyntaxError) as syn_err:
        util.execute(operation.generate_code(), {'df': df})

    error_text = str(syn_err.value)
    assert "invalid syntax" in error_text
def test_aggregation_invalid_pivot_table_fail():
    """A 'pivot' given as the plain string 'invalid' must raise NameError.

    The error is raised while executing the generated code and must say
    that the name 'invalid' is not defined.
    """
    df = util.iris(['class', 'sepalwidth', 'petalwidth'], size=150)
    operation = AggregationOperation(
        parameters={
            'attributes': ['petalwidth'],
            'function': [{'attribute': 'petalwidth', 'f': 'count'}],
            'pivot': 'invalid',
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(NameError) as nam_err:
        util.execute(operation.generate_code(), {'df': df})

    error_text = str(nam_err.value)
    assert "name 'invalid' is not defined" in error_text
def test_aggregation_missing_function_param_fail():
    """Constructing the operation without 'function' must raise ValueError."""
    op_kwargs = dict(
        parameters={'attributes': ['class']},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(ValueError) as val_err:
        AggregationOperation(**op_kwargs)

    error_text = str(val_err.value)
    assert "Parameter 'function' must be informed for task" in error_text
def test_aggregation_invalid_function_param_fail():
    """A plain string as 'function' must raise TypeError on construction."""
    op_kwargs = dict(
        parameters={'attributes': ['class'], 'function': 'invalid'},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(TypeError) as typ_err:
        AggregationOperation(**op_kwargs)

    error_text = str(typ_err.value)
    assert "string indices must be integers" in error_text
def test_aggregation_missing_function_param_function_fail():
    """A function dict lacking the 'f' key must raise KeyError on construction."""
    spec_without_f = {'attribute': 'class', 'alias': 'class_count'}
    op_kwargs = dict(
        parameters={'attributes': ['class'], 'function': [spec_without_f]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(KeyError) as key_err:
        AggregationOperation(**op_kwargs)

    error_text = str(key_err.value)
    assert "f" in error_text
def test_aggregation_missing_function_param_failure():
    """Only the attributes parameter is given, so construction raises ValueError."""
    only_attributes = {AggregationOperation.ATTRIBUTES_PARAM: ['country']}
    inputs = {'input data': 'input_1'}
    outputs = {'output data': 'output_1'}

    with pytest.raises(ValueError):
        AggregationOperation(only_attributes,
                             named_inputs=inputs,
                             named_outputs=outputs)