def test_join_remove_right_with_case_columns_success():
    """Check the code generated when match-case is on and right keys are off.

    The join uses different key names on each side (``id``/``cod`` versus
    ``id2``/``cod2``) and custom ``_left``/``_right`` aliases. The generated
    source is compared structurally (by AST) with the expected snippet, so
    only the statements matter, not their layout.
    """
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}
    settings = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id2', 'cod2'],
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: False,
        JoinOperation.MATCH_CASE_PARAM: True,
        'aliases': '_left,_right',
    }

    operation = JoinOperation(settings, named_inputs=inputs,
                              named_outputs=outputs)
    generated = operation.generate_code()

    expected = dedent("""
        cols_to_remove = [c+'_right' for c in df2.columns if c in df1.columns]
        data1_tmp = df1[['id', 'cod']].applymap(lambda col: str(col).lower())
        data1_tmp.columns = [c+"_lower" for c in data1_tmp.columns]
        data1_tmp = pd.concat([df1, data1_tmp], axis=1, sort=False)
        data2_tmp = df2[['id2', 'cod2']].applymap(lambda col: str(col).lower())
        data2_tmp.columns = [c+"_lower" for c in data2_tmp.columns]
        data2_tmp = pd.concat([df2, data2_tmp], axis=1, sort=False)
        out = pd.merge(data1_tmp, data2_tmp, left_on=col1, right_on=col2,
            copy=False, suffixes=['_left', '_right'], how='inner')
        out.drop(col1+col2, axis=1, inplace=True)
        out.drop(cols_to_remove, axis=1, inplace=True)""")

    same_tree, details = compare_ast(ast.parse(generated),
                                     ast.parse(expected))
    assert same_tree, details + format_code_comparison(generated, expected)
def test_join_krk_param_success():
    """Inner join on ``name`` with ``keep_right_keys`` switched on.

    The output is expected to carry the key column from both sides, with
    every column tagged by the ``_l`` / ``_r`` suffix of the side it came
    from.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked', 'name'],
                            size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': 1,
            'match_case': '1',
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    expected.columns = ['name_l', 'homedest_l', 'embarked_r', 'name_r']
    assert outcome['out'].equals(expected)
def test_join_left_join_keep_columns_minimal_params_success():
    """Check the code generated for a left join that keeps the right keys.

    With ``keep_right_keys`` enabled the expected snippet is a single
    ``pd.merge`` call; it is compared with the generated code by AST.
    """
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}
    settings = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id', 'cod'],
        JoinOperation.JOIN_TYPE_PARAM: 'left',
        JoinOperation.KEEP_RIGHT_KEYS_PARAM: True,
        'aliases': '_left,_right',
    }

    operation = JoinOperation(settings, named_inputs=inputs,
                              named_outputs=outputs)
    generated = operation.generate_code()

    # The template is filled from the same dictionaries handed to the
    # operation, so the expectation follows the configured names.
    template = """
        {out} = pd.merge({in0}, {in1}, how='{type}',
            suffixes=['_left', '_right'],
            left_on=['id', 'cod'], right_on=['id', 'cod'])
        """
    expected = dedent(template.format(
        out=outputs['output data'],
        in0=inputs['input data 1'],
        in1=inputs['input data 2'],
        type=settings[JoinOperation.JOIN_TYPE_PARAM],
    ))

    same_tree, details = compare_ast(ast.parse(generated),
                                     ast.parse(expected))
    assert same_tree, details + format_code_comparison(generated, expected)
def test_join_match_case_param_success():
    """Inner join on ``name`` with the ``match_case`` parameter enabled.

    Match case converts the key columns to lower case, stores them in extra
    columns carrying a ``_lower`` suffix, joins on those, and finally drops
    them again, so the helper columns must not show up in the result.

    The right-hand key is not kept (``keep_right_keys`` is False), so the
    output is expected to contain only ``name_l``, ``embarked_l`` and
    ``homedest_r``.
    """
    df1 = util.titanic(['name', 'embarked'], size=10)
    df2 = util.titanic(['homedest', 'name'], size=10)
    test_df = util.titanic(['name', 'embarked', 'homedest'], size=10)

    arguments = {
        'parameters': {
            'keep_right_keys': False,
            # Must be truthy: with False the match-case code path that this
            # test is named after would never be generated or executed.
            'match_case': True,
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
        },
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2',
        },
        'named_outputs': {
            'output data': 'out',
        },
    }
    instance = JoinOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'df1': df1, 'df2': df2})

    test_df.columns = ['name_l', 'embarked_l', 'homedest_r']
    assert result['out'].equals(test_df)
def test_join_outer_replace_success():
    """Executing a join whose type is the literal ``'_outer'`` raises KeyError.

    The failure is expected when the generated code runs, not when the
    operation is constructed.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': True,
            'join_type': '_outer',
            'left_attributes': ['homedest'],
            'right_attributes': ['embarked'],
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(KeyError) as key_err:
        util.execute(operation.generate_code(),
                     {'df1': left, 'df2': right})

    # NOTE(review): an empty string is a substring of every string, so this
    # assertion cannot fail; effectively only the exception type is checked.
    assert '' in str(key_err.value)
def test_join_custom_suffixes_success():
    """Inner join on ``name`` using the custom aliases ``_esquerdo,_direito``.

    Every output column is expected to end with the alias of the side it
    came from; the right-hand key is not kept.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': True,
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
            'aliases': '_esquerdo,_direito',
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    expected.columns = [
        'name_esquerdo',
        'homedest_esquerdo',
        'embarked_direito',
    ]
    assert outcome['out'].equals(expected)
def test_join_missing_left_or_right_param_failure():
    """Constructing the operation with only one side's attributes fails.

    Both variants (only ``right_attributes`` and only ``left_attributes``)
    are expected to raise ``ValueError`` from the constructor.
    """
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}

    # Left attributes are missing.
    only_right = {'right_attributes': ['id', 'cod']}
    with pytest.raises(ValueError):
        JoinOperation(only_right, named_inputs=inputs,
                      named_outputs=outputs)

    # Right attributes are missing.
    only_left = {'left_attributes': ['id', 'cod']}
    with pytest.raises(ValueError):
        JoinOperation(only_left, named_inputs=inputs,
                      named_outputs=outputs)
def test_join_missing_output_implies_no_code_success():
    """With no named inputs wired in, ``generate_code()`` returns ``None``.

    Valid join attributes are supplied, so construction itself succeeds.
    """
    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': False,
            'join_type': 'inner',
            'left_attributes': ['homedest'],
            'right_attributes': ['embarked'],
        },
        named_inputs={},
        named_outputs={'output data': 'out'},
    )

    generated = operation.generate_code()
    assert generated is None
def test_join_merge_outer_parameter_success():
    """Outer join on ``homedest`` / ``embarked`` matches a hand-built merge.

    The reference frame is produced directly with ``pd.merge`` on copies of
    the inputs whose columns were suffixed with ``_l`` / ``_r`` beforehand,
    followed by dropping the right-hand key column.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    # Copies are taken before the generated code runs on the originals.
    ref_left = left.copy()
    ref_right = right.copy()

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': True,
            'join_type': 'outer',
            'left_attributes': ['homedest'],
            'right_attributes': ['embarked'],
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    ref_left.columns = [name + '_l' for name in ref_left.columns]
    ref_right.columns = [name + '_r' for name in ref_right.columns]
    left_keys = ['homedest' + '_l']
    right_keys = ['embarked' + '_r']

    merged = pd.merge(ref_left, ref_right, how='outer',
                      suffixes=['_l', '_r'],
                      left_on=left_keys, right_on=right_keys)
    expected = merged.drop(right_keys, axis=1)

    assert outcome['out'].equals(expected)
def test_join_success():
    """Run the operation on an iris slice and expect the slice back.

    NOTE(review): this passes empty parameters and a single ``'input data'``
    port, unlike the other join tests in this file; it looks like a template
    placeholder - confirm it is meant to pass for JoinOperation.
    """
    slice_size = 10
    frame_name = 'df'
    frame = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        slice_size)

    operation = JoinOperation(
        parameters={},
        named_inputs={'input data': frame_name},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {frame_name: frame})

    assert outcome['out'].equals(util.iris(size=slice_size))
def test_join_inner_join_minimal_with_remove_right_columns_success():
    """Check the code generated from the minimal parameter set.

    Only the join attributes and the aliases are given. The expected snippet
    is an inner merge followed by dropping the right-hand columns that also
    exist on the left; it is compared with the generated code by AST.
    """
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}
    settings = {
        'left_attributes': ['id', 'cod'],
        'right_attributes': ['id', 'cod'],
        'aliases': '_left,_right',
    }

    operation = JoinOperation(settings, named_inputs=inputs,
                              named_outputs=outputs)
    generated = operation.generate_code()

    expected = dedent("""
        cols_to_remove = [c+'_right' for c in df2.columns if c in df1.columns]
        out = pd.merge(df1, df2, how='inner', suffixes=['_left', '_right'],
            left_on=['id', 'cod'], right_on=['id', 'cod'])
        out.drop(cols_to_remove, axis=1, inplace=True)""")

    same_tree, details = compare_ast(ast.parse(generated),
                                     ast.parse(expected))
    assert same_tree, details + format_code_comparison(generated, expected)
def test_join_invalid_right_attributes_param_fail():
    """A plain string given as ``right_attributes`` fails at execution time.

    Running the generated code is expected to raise ``NameError`` whose
    message mentions ``invalid``.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': False,
            'join_type': 'inner',
            'left_attributes': ['homedest'],
            'right_attributes': 'invalid',
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(NameError) as nam_err:
        util.execute(operation.generate_code(),
                     {'df1': left, 'df2': right})

    message = str(nam_err.value)
    assert "invalid" in message
def test_join_missing_attributes_param_fail():
    """Omitting both attribute lists makes the constructor raise ValueError.

    The error message is expected to name both missing parameters.
    """
    settings = {
        'keep_right_keys': False,
        'match_case': False,
        'join_type': 'inner',
    }
    inputs = {'input data 1': 'df1', 'input data 2': 'df2'}
    outputs = {'output data': 'out'}

    with pytest.raises(ValueError) as val_err:
        JoinOperation(parameters=settings, named_inputs=inputs,
                      named_outputs=outputs)

    expected_fragment = ("Parameters 'left_attributes' and 'right_attributes'"
                         " must be informed for task")
    assert expected_fragment in str(val_err.value)