def test_feature_assembler_invalid_dtype_input_fail():
    """Feature assembler must raise ValueError when a selected column
    holds non-numeric (string) data."""
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    numeric_col = pd.DataFrame({
        'Feat': [[3.1, 0.2], [3.5, 0.3], [3.3, 0.4], [3.1, 0.2],
                 [3.6, 0.2], [3.9, 0.4], [3.4, 0.3], [3.4, 0.2],
                 [2.9, 0.2], [3.1, 0.1]]
    })
    string_col = pd.DataFrame({
        'Feat2': ['3.1', '0.3', '0.4', '0.2', '3.6',
                  '3.9', '0.3', '0.2', '2.9', '0.1']
    })
    df = pd.concat([df, numeric_col, string_col], axis=1, join='inner')

    arguments = {
        'parameters': {
            'attributes': ['sepalwidth', 'Feat', 'Feat2'],
            'multiplicity': {'input data': 0}
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = FeatureAssemblerOperation(**arguments)
    with pytest.raises(ValueError) as val_err:
        util.execute(instance.generate_code(), {'df': df})
    assert "Input 'df' must contain numeric values only for task" in str(
        val_err.value)
def test_execute_python_dangerous_zfill_method_success():
    """User code may call zfill(); huge widths could crash the worker."""
    # The zfill() can cause a crash
    user_code = dedent("""
    str_ing = ''
    str_ing = str_ing.zfill(100)
    # Example on how it can crash/overflow
    # str_ing = str_ing.zfill(10000000000)
    print(str_ing)
    """)
    arguments = {
        'parameters': {'code': user_code, 'task': {'id': 0}},
        'named_inputs': {
            'input data 1': None,
            'input data 2': None
        },
        'named_outputs': {
            'output data 1': 'out1',
            'output data 2': 'out2'
        }
    }
    events = []
    instance = ExecutePythonOperation(**arguments)
    util.execute(instance.generate_code(),
                 {'task_futures': {'items': TestingBypass},
                  'emit_event': _emit_event(events)})
    # print() inside the sandbox is captured and emitted as a message
    assert events[0]['message'] == ''.zfill(100) + '\n'
def test_execute_python_big_or_infinite_loops_success():
    """
    The user can create big or infinite loops
    Uncomment the code in dedent() method to test
    """
    user_code = dedent("""
    # Example 1:
    # for i in range(100000000000000000):
    #     pass
    # Example 2:
    # while True:
    #     pass
    """)
    arguments = {
        'parameters': {'code': user_code, 'task': {'id': 0}},
        'named_inputs': {
            'input data 1': None,
            'input data 2': None
        },
        'named_outputs': {
            'output data 1': 'out1',
            'output data 2': 'out2'
        }
    }
    events = []
    instance = ExecutePythonOperation(**arguments)
    util.execute(instance.generate_code(),
                 {'task_futures': {'items': TestingBypass},
                  'emit_event': _emit_event(events)})
def test_execute_python_prohibited_python_keywords_fail():
    """
    'class', 'nonlocal', 'import', 'from' and 'as' are prohibited
    """
    user_code = dedent("""
    from math import inf
    class FailClass:
        def failfunc():
            nonlocal x
            x = 10
    """)
    arguments = {
        'parameters': {'code': user_code, 'task': {'id': 0}},
        'named_inputs': {
            'input data 1': None,
            'input data 2': None
        },
        'named_outputs': {
            'output data 1': 'out1',
            'output data 2': 'out2'
        }
    }
    events = []
    instance = ExecutePythonOperation(**arguments)
    with pytest.raises(SyntaxError) as syn_err:
        util.execute(instance.generate_code(),
                     {'task_futures': {'items': TestingBypass},
                      'emit_event': _emit_event(events)})
    assert "Nonlocal statements are not allowed." in str(syn_err.value)
def test_join_outer_replace_success():
    """
    This only happens when you pass '_outer'
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    arguments = {
        'parameters': {
            'keep_right_keys': False,
            'match_case': True,
            'join_type': '_outer',
            'left_attributes': ['homedest'],
            'right_attributes': ['embarked']
        },
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = JoinOperation(**arguments)
    with pytest.raises(KeyError) as key_err:
        util.execute(instance.generate_code(), {'df1': left, 'df2': right})
    assert '' in str(key_err.value)
def test_execute_python_prohibited_data_types_fail():
    """
    'byte_array' and 'memory_view' are prohibited
    """
    user_code = dedent("""
    byte_array = bytearray(5)
    memory_view = memoryview(bytes(5))
    """)
    arguments = {
        'parameters': {'code': user_code, 'task': {'id': 0}},
        'named_inputs': {
            'input data 1': None,
            'input data 2': None
        },
        'named_outputs': {
            'output data 1': 'out1',
            'output data 2': 'out2'
        }
    }
    events = []
    instance = ExecutePythonOperation(**arguments)
    with pytest.raises(ValueError) as val_err:
        util.execute(instance.generate_code(),
                     {'task_futures': {'items': TestingBypass},
                      'emit_event': _emit_event(events)})
    assert "name 'bytearray' is not defined." \
           " Many Python commands are not available in Lemonade" in str(
        val_err.value)
def test_aggregation_asterisk_success():
    """Aggregation accepts '*' as attribute, meaning count over the group."""
    df = util.iris(['class'], size=150)
    expected = df.copy()
    arguments = {
        'parameters': {
            'attributes': ['class'],
            'function': [{
                'attribute': '*',
                'f': 'count',
                'alias': 'class_count'
            }]
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = AggregationOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    expected = expected.groupby(['class']).agg(
        class_count=('class', 'count')).reset_index()
    assert result['out'].equals(expected)
def test_add_columns_same_aliases_param_values_fail():
    """
    (?) Passing the same aliases to the attributes is allowed
    """
    left_df = util.iris(['sepallength', 'sepalwidth'], size=10)
    right_df = util.iris(['sepallength', 'sepalwidth'], size=10)
    expected = util.iris(
        ['sepallength', 'sepalwidth', 'sepallength', 'sepalwidth'], size=10)
    arguments = {
        'parameters': {'aliases': '_col,_col'},
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = AddColumnsOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'df1': left_df, 'df2': right_df})
    # Both sides receive the same '_col' suffix, producing duplicate names
    expected.columns = ['sepallength_col', 'sepalwidth_col',
                        'sepallength_col', 'sepalwidth_col']
    assert result['out'].equals(expected)
def test_aggregation_non_numeric_attributes_success():
    """Non-numeric columns support every aggregation except average."""
    df = util.titanic(['homedest'], size=150)
    expected = df.copy()
    arguments = {
        'parameters': {
            'attributes': ['homedest'],
            # 'avg' is dropped: it is meaningless for string data
            'function': return_funcs('homedest', drop='avg')
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = AggregationOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    expected = expected.groupby(['homedest']).agg(
        home_collect_list=('homedest', _collect_list),
        home_collect_set=('homedest', _collect_set),
        home_count=('homedest', 'count'),
        home_first=('homedest', 'first'),
        home_last=('homedest', 'last'),
        home_max=('homedest', 'max'),
        home_min=('homedest', 'min'),
        home_sum=('homedest', 'sum'),
        home_size=('homedest', 'size')).reset_index()
    assert result['out'].equals(expected)
def test_join_match_case_param_success():
    """
    Match case converts a column to lower then,
    it adds a _lower to the column name and finally
    it drops the column. (Seems redundant...)
    """
    left = util.titanic(['name', 'embarked'], size=10)
    right = util.titanic(['homedest', 'name'], size=10)
    expected = util.titanic(['name', 'embarked', 'homedest'], size=10)
    arguments = {
        'parameters': {
            'keep_right_keys': False,
            'match_case': False,
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name']
        },
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = JoinOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'df1': left, 'df2': right})
    expected.columns = ['name_l', 'embarked_l', 'homedest_r']
    assert result['out'].equals(expected)
def test_join_krk_param_success():
    """With keep_right_keys enabled, the right key column is preserved."""
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked', 'name'], size=10)
    arguments = {
        'parameters': {
            'keep_right_keys': 1,
            'match_case': '1',
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name']
        },
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = JoinOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'df1': left, 'df2': right})
    expected.columns = ['name_l', 'homedest_l', 'embarked_r', 'name_r']
    assert result['out'].equals(expected)
def test_join_custom_suffixes_success():
    """The 'aliases' parameter customizes the left/right column suffixes."""
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked'], size=10)
    arguments = {
        'parameters': {
            'keep_right_keys': False,
            'match_case': True,
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
            'aliases': '_esquerdo,_direito'
        },
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = JoinOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'df1': left, 'df2': right})
    expected.columns = [
        'name_esquerdo', 'homedest_esquerdo', 'embarked_direito'
    ]
    assert result['out'].equals(expected)
def test_aggregation_multiple_functions_success():
    """All supported aggregation functions can be applied at once."""
    df = util.iris(['class', 'sepalwidth'], size=150)
    expected = df.copy()
    arguments = {
        'parameters': {
            'attributes': ['class'],
            'function': return_funcs('sepalwidth')
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = AggregationOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    expected = expected.groupby('class').agg(
        sepal_avg=('sepalwidth', 'mean'),
        sepal_collect_list=('sepalwidth', _collect_list),
        sepal_collect_set=('sepalwidth', _collect_set),
        sepal_count=('sepalwidth', 'count'),
        sepal_first=('sepalwidth', 'first'),
        sepal_last=('sepalwidth', 'last'),
        sepal_max=('sepalwidth', 'max'),
        sepal_min=('sepalwidth', 'min'),
        sepal_sum=('sepalwidth', 'sum'),
        sepal_size=('sepalwidth', 'size')).reset_index()
    assert result['out'].equals(expected)
def test_max_abs_scaler_success():
    """MaxAbs scaling produces a combined 'scaled_1' column."""
    df = util.iris(['sepalwidth', 'petalwidth'], size=10)
    original = df.copy()
    arguments = {
        'parameters': {
            'attribute': ['sepalwidth', 'petalwidth'],
            'multiplicity': {'input data': 0}
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = MaxAbsScalerOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    expected = pd.DataFrame(
        {'scaled_1': scaler(df, ['sepalwidth', 'petalwidth'])})
    # Sanity-check the fixture maxima used by the scaler
    assert original.max()['sepalwidth'] == 3.9
    assert original.max()['petalwidth'] == 0.4
    assert result['out'].loc[:, 'scaled_1'].equals(expected.loc[:, 'scaled_1'])
def test_union_uneven_dataframe_sizes_success():
    """Union of frames with different row counts keeps every row."""
    small = util.iris(['sepallength', 'sepalwidth', ], size=5)
    large = util.iris(['petalwidth', 'petallength'], size=10)
    small_copy = small.copy()
    large_copy = large.copy()
    arguments = {
        'parameters': {},
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2',
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = UnionOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'df1': small, 'df2': large})
    assert len(result['out']) == 15
    expected = pd.concat([small_copy, large_copy],
                         sort=False, axis=0, ignore_index=True)
    assert result['out'].equals(expected)
def test_split_seed_param_success():
    """
    Seeds higher than the integer limit and lower than zero will be set to 0
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petallength', 'petalwidth'],
                   size=10)
    expected = df.copy()
    # Reproduce the shuffle order produced by seed 0
    expected.index = [8, 4, 0, 7, 2, 9, 5, 6, 1, 3]
    expected.sort_index(axis=0, inplace=True)
    expected.index = [2, 8, 4, 9, 1, 6, 7, 3, 0, 5]
    arguments = {
        'parameters': {'seed': -1},
        'named_inputs': {'input data': 'df'},
        'named_outputs': {
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1'
        }
    }
    instance = SplitOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    assert len(result['split_1_task_1']) == 5
    assert len(result['split_2_task_1']) == 5
    assert expected.iloc[:5, :].equals(result['split_1_task_1'])
    assert expected.iloc[5:10, :].equals(result['split_2_task_1'])
def test_execute_python_pandas_success():
    """the user can use pretty much every method from pandas,
    this may cause problems because of the quantity of methods
    and future methods that will be added"""
    df1 = util.iris(['class', 'petalwidth'], size=10)
    df2 = util.iris(['class'], size=10)
    original = df1.copy()
    user_code = dedent("""
    out1 = in1.drop(columns=['class'])
    """)
    arguments = {
        'parameters': {'code': user_code, 'task': {'id': 0}},
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {
            'output data 1': 'out1',
            'output data 2': 'out2'
        }
    }
    events = []
    instance = ExecutePythonOperation(**arguments)
    result = util.execute(instance.generate_code(),
                          {'task_futures': {'items': TestingBypass},
                           'df1': df1, 'df2': df2,
                           'emit_event': _emit_event(events)})
    assert result['out1'].equals(original.drop(columns=['class']))
def test_difference_big_variation_success():
    """Difference works when one side mixes several value types.

    Fix: `np.float` and `np.bool` were removed in NumPy 1.24 and `np.NaN`
    in NumPy 2.0; use the builtins `float`/`bool` and `np.nan`, which are
    what those aliases resolved to.
    """
    df1 = util.iris(['petalwidth'], size=40)
    df2 = util.iris(['petalwidth'], size=10)
    test_df = df1.copy()
    # Inject heterogeneous values so rows no longer match df2
    df1.loc[4, 'petalwidth'] = np.int64(50)
    df1.loc[5, 'petalwidth'] = pd.Timestamp(1596509236)
    df1.loc[6, 'petalwidth'] = float(1.56)
    df1.loc[7, 'petalwidth'] = np.array('test')
    df1.loc[8, 'petalwidth'] = bool(False)
    df1.loc[10, 'petalwidth'] = np.nan
    arguments = {
        'parameters': {},
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = DifferenceOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df1': df1, 'df2': df2})
    # Build the expected frame: drop rows of df1 equal to rows of df2
    diff_oper = df1.eq(df2)
    for i in range(40):
        if diff_oper.iloc[i, 0:].all():
            test_df.drop(i, inplace=True)
    assert result['out'].eq(test_df).equals(test_df.eq(result['out']))
    assert len(result['out']) == 35
def test_aggregation_pivot_table_success():
    """The 'pivot' parameter makes the aggregation produce a pivot table."""
    df = util.iris(['class', 'sepalwidth', 'petalwidth'], size=150)
    expected = df.copy()
    arguments = {
        'parameters': {
            'attributes': ['petalwidth'],
            'function': [{
                'attribute': 'petalwidth',
                'f': 'count'
            }],
            'pivot': ['class'],
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = AggregationOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    aggfunc = {"petalwidth": ['count']}
    expected = pd.pivot_table(expected, index=['petalwidth'],
                              columns=['class'], aggfunc=aggfunc)
    expected.reset_index(inplace=True)
    # Flatten the MultiIndex columns the same way the operation does
    flat_cols = [
        col[0] if col[1] == '' else "%s_%s_%s" % (col[0], col[1], col[2])
        for col in expected.columns
    ]
    expected.columns = flat_cols
    assert result['out'].equals(expected)
def test_clean_missing_multiple_attributes_success():
    """REMOVE_ROW drops every row that has a missing value in any of the
    selected attributes.

    Fix: `np.NaN` was removed in NumPy 2.0; use the canonical `np.nan`.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # Spread NaNs over rows 0-7, one pair of rows per column
    df.loc[0:1, 'sepallength'] = np.nan
    df.loc[2:3, 'sepalwidth'] = np.nan
    df.loc[4:5, 'petalwidth'] = np.nan
    df.loc[6:7, 'petallength'] = np.nan
    arguments = {
        'parameters': {
            'attributes': ['sepallength', 'sepalwidth',
                           'petalwidth', 'petallength'],
            'cleaning_mode': 'REMOVE_ROW'
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # Rows 0-7 contain NaNs and must be removed; rows 8-9 survive
    assert result['output_data_1'].equals(
        util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                  size=10).drop(index=[i for i in range(8)]))
def test_clean_missing_ratio_control_success():
    """
    Needs a better assertion...
    Ratio method is confusing.

    Fix: `np.NaN` was removed in NumPy 2.0; use the canonical `np.nan`.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    df.loc[:, ['sepallength', 'sepalwidth']] = np.nan
    df.loc[0, 'petalwidth'] = np.nan
    # Expected frame mirrors the input before the operation runs
    test = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'], size=10)
    test.loc[:, ['sepallength', 'sepalwidth']] = np.nan
    test.loc[0, 'petalwidth'] = np.nan
    arguments = {
        'parameters': {
            'attributes': ['sepallength', 'sepalwidth',
                           'petalwidth', 'petallength'],
            'min_missing_ratio': 0.025,
            'max_missing_ratio': 0.1,
            'cleaning_mode': 'REMOVE_COLUMN'
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # Only 'petalwidth' falls inside the missing-ratio window and is dropped
    assert result['output_data_1'].equals(test.drop(columns=['petalwidth']))
def test_sample_or_partition_seed_success():
    """
    seeds 4294967296 or higher (integer limit) will be set to 0
    seeds lower than 0 will be set to 0
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    original = df.copy()
    arguments = {
        'parameters': {
            'type': 'value',
            'seed': 4294967296,
            'value': 10
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'sampled data': 'out'}
    }
    instance = SampleOrPartitionOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # Out-of-range seed is clamped to 0
    expected = original.sample(n=10, random_state=0)
    assert result['out'].equals(expected)
def test_feature_assembler_alias_param_success():
    """The 'alias' parameter names the assembled feature column."""
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    original = df.copy()
    arguments = {
        'parameters': {
            'attributes': ['sepalwidth', 'petalwidth'],
            'multiplicity': {'input data': 0},
            'alias': 'Feat'
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = FeatureAssemblerOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    assembled = pd.DataFrame({
        'Feat': [[3.5, 0.2], [3.0, 0.2], [3.2, 0.2], [3.1, 0.2],
                 [3.6, 0.2], [3.9, 0.4], [3.4, 0.3], [3.4, 0.2],
                 [2.9, 0.2], [3.1, 0.1]]
    })
    expected = pd.concat([original, assembled], axis=1)
    assert result['out'].equals(expected)
def test_select_fail_invalid_named_inputs():
    """Passing a plain list instead of a DataFrame raises TypeError."""
    arguments = {
        'parameters': {'attributes': ['sepallength']},
        'named_inputs': {'input data': 'error'},
        'named_outputs': {'output projected data': 'out'}
    }
    instance = SelectOperation(**arguments)
    with pytest.raises(TypeError) as typ_err:
        util.execute(instance.generate_code(), {'error': []})
    assert 'list indices must be integers or slices, not list' in str(
        typ_err.value)
def test_drop_invalid_attribute_param_fail():
    """Dropping a column that does not exist raises KeyError."""
    df = util.iris(size=10)
    arguments = {
        'parameters': {'attributes': ['invalid']},
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = DropOperation(**arguments)
    with pytest.raises(KeyError) as key_err:
        util.execute(instance.generate_code(), {'df': df})
    assert "['invalid'] not found in axis" in str(key_err.value)
def test_execute_sql_column_not_found_fail():
    """Selecting an unknown column surfaces the sqlite error."""
    df1 = util.iris(['class', 'sepalwidth'], size=10)
    arguments = {
        'parameters': {'query': 'SELECT unknown FROM ds1'},
        'named_inputs': {'input data 1': 'df1'},
        'named_outputs': {'output data': 'out'}
    }
    instance = ExecuteSQLOperation(**arguments)
    with pytest.raises(pandasql.PandaSQLException) as psql_err:
        util.execute(instance.generate_code(), {'df1': df1})
    assert "(sqlite3.OperationalError) no such column: unknown" in str(
        psql_err.value)
def test_feature_assembler_missing_multiplicity_param_fail():
    """Omitting the 'multiplicity' parameter raises KeyError."""
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    arguments = {
        'parameters': {'attributes': ['sepalwidth', 'petalwidth']},
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = FeatureAssemblerOperation(**arguments)
    with pytest.raises(KeyError) as key_err:
        util.execute(instance.generate_code(), {'df': df})
    assert "'multiplicity'" in str(key_err.value)
def test_balanced_split_k_fold_shuffle_stratified_success():
    """
    Same as balanced_stratified, shuffle doesn't make a
    difference if stratified
    """
    df = util.iris(['class'], size=30)
    # Force three balanced classes of 10 rows each
    df.loc[10:20, 'class'] = 'Iris-versicolor'
    df.loc[20:30, 'class'] = 'Iris-virginica'
    original = df.copy()
    arguments = {
        'parameters': {
            'n_splits': 3,
            'shuffle': 1,
            'attribute': 'groups',
            'stratified': 1,
            'random_state': 0,
            'column': ['class']
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = SplitKFoldOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    fold_assignment = {
        "groups": [
            0, 2, 1, 2, 0, 1, 2, 0, 0, 1, 1, 1, 0, 0, 2,
            2, 0, 1, 2, 1, 0, 1, 2, 1, 1, 0, 0, 2, 2, 2
        ]
    }
    expected = pd.concat([original, pd.DataFrame(fold_assignment)], axis=1)
    count_percent = percent(result['out'], 3)
    # Each class is spread close to evenly across the three folds
    assert [
        count_percent['group0']['Iris-setosa'],
        count_percent['group1']['Iris-setosa'],
        count_percent['group2']['Iris-setosa']
    ] == [40, 30, 30]
    assert [
        count_percent['group0']['Iris-versicolor'],
        count_percent['group1']['Iris-versicolor'],
        count_percent['group2']['Iris-versicolor']
    ] == [30, 40, 30]
    assert [
        count_percent['group0']['Iris-virginica'],
        count_percent['group1']['Iris-virginica'],
        count_percent['group2']['Iris-virginica']
    ] == [30, 30, 40]
    assert result['out'].equals(expected)
def test_add_columns_invalid_aliases_param_value_fail():
    """An aliases string without a comma yields an IndexError."""
    left_df = util.iris(['sepallength', 'sepalwidth'], size=10)
    right_df = util.iris(['sepallength', 'sepalwidth'], size=10)
    arguments = {
        'parameters': {'aliases': 'invalid'},
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {'output data': 'out'}
    }
    instance = AddColumnsOperation(**arguments)
    with pytest.raises(IndexError) as idx_err:
        util.execute(instance.generate_code(),
                     {'df1': left_df, 'df2': right_df})
    assert 'list index out of range' in str(idx_err.value)
def test_aggregation_non_numeric_attributes_fail():
    """Averaging a non-numeric column must raise DataError.

    Fix: reference the exception via its public location
    ``pd.errors.DataError`` (available since pandas 1.3) instead of the
    private ``pd.core.base.DataError`` path, which newer pandas removed.
    """
    df = util.titanic(['homedest'], size=150)
    arguments = {
        'parameters': {
            'attributes': ['homedest'],
            # includes 'avg', which is invalid for string data
            'function': return_funcs('homedest')
        },
        'named_inputs': {'input data': 'df'},
        'named_outputs': {'output data': 'out'}
    }
    instance = AggregationOperation(**arguments)
    with pytest.raises(pd.errors.DataError) as data_err:
        util.execute(instance.generate_code(), {'df': df})
    assert "No numeric types to aggregate" in str(data_err.value)