def test_feature_assembler_invalid_dtype_input_fail():
    """Assembling list- and string-typed columns must raise ``ValueError``.

    The iris sample is widened with a list-valued column ('Feat') and a
    string-valued column ('Feat2'); both are then requested as attributes
    next to a numeric one.
    """
    iris = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                     size=10)
    list_column = pd.DataFrame({
        'Feat': [[3.1, 0.2], [3.5, 0.3], [3.3, 0.4], [3.1, 0.2], [3.6, 0.2],
                 [3.9, 0.4], [3.4, 0.3], [3.4, 0.2], [2.9, 0.2], [3.1, 0.1]]
    })
    string_column = pd.DataFrame({
        'Feat2': ['3.1', '0.3', '0.4', '0.2', '3.6',
                  '3.9', '0.3', '0.2', '2.9', '0.1']
    })
    frame = pd.concat([iris, list_column, string_column],
                      axis=1, join='inner')

    operation = FeatureAssemblerOperation(
        parameters={
            'attributes': ['sepalwidth', 'Feat', 'Feat2'],
            'multiplicity': {'input data': 0},
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(ValueError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df': frame})

    message = str(raised.value)
    assert "Input 'df' must contain numeric values only for task" in message
Esempio n. 2
0
def test_execute_python_dangerous_zfill_method_success():
    """Run a user snippet that pads a string with ``zfill`` and prints it.

    zfill() with a huge width can cause a crash; that variant is kept
    commented out inside the submitted snippet.
    """
    user_code = dedent("""
        str_ing = ''
        str_ing = str_ing.zfill(100)

        # Example on how it can crash/overflow
        # str_ing = str_ing.zfill(10000000000)

        print(str_ing)
        """)

    events = []
    operation = ExecutePythonOperation(
        parameters={'code': user_code, 'task': {'id': 0}},
        named_inputs={'input data 1': None, 'input data 2': None},
        named_outputs={'output data 1': 'out1', 'output data 2': 'out2'},
    )
    generated = operation.generate_code()
    context = {'task_futures': {'items': TestingBypass},
               'emit_event': _emit_event(events)}
    util.execute(generated, context)

    # The first emitted event carries the printed, zero-padded string.
    first_message = events[0]['message']
    assert first_message == ''.zfill(100) + '\n'
Esempio n. 3
0
def test_execute_python_big_or_infinite_loops_success():
    """
    The user can create big or infinite loops.

    The loops are commented out in the snippet below; uncomment them inside
    the dedent() call to try it. As written, the test only checks that the
    snippet runs without raising.
    """
    user_code = dedent("""
        # Example 1:
        # for i in range(100000000000000000):
        #     pass
        # Example 2:
        # while True:
        #     pass
        """)

    events = []
    operation = ExecutePythonOperation(
        parameters={'code': user_code, 'task': {'id': 0}},
        named_inputs={'input data 1': None, 'input data 2': None},
        named_outputs={'output data 1': 'out1', 'output data 2': 'out2'},
    )
    generated = operation.generate_code()
    context = {'task_futures': {'items': TestingBypass},
               'emit_event': _emit_event(events)}
    util.execute(generated, context)
Esempio n. 4
0
def test_execute_python_prohibited_python_keywords_fail():
    """
    'class', 'nonlocal', 'import', 'from' and 'as' are prohibited.

    Only the error message about the nonlocal statement is asserted.
    """
    user_code = dedent("""
        from math import inf
        class FailClass:
            def failfunc():
                nonlocal x
                x = 10
        """)

    events = []
    operation = ExecutePythonOperation(
        parameters={'code': user_code, 'task': {'id': 0}},
        named_inputs={'input data 1': None, 'input data 2': None},
        named_outputs={'output data 1': 'out1', 'output data 2': 'out2'},
    )

    with pytest.raises(SyntaxError) as raised:
        generated = operation.generate_code()
        util.execute(generated,
                     {'task_futures': {'items': TestingBypass},
                      'emit_event': _emit_event(events)})

    message = str(raised.value)
    assert "Nonlocal statements are not allowed." in message
Esempio n. 5
0
def test_join_outer_replace_success():
    """
    Joining with the '_outer' join type raises ``KeyError``.

    This only happens when you pass '_outer'.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': True,
            'join_type': '_outer',
            'left_attributes': ['homedest'],
            'right_attributes': ['embarked'],
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(KeyError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df1': left, 'df2': right})

    # NOTE(review): '' is a substring of every string, so this assertion
    # cannot fail; effectively only the exception type is verified.
    message = str(raised.value)
    assert '' in message
Esempio n. 6
0
def test_execute_python_prohibited_data_types_fail():
    """
    'byte_array' and 'memory_view' are prohibited.

    The snippet builds a bytearray and a memoryview; the asserted message
    is the one reporting that 'bytearray' is not defined.
    """
    user_code = dedent("""
        byte_array = bytearray(5)
        memory_view = memoryview(bytes(5))
        """)

    events = []
    operation = ExecutePythonOperation(
        parameters={'code': user_code, 'task': {'id': 0}},
        named_inputs={'input data 1': None, 'input data 2': None},
        named_outputs={'output data 1': 'out1', 'output data 2': 'out2'},
    )

    with pytest.raises(ValueError) as raised:
        generated = operation.generate_code()
        util.execute(generated,
                     {'task_futures': {'items': TestingBypass},
                      'emit_event': _emit_event(events)})

    expected_message = ("name 'bytearray' is not defined."
                        " Many Python commands are not available in Lemonade")
    assert expected_message in str(raised.value)
Esempio n. 7
0
def test_aggregation_asterisk_success():
    """Counting with the '*' attribute equals a per-class pandas count."""
    iris = util.iris(['class'], size=150)
    reference = iris.copy()

    count_spec = {
        'attribute': '*',
        'f': 'count',
        'alias': 'class_count',
    }
    operation = AggregationOperation(
        parameters={'attributes': ['class'], 'function': [count_spec]},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    # Expected frame: one row per class with the number of rows in it.
    grouped = reference.groupby(['class'])
    expected = grouped.agg(class_count=('class', 'count')).reset_index()
    assert outcome['out'].equals(expected)
Esempio n. 8
0
def test_add_columns_same_aliases_param_values_fail():
    """
    Passing the same alias for both inputs is allowed (?).

    Despite the ``_fail`` suffix in the name, no error is expected: the
    output is compared against a frame whose column names are duplicated.
    """
    left = util.iris(['sepallength', 'sepalwidth'], size=10)
    right = util.iris(['sepallength', 'sepalwidth'], size=10)
    expected = util.iris(
        ['sepallength', 'sepalwidth', 'sepallength', 'sepalwidth'], size=10)

    operation = AddColumnsOperation(
        parameters={'aliases': '_col,_col'},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    # Both sides receive the same suffix, hence the repeated names.
    expected.columns = [
        'sepallength_col',
        'sepalwidth_col',
        'sepallength_col',
        'sepalwidth_col',
    ]
    assert outcome['out'].equals(expected)
Esempio n. 9
0
def test_aggregation_non_numeric_attributes_success():
    """Aggregate a text column with every function except 'avg'.

    The function list comes from ``return_funcs`` with ``drop='avg'``; the
    expected frame is built with the equivalent pandas named aggregations.
    """
    titanic = util.titanic(['homedest'], size=150)
    reference = titanic.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['homedest'],
            'function': return_funcs('homedest', drop='avg'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': titanic})

    named_aggregations = dict(
        home_collect_list=('homedest', _collect_list),
        home_collect_set=('homedest', _collect_set),
        home_count=('homedest', 'count'),
        home_first=('homedest', 'first'),
        home_last=('homedest', 'last'),
        home_max=('homedest', 'max'),
        home_min=('homedest', 'min'),
        home_sum=('homedest', 'sum'),
        home_size=('homedest', 'size'),
    )
    grouped = reference.groupby(['homedest'])
    expected = grouped.agg(**named_aggregations).reset_index()
    assert outcome['out'].equals(expected)
Esempio n. 10
0
def test_join_match_case_param_success():
    """
    Inner join on 'name' with ``match_case`` switched off.

    NOTE(review): per the original author's remark, match case lower-cases
    the key column, appends '_lower' to its name and finally drops that
    column (which seems redundant) -- not verifiable from this test alone.
    """
    left = util.titanic(['name', 'embarked'], size=10)
    right = util.titanic(['homedest', 'name'], size=10)
    expected = util.titanic(['name', 'embarked', 'homedest'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': False,
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    # Columns are expected to carry the '_l' / '_r' side suffixes.
    expected.columns = ['name_l', 'embarked_l', 'homedest_r']
    assert outcome['out'].equals(expected)
Esempio n. 11
0
def test_join_krk_param_success():
    """Inner join keeping the right-hand key column in the output.

    ``keep_right_keys`` is given as the integer 1 and ``match_case`` as the
    string '1'; the right 'name' column is expected to survive as 'name_r'.
    """
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked', 'name'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': 1,
            'match_case': '1',
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    expected.columns = ['name_l', 'homedest_l', 'embarked_r', 'name_r']
    assert outcome['out'].equals(expected)
Esempio n. 12
0
def test_join_custom_suffixes_success():
    """Inner join whose column suffixes come from the 'aliases' parameter."""
    left = util.titanic(['name', 'homedest'], size=10)
    right = util.titanic(['embarked', 'name'], size=10)
    expected = util.titanic(['name', 'homedest', 'embarked'], size=10)

    operation = JoinOperation(
        parameters={
            'keep_right_keys': False,
            'match_case': True,
            'join_type': 'inner',
            'left_attributes': ['name'],
            'right_attributes': ['name'],
            'aliases': '_esquerdo,_direito',
        },
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': left, 'df2': right})

    # First alias suffixes the left columns, second one the right columns.
    expected.columns = ['name_esquerdo', 'homedest_esquerdo',
                        'embarked_direito']
    assert outcome['out'].equals(expected)
Esempio n. 13
0
def test_aggregation_multiple_functions_success():
    """Apply every function from ``return_funcs`` to 'sepalwidth' per class.

    The expected frame is built with the equivalent pandas named
    aggregations over the same grouping.
    """
    iris = util.iris(['class', 'sepalwidth'], size=150)
    reference = iris.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['class'],
            'function': return_funcs('sepalwidth'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    named_aggregations = dict(
        sepal_avg=('sepalwidth', 'mean'),
        sepal_collect_list=('sepalwidth', _collect_list),
        sepal_collect_set=('sepalwidth', _collect_set),
        sepal_count=('sepalwidth', 'count'),
        sepal_first=('sepalwidth', 'first'),
        sepal_last=('sepalwidth', 'last'),
        sepal_max=('sepalwidth', 'max'),
        sepal_min=('sepalwidth', 'min'),
        sepal_sum=('sepalwidth', 'sum'),
        sepal_size=('sepalwidth', 'size'),
    )
    grouped = reference.groupby('class')
    expected = grouped.agg(**named_aggregations).reset_index()
    assert outcome['out'].equals(expected)
Esempio n. 14
0
def test_max_abs_scaler_success():
    """Scale two iris columns and compare 'scaled_1' with the reference.

    The reference values come from the ``scaler`` helper; the column maxima
    of the untouched input are asserted as a sanity check.
    """
    iris = util.iris(['sepalwidth', 'petalwidth'], size=10)
    untouched = iris.copy()

    operation = MaxAbsScalerOperation(
        parameters={
            'attribute': ['sepalwidth', 'petalwidth'],
            'multiplicity': {'input data': 0},
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    reference = pd.DataFrame(
        {'scaled_1': scaler(iris, ['sepalwidth', 'petalwidth'])})

    assert untouched.max()['sepalwidth'] == 3.9
    assert untouched.max()['petalwidth'] == 0.4

    produced = outcome['out'].loc[:, 'scaled_1']
    assert produced.equals(reference.loc[:, 'scaled_1'])
Esempio n. 15
0
def test_union_uneven_dataframe_sizes_success():
    """Union of a 5-row and a 10-row frame with different columns.

    The result must have 15 rows and equal a row-wise ``pd.concat`` of the
    two inputs with a fresh index.
    """
    upper = util.iris(['sepallength', 'sepalwidth'], size=5)
    lower = util.iris(['petalwidth', 'petallength'], size=10)
    upper_copy = upper.copy()
    lower_copy = lower.copy()

    operation = UnionOperation(
        parameters={},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(),
                           {'df1': upper, 'df2': lower})

    assert len(outcome['out']) == 15

    expected = pd.concat([upper_copy, lower_copy],
                         sort=False, axis=0, ignore_index=True)
    assert outcome['out'].equals(expected)
Esempio n. 16
0
def test_split_seed_param_success():
    """
    Split with a negative seed (-1) and check both halves.

    NOTE(review): per the original author, seeds above the integer limit
    and below zero are set to 0; the hard-coded index permutations below
    encode the expected shuffle.
    """
    iris = util.iris(['sepallength', 'sepalwidth',
                      'petallength', 'petalwidth'], size=10)

    # Reorder the rows via a first permutation, then relabel them with the
    # index values expected in the operation's output.
    expected = iris.copy()
    expected.index = [8, 4, 0, 7, 2, 9, 5, 6, 1, 3]
    expected = expected.sort_index(axis=0)
    expected.index = [2, 8, 4, 9, 1, 6, 7, 3, 0, 5]

    operation = SplitOperation(
        parameters={'seed': -1},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    first_half = outcome['split_1_task_1']
    second_half = outcome['split_2_task_1']
    assert len(first_half) == 5
    assert len(second_half) == 5
    assert expected.iloc[:5, :].equals(first_half)
    assert expected.iloc[5:10, :].equals(second_half)
Esempio n. 17
0
def test_execute_python_pandas_success():
    """Drop a column from the first input through user-supplied pandas code.

    The user can use pretty much every method from pandas; this may cause
    problems because of the quantity of methods and of future methods that
    will be added.
    """
    first = util.iris(['class', 'petalwidth'], size=10)
    second = util.iris(['class'], size=10)
    reference = first.copy()

    user_code = dedent("""
        out1 = in1.drop(columns=['class'])
        """)

    events = []
    operation = ExecutePythonOperation(
        parameters={'code': user_code, 'task': {'id': 0}},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data 1': 'out1', 'output data 2': 'out2'},
    )
    generated = operation.generate_code()
    context = {'task_futures': {'items': TestingBypass},
               'df1': first,
               'df2': second,
               'emit_event': _emit_event(events)}
    outcome = util.execute(generated, context)

    expected = reference.drop(columns=['class'])
    assert outcome['out1'].equals(expected)
Esempio n. 18
0
def test_difference_big_variation_success():
    """Difference of two frames when the left one holds mixed-type values.

    Several cells of the 40-row left frame are overwritten with values of
    assorted types (integer, timestamp, float, string array, boolean, NaN)
    before it is diffed against the 10-row right frame. 35 rows are
    expected to remain.
    """
    df1 = util.iris(['petalwidth'], size=40)
    df2 = util.iris(['petalwidth'], size=10)
    test_df = df1.copy()

    # Plain Python literals stand in for the ``np.float`` / ``np.bool``
    # aliases, and ``np.nan`` for ``np.NaN``: the old spellings were removed
    # from NumPy (1.24 and 2.0 respectively) and raise AttributeError there.
    # ``np.float`` / ``np.bool`` were aliases of the builtins, so the stored
    # values are unchanged.
    df1.loc[4, 'petalwidth'] = np.int64(50)
    df1.loc[5, 'petalwidth'] = pd.Timestamp(1596509236)
    df1.loc[6, 'petalwidth'] = 1.56
    df1.loc[7, 'petalwidth'] = np.array('test')
    df1.loc[8, 'petalwidth'] = False
    df1.loc[10, 'petalwidth'] = np.nan

    arguments = {
        'parameters': {},
        'named_inputs': {
            'input data 1': 'df1',
            'input data 2': 'df2'
        },
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = DifferenceOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df1': df1, 'df2': df2})

    # Rows of df1 that still equal the row with the same label in df2 are
    # the ones expected to be removed from the reference frame.
    diff_oper = df1.eq(df2)
    matching_rows = [i for i in range(40) if diff_oper.iloc[i, 0:].all()]
    test_df.drop(matching_rows, inplace=True)

    assert result['out'].eq(test_df).equals(test_df.eq(result['out']))
    assert len(result['out']) == 35
Esempio n. 19
0
def test_aggregation_pivot_table_success():
    """Count 'petalwidth' pivoted by 'class' and compare with pandas.

    The expected frame is a ``pd.pivot_table`` whose multi-level column
    labels are flattened into single underscore-joined names.
    """
    iris = util.iris(['class', 'sepalwidth', 'petalwidth'], size=150)
    reference = iris.copy()

    operation = AggregationOperation(
        parameters={
            'attributes': ['petalwidth'],
            'function': [{'attribute': 'petalwidth', 'f': 'count'}],
            'pivot': ['class'],
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    expected = pd.pivot_table(reference,
                              index=['petalwidth'],
                              columns=['class'],
                              aggfunc={"petalwidth": ['count']})
    expected = expected.reset_index()

    # Labels with an empty second level keep only their first level; all
    # others are joined from their three levels.
    flattened = []
    for label in expected.columns:
        if label[1] == '':
            flattened.append(label[0])
        else:
            flattened.append("%s_%s_%s" % (label[0], label[1], label[2]))
    expected.columns = flattened

    assert outcome['out'].equals(expected)
Esempio n. 20
0
def test_clean_missing_multiple_attributes_success():
    """REMOVE_ROW drops every row that has a NaN in any listed attribute.

    Rows 0-7 each receive a NaN in one of the four columns, so only rows
    8 and 9 of the original sample are expected to remain.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # ``np.nan`` instead of ``np.NaN``: the upper-case alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df.loc[0:1, 'sepallength'] = np.nan
    df.loc[2:3, 'sepalwidth'] = np.nan
    df.loc[4:5, 'petalwidth'] = np.nan
    df.loc[6:7, 'petallength'] = np.nan
    arguments = {
        'parameters': {
            'attributes':
            ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
            'cleaning_mode':
            'REMOVE_ROW'
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})

    expected = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        size=10).drop(index=list(range(8)))
    # NOTE(review): the result is read from 'output_data_1' although the
    # declared output name is 'out' -- confirm this is intended.
    assert result['output_data_1'].equals(expected)
Esempio n. 21
0
def test_clean_missing_ratio_control_success():
    """
    REMOVE_COLUMN restricted by a missing-value ratio window.

    Two columns are entirely NaN and 'petalwidth' has a single NaN among
    ten rows; with the window 0.025-0.1 only 'petalwidth' is expected to be
    removed.

    Needs a better assertion...
    Ratio method is confusing.
    """
    df = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                   size=10)
    # ``np.nan`` instead of ``np.NaN``: the upper-case alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df.loc[:, ['sepallength', 'sepalwidth']] = np.nan
    df.loc[0, 'petalwidth'] = np.nan
    # Snapshot of the prepared input, taken before the operation runs.
    test = df.copy()

    arguments = {
        'parameters': {
            'attributes':
            ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
            'min_missing_ratio':
            0.025,
            'max_missing_ratio':
            0.1,
            'cleaning_mode':
            'REMOVE_COLUMN'
        },
        'named_inputs': {
            'input data': 'df',
        },
        'named_outputs': {
            'output data': 'out'
        }
    }
    instance = CleanMissingOperation(**arguments)
    result = util.execute(instance.generate_code(), {'df': df})
    # NOTE(review): the result is read from 'output_data_1' although the
    # declared output name is 'out' -- confirm this is intended.
    assert result['output_data_1'].equals(test.drop(columns=['petalwidth']))
Esempio n. 22
0
def test_sample_or_partition_seed_success():
    """
    Sampling with seed 4294967296 must match a pandas sample with seed 0.

    NOTE(review): per the original author, seeds of 4294967296 or higher
    (integer limit) and seeds lower than 0 are set to 0.
    """
    iris = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                     size=10)
    reference = iris.copy()

    operation = SampleOrPartitionOperation(
        parameters={
            'type': 'value',
            'seed': 4294967296,
            'value': 10,
        },
        named_inputs={'input data': 'df'},
        named_outputs={'sampled data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    expected = reference.sample(n=10, random_state=0)
    assert outcome['out'].equals(expected)
Esempio n. 23
0
def test_feature_assembler_alias_param_success():
    """Assemble two columns into one list-valued column named by 'alias'.

    The output is expected to be the input plus a 'Feat' column holding
    [sepalwidth, petalwidth] pairs.
    """
    iris = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                     size=10)
    reference = iris.copy()

    operation = FeatureAssemblerOperation(
        parameters={
            'attributes': ['sepalwidth', 'petalwidth'],
            'multiplicity': {'input data': 0},
            'alias': 'Feat',
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    assembled = pd.DataFrame({
        'Feat': [[3.5, 0.2], [3.0, 0.2], [3.2, 0.2], [3.1, 0.2], [3.6, 0.2],
                 [3.9, 0.4], [3.4, 0.3], [3.4, 0.2], [2.9, 0.2], [3.1, 0.1]]
    })
    expected = pd.concat([reference, assembled], axis=1)
    assert outcome['out'].equals(expected)
Esempio n. 24
0
def test_select_fail_invalid_named_inputs():
    """Selecting attributes from a plain list input raises ``TypeError``."""
    operation = SelectOperation(
        parameters={'attributes': ['sepallength']},
        named_inputs={'input data': 'error'},
        named_outputs={'output projected data': 'out'},
    )

    # An empty list is bound to the input name instead of a data frame.
    with pytest.raises(TypeError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'error': []})

    message = str(raised.value)
    assert 'list indices must be integers or slices, not list' in message
Esempio n. 25
0
def test_drop_invalid_attribute_param_fail():
    """Dropping a column that does not exist raises ``KeyError``."""
    iris = util.iris(size=10)

    operation = DropOperation(
        parameters={'attributes': ['invalid']},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(KeyError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df': iris})

    message = str(raised.value)
    assert "['invalid'] not found in axis" in message
Esempio n. 26
0
def test_execute_sql_column_not_found_fail():
    """Querying an unknown column raises ``pandasql.PandaSQLException``."""
    iris = util.iris(['class', 'sepalwidth'], size=10)

    operation = ExecuteSQLOperation(
        parameters={'query': 'SELECT unknown FROM ds1'},
        named_inputs={'input data 1': 'df1'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(pandasql.PandaSQLException) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df1': iris})

    message = str(raised.value)
    assert "(sqlite3.OperationalError) no such column: unknown" in message
Esempio n. 27
0
def test_feature_assembler_missing_multiplicity_param_fail():
    """Omitting the 'multiplicity' parameter raises ``KeyError``."""
    iris = util.iris(['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
                     size=10)

    operation = FeatureAssemblerOperation(
        parameters={'attributes': ['sepalwidth', 'petalwidth']},
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(KeyError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df': iris})

    # The error text is expected to name the missing key.
    message = str(raised.value)
    assert "'multiplicity'" in message
Esempio n. 28
0
def test_balanced_split_k_fold_shuffle_stratified_success():
    """
    Stratified, shuffled 3-fold split over three equally sized classes.

    Same as balanced_stratified, shuffle doesn't make a difference if
    stratified (original author's remark).
    """
    iris = util.iris(['class'], size=30)
    # .loc slices are inclusive; row 20 ends up as 'Iris-virginica'.
    iris.loc[10:20, 'class'] = 'Iris-versicolor'
    iris.loc[20:30, 'class'] = 'Iris-virginica'
    reference = iris.copy()

    operation = SplitKFoldOperation(
        parameters={
            'n_splits': 3,
            'shuffle': 1,
            'attribute': 'groups',
            'stratified': 1,
            'random_state': 0,
            'column': ['class'],
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )
    outcome = util.execute(operation.generate_code(), {'df': iris})

    fold_labels = pd.DataFrame({
        "groups": [
            0, 2, 1, 2, 0, 1, 2, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 1, 2, 1, 0, 1,
            2, 1, 1, 0, 0, 2, 2, 2
        ]
    })
    expected = pd.concat([reference, fold_labels], axis=1)
    shares = percent(outcome['out'], 3)

    # Expected values from ``percent`` per class, in group0..group2 order.
    expected_shares = {
        'Iris-setosa': [40, 30, 30],
        'Iris-versicolor': [30, 40, 30],
        'Iris-virginica': [30, 30, 40],
    }
    for label, wanted in expected_shares.items():
        observed = [shares[group][label]
                    for group in ('group0', 'group1', 'group2')]
        assert observed == wanted

    assert outcome['out'].equals(expected)
Esempio n. 29
0
def test_add_columns_invalid_aliases_param_value_fail():
    """An 'aliases' value without a comma raises ``IndexError``."""
    left = util.iris(['sepallength', 'sepalwidth'], size=10)
    right = util.iris(['sepallength', 'sepalwidth'], size=10)

    operation = AddColumnsOperation(
        parameters={'aliases': 'invalid'},
        named_inputs={'input data 1': 'df1', 'input data 2': 'df2'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(IndexError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df1': left, 'df2': right})

    message = str(raised.value)
    assert 'list index out of range' in message
Esempio n. 30
0
def test_aggregation_non_numeric_attributes_fail():
    """Applying the full function list to a text column raises DataError.

    Unlike the ``_success`` variant, ``return_funcs`` is called without
    ``drop``; the asserted message is "No numeric types to aggregate".
    """
    titanic = util.titanic(['homedest'], size=150)

    operation = AggregationOperation(
        parameters={
            'attributes': ['homedest'],
            'function': return_funcs('homedest'),
        },
        named_inputs={'input data': 'df'},
        named_outputs={'output data': 'out'},
    )

    with pytest.raises(pd.core.base.DataError) as raised:
        generated = operation.generate_code()
        util.execute(generated, {'df': titanic})

    message = str(raised.value)
    assert "No numeric types to aggregate" in message