Esempio n. 1
0
def test_split_seed_param_success():
    """
    Split a 10-row frame with ``seed=-1`` and check both halves row by row.

    Note carried over from the original test: seeds higher than the integer
    limit and lower than zero will be set to 0.
    """
    columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    df = util.iris(columns, size=10)

    # Expected frame: relabel the rows, reorder them by those labels, then
    # attach the index labels the split outputs are compared against.
    expected = df.copy()
    expected.index = [8, 4, 0, 7, 2, 9, 5, 6, 1, 3]
    expected = expected.sort_index(axis=0)
    expected.index = [2, 8, 4, 9, 1, 6, 7, 3, 0, 5]

    instance = SplitOperation(
        parameters={'seed': -1},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    result = util.execute(instance.generate_code(), {'df': df})

    first_half = result['split_1_task_1']
    second_half = result['split_2_task_1']
    assert len(first_half) == 5
    assert len(second_half) == 5
    assert expected.iloc[:5, :].equals(first_half)
    assert expected.iloc[5:10, :].equals(second_half)
Esempio n. 2
0
def test_split_no_output_implies_no_code_success():
    """With no named outputs, ``generate_code()`` must return ``None``."""
    instance = SplitOperation(
        parameters={},
        named_inputs={'input data': 'df'},
        named_outputs={},
    )
    generated = instance.generate_code()
    assert generated is None
Esempio n. 3
0
def test_split_missing_input_implies_no_code_success():
    """With no named inputs, ``generate_code()`` must return ``None``."""
    instance = SplitOperation(
        parameters={},
        named_inputs={},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    generated = instance.generate_code()
    assert generated is None
Esempio n. 4
0
def test_random_split_params_success():
    """
    Compare the code generated for string-valued ``weights`` and ``seed``
    parameters with a hand-written template, at the AST level.
    """
    params = {'weights': '40.0', 'seed': '1234321'}
    inputs = {'input data': 'df1'}
    outputs = {'splitted data 1': 'out1', 'splitted data 2': 'out2'}

    operation = SplitOperation(
        params, named_inputs=inputs, named_outputs=outputs)
    code = operation.generate_code()

    # The template expects weights '40.0' to appear as 0.4 and the seed as
    # the integer 1234321 in the generated source.
    template = """{out1}, {out2} = np.split({input}.sample(frac=1, 
    random_state={seed}), [int({weights}*len({input}))])
    """
    expected_code = template.format(
        input=inputs['input data'],
        out1=outputs['splitted data 1'],
        out2=outputs['splitted data 2'],
        seed=1234321,
        weights='0.4',
    )

    generated_tree = ast.parse(code)
    expected_tree = ast.parse(expected_code)
    result, msg = compare_ast(generated_tree, expected_tree)
    assert result, msg + format_code_comparison(code, expected_code)
Esempio n. 5
0
def test_split_invalid_seed_param_fail():
    """
    A non-numeric seed ('invalid') must surface as a ``NameError`` when the
    code is generated and executed.
    """
    columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    df = util.iris(columns, size=10)

    instance = SplitOperation(
        parameters={'seed': 'invalid'},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )

    with pytest.raises(NameError) as nam_err:
        util.execute(instance.generate_code(), {'df': df})

    message = str(nam_err.value)
    assert "name 'invalid' is not defined" in message
Esempio n. 6
0
def test_split_success():
    """
    Run the operation with a single 'output data' port and check that the
    result equals ``util.iris(size=slice_size)``.
    """
    slice_size = 10
    frame_name = 'df'
    frame = util.iris(
        ['sepallength', 'sepalwidth', 'petalwidth', 'petallength'],
        slice_size)

    instance = SplitOperation(
        parameters={},
        named_inputs={'input data': frame_name},
        named_outputs={'output data': 'out'},
    )
    # Execution namespace maps the input variable name to its dataframe.
    namespace = {frame_name: frame}
    result = util.execute(instance.generate_code(), namespace)

    reference = util.iris(size=slice_size)
    assert result['out'].equals(reference)
Esempio n. 7
0
def test_split_uneven_size_dataframe_success():
    """A 13-row frame with default parameters must split into 6 and 7 rows."""
    columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    df = util.iris(columns, size=13)

    instance = SplitOperation(
        parameters={},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    result = util.execute(instance.generate_code(), {'df': df})

    first_size = len(result['split_1_task_1'])
    assert first_size == 6
    second_size = len(result['split_2_task_1'])
    assert second_size == 7
Esempio n. 8
0
def test_split_weights_param_success():
    """With ``weights=36`` a 50-row frame must split into 18 and 32 rows."""
    columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    df = util.iris(columns, size=50)

    instance = SplitOperation(
        parameters={'weights': 36},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    result = util.execute(instance.generate_code(), {'df': df})

    first_size = len(result['split_1_task_1'])
    assert first_size == 18
    second_size = len(result['split_2_task_1'])
    assert second_size == 32
Esempio n. 9
0
def test_split_one_row_dataframe_success():
    """
    For a one-row frame with default parameters, the second split must hold
    that single row unchanged.
    """
    columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    df = util.iris(columns, size=1)
    # Snapshot of the input, used as the comparison baseline.
    test_df = df.copy()

    instance = SplitOperation(
        parameters={},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    result = util.execute(instance.generate_code(), {'df': df})

    assert result['split_2_task_1'].equals(test_df)
    assert len(result['split_2_task_1']) == 1
Esempio n. 10
0
def test_split_randomness_success():
    """
    Concatenating the two splits, first then second, must not reproduce the
    input frame.
    """
    columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
    df = util.iris(columns, size=10)
    # Snapshot of the input, used as the comparison baseline.
    test_df = df.copy()

    instance = SplitOperation(
        parameters={},
        named_inputs={'input data': 'df'},
        named_outputs={
            'split 1': 'split_1_task_1',
            'split 2': 'split_2_task_1',
        },
    )
    result = util.execute(instance.generate_code(), {'df': df})

    pieces = [result['split_1_task_1'], result['split_2_task_1']]
    recombined = pd.concat(pieces)
    assert not recombined.equals(test_df)
Esempio n. 11
0
def test_split_invalid_weights_param_fail():
    """
    Constructing the operation with a non-numeric ``weights`` value must
    raise ``ValueError`` carrying the float-conversion message.
    """
    with pytest.raises(ValueError) as val_err:
        SplitOperation(
            parameters={'weights': 'invalid'},
            named_inputs={'input data': 'df'},
            named_outputs={
                'split 1': 'split_1_task_1',
                'split 2': 'split_2_task_1',
            },
        )

    message = str(val_err.value)
    assert "could not convert string to float: 'invalid'" in message