def test_discretize_features_one_feature():
    """Discretize a single feature into three bins and check the encoding.

    Only ``test_feature_1`` is passed to ``discretize_features``; the result
    is expected to hold one row per record with a single 1.0 marking the bin
    that record's value falls into.
    """
    frame = pd.DataFrame({
        'test_feature_1': [1, 1, 3, 5, 5],
        'test_feature_2': [1, 1, 3, 5, 5],
        'test_class': [1, 1, 1, 0, 0],
    })

    wrangler = DataWrangling()
    binned = wrangler.discretize_features(
        data_frame=frame,
        features=['test_feature_1'],
        n_bins=3,
    )

    # Values 1, 1, 3, 5, 5 -> first, first, second, third, third bin.
    expected = np.array([
        [1., 0., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
    ])
    assert np.array_equal(binned, expected)
def test_fill_nul_vals():
    """Check that fill_nul_vals() replaces zero feature values.

    The zeros in both feature columns are expected to come back replaced by
    a fill value that depends on the record's class (see ``expected``),
    while non-zero entries and the class column stay as they were.
    """
    frame = pd.DataFrame({
        'test_feature_1': [0, 1, 2, 1, 0],
        'test_feature_2': [0, 5, 0, 3, 2],
        'test_class': [1, 1, 1, 0, 0],
    })

    wrangler = DataWrangling()
    filled = wrangler.fill_nul_vals(
        data_frame=frame,
        class_column='test_class',
        features=['test_feature_1', 'test_feature_2'],
    )

    # Feature columns are float because the fill values are floats;
    # DataFrame.equals() also compares dtypes, so this matters.
    expected = pd.DataFrame({
        'test_feature_1': [1.5, 1.0, 2.0, 1.0, 1.0],
        'test_feature_2': [5.0, 5.0, 5.0, 3.0, 2.0],
        'test_class': [1, 1, 1, 0, 0],
    })

    assert filled.equals(expected)
def test_split_train_test():
    """Check the keys and value types returned by split_train_test()."""
    frame = pd.DataFrame({
        'test_feature_1': [1, 1, 3, 5, 5],
        'test_feature_2': [1, 1, 3, 5, 5],
        'test_class': [1, 1, 1, 0, 0],
    })
    feature_cols = ['test_feature_1', 'test_feature_2']
    label_col = 'test_class'

    wrangler = DataWrangling()
    splits = wrangler.split_train_test(
        x_data=frame[feature_cols].to_numpy(),
        y_labels=frame[label_col].to_numpy(),
        features=feature_cols,
        test_size=0.33,
    )

    # Every expected split must be present ...
    for key in ('x_train', 'y_train', 'x_test', 'y_test'):
        assert key in splits
    # ... and every returned value must be a pandas container.
    for part in splits.values():
        assert isinstance(part, (pd.Series, pd.DataFrame))
Esempio n. 4
0
def test_sklearn_prediction_score():
    """Check that the sklearn prediction score is a float within [0, 1].

    Reads the module-level fixtures ``data``, ``features``,
    ``no_zero_features``, ``class_col``, ``n_bins`` and ``test_size``; they
    are only read here, so no ``global`` declaration is required.
    """
    obj = DataWrangling()
    splits = preprocess_data(obj=obj,
                             data=data,
                             features=features,
                             no_zero_features=no_zero_features,
                             class_col=class_col,
                             n_bins=n_bins,
                             test_size=test_size)
    prediction_score = get_sklearn_prediction_score(splits)

    assert isinstance(prediction_score, float)
    assert 0.0 <= prediction_score <= 1.0
Esempio n. 5
0
def test_preprocess_data():
    """Check the structure and shapes of the splits from preprocess_data().

    Reads the module-level fixtures ``data``, ``features``,
    ``no_zero_features``, ``class_col``, ``n_bins`` and ``test_size``; they
    are only read here, so no ``global`` declaration is required.
    """
    obj = DataWrangling()
    return_val = preprocess_data(obj=obj,
                                 data=data,
                                 features=features,
                                 no_zero_features=no_zero_features,
                                 class_col=class_col,
                                 n_bins=n_bins,
                                 test_size=test_size)

    expected_keys = ['x_train', 'y_train', 'x_test', 'y_test']
    assert all(key in return_val for key in expected_keys)
    # Only the values matter for the type check, so iterate them directly.
    assert all(
        isinstance(val, (pd.Series, pd.DataFrame))
        for val in return_val.values())

    # Expected sizes for the module-level fixture: 4 training rows,
    # 2 test rows and 6 columns in the training features.
    assert return_val['x_train'].shape[0] == 4
    assert return_val['x_test'].shape[0] == 2
    assert return_val['x_train'].shape[1] == 6
Esempio n. 6
0
def get_data(n_bins: int) -> dict:
    """Load the test data set, preprocess it and return train/test splits.

    The features are cast to float64, passed through ``fill_nul_vals`` and
    ``discretize_features``, and the resulting columns are named
    ``<feature><bin>`` with bins numbered from 1 to ``n_bins``. The
    ``Response`` labels are taken from a direct read of the CSV file.

    Args:
        n_bins (int): Number of bins to discretize each feature into.

    Returns:
        dict: Result of ``split_train_test`` with a test size of 0.25.

    """
    label = 'Response'
    raw_frame = pd.read_csv(TEST_DATA_PATH)

    wrangler = DataWrangling(data_path=TEST_DATA_PATH)
    wrangler.load_data()
    feature_names = [
        'Pglucose', 'Preg', 'Dbp', 'BMI', 'Dpfunc', 'Age', 'Insulin', 'Skin'
    ]

    float_frame = wrangler.data[feature_names + [label]].astype(np.float64)
    filled = wrangler.fill_nul_vals(data_frame=float_frame,
                                    class_column=label,
                                    features=feature_names)

    # One output column per (feature, bin) pair, feature-major order.
    binned_names = [
        name + str(bin_idx)
        for name in feature_names
        for bin_idx in range(1, n_bins + 1)
    ]

    binned = wrangler.discretize_features(data_frame=filled,
                                          features=feature_names,
                                          n_bins=n_bins)
    binned_frame = pd.DataFrame(binned, columns=binned_names)
    binned_frame[label] = raw_frame[label]

    return wrangler.split_train_test(
        x_data=binned_frame[binned_names].to_numpy(),
        y_labels=binned_frame[label].to_numpy(),
        features=binned_names,
        test_size=0.25)
def test_load_data():
    """Check that load_data() exposes the same columns as the CSV file.

    The column labels are compared as plain lists so that a differing
    number of columns fails the assertion cleanly instead of raising from
    the element-wise comparison.
    """
    data_path = 'src/data/data.csv'
    obj = DataWrangling(data_path=data_path)
    obj.load_data()

    expected_columns = list(pd.read_csv(data_path).columns)
    assert list(obj.data.columns) == expected_columns