def test_split_train_test():
    """Check the keys and value types returned by split_train_test().

    A five-row frame with two feature columns and one class column is
    built, its feature matrix and labels are handed to
    ``DataWrangling.split_train_test`` with ``test_size=0.33``, and the
    result is checked for the four expected split keys and for every
    value being a pandas Series or DataFrame.
    """
    column_names = ('test_feature_1', 'test_feature_2', 'test_class')
    # One tuple per record, values listed in column_names order.
    rows = [
        (1, 1, 1),
        (1, 1, 1),
        (3, 3, 1),
        (5, 5, 0),
        (5, 5, 0),
    ]
    recs = [dict(zip(column_names, row)) for row in rows]
    test_df = pd.DataFrame(recs)

    feature_cols = list(column_names[:2])
    label_col = column_names[2]

    wrangler = DataWrangling()
    return_val = wrangler.split_train_test(
        x_data=test_df[feature_cols].to_numpy(),
        y_labels=test_df[label_col].to_numpy(),
        features=feature_cols,
        test_size=0.33)

    # Every split name must be present in the returned mapping.
    missing_keys = [
        key for key in ('x_train', 'y_train', 'x_test', 'y_test')
        if key not in return_val
    ]
    assert not missing_keys

    # Every returned value must be a pandas container.
    for _, split in return_val.items():
        assert isinstance(split, (pd.Series, pd.DataFrame))
# Example no. 2
# 0
def get_data(n_bins: int) -> dict:
    """Load the test data set, preprocess it and split it for training.

    The CSV at ``TEST_DATA_PATH`` is loaded through ``DataWrangling``, the
    selected feature columns plus ``Response`` are cast to ``float64``,
    passed through ``fill_nul_vals`` and then ``discretize_features``. The
    discretized columns are named ``<feature><bin>`` with bins numbered
    from 1 to ``n_bins``. Labels are taken from a separate plain read of
    the same CSV, and 25% of the rows are requested as the test split.

    Args:
        n_bins (int): Number of bins to discretize features into.

    Returns:
        dict: The value returned by ``DataWrangling.split_train_test`` for
            the discretized features and the ``Response`` labels.

    """
    label_col = 'Response'
    raw_frame = pd.read_csv(TEST_DATA_PATH)

    wrangler = DataWrangling(data_path=TEST_DATA_PATH)
    wrangler.load_data()

    features = [
        'Pglucose', 'Preg', 'Dbp', 'BMI', 'Dpfunc', 'Age', 'Insulin', 'Skin'
    ]
    selected = wrangler.data[features + [label_col]].astype(np.float64)
    filled = wrangler.fill_nul_vals(data_frame=selected,
                                    class_column=label_col,
                                    features=features)

    # One output column per (feature, bin) pair, grouped by feature.
    binned_names = [
        feature + str(bin_no)
        for feature in features
        for bin_no in range(1, n_bins + 1)
    ]
    binned = wrangler.discretize_features(data_frame=filled,
                                          features=features,
                                          n_bins=n_bins)
    binned_frame = pd.DataFrame(binned, columns=binned_names)
    # Labels come from the untouched CSV read, not the float64-cast copy.
    binned_frame[label_col] = raw_frame[label_col]

    return wrangler.split_train_test(
        x_data=binned_frame[binned_names].to_numpy(),
        y_labels=binned_frame[label_col].to_numpy(),
        features=binned_names,
        test_size=0.25)