Exemple #1
0
def test_output_data_frame():
    '''
    Test that output data frame has expected values when the original data is stored in a data frame
    '''
    df1 = pd.DataFrame(
        [[3, 2, np.nan, 1], [3, 6, 2, np.nan], [3, np.nan, np.nan, 8]],
        columns=list('ABCD'))
    adf1 = impute_missing(df1, 'B', "CC", np.nan)
    assert adf1.equals(
        pd.DataFrame([[3, 2.0, np.nan, 1], [3, 6.0, 2, np.nan]],
                     columns=list('ABCD')))

    df2 = pd.DataFrame([[3, 2, "", 1], [3, 4, 2, ""], [3, "", "", 8]],
                       columns=list('ABCD'))
    adf2 = impute_missing(df2, 'B', "CC", "")
    assert adf2.equals(
        pd.DataFrame([[3, 2.0, "", 1], [3, 4.0, 2, ""]], columns=list('ABCD')))

    df3 = pd.DataFrame([[3, 2, "?", 1], [3, 4, 2, "?"], [3, "?", "?", 8]],
                       columns=list('ABCD'))
    adf3 = impute_missing(df3, 'B', "CC", "?")
    assert adf3.equals(
        pd.DataFrame([[3, 2.0, "?", 1], [3, 4.0, 2, "?"]],
                     columns=list('ABCD')))

    adf4 = impute_missing(df1, 'B', "MIP", np.nan)
    assert adf4.equals(
        pd.DataFrame([[3, 2.0, np.nan, 1.0], [3, 6.0, 2.0, np.nan],
                      [3, 4.0, np.nan, 8.0]],
                     columns=list('ABCD')))

    adf5 = impute_missing(df2, 'B', "MIP", "")
    assert adf5.equals(
        pd.DataFrame([[3, 2.0, "", 1.0], [3, 4.0, 2.0, ""], [3, 3.0, "", 8.0]],
                     columns=list('ABCD')))

    adf6 = impute_missing(df3, 'B', "MIP", "?")
    assert adf6.equals(
        pd.DataFrame(
            [[3, 2.0, "?", 1.0], [3, 4.0, 2.0, "?"], [3, 3.0, "?", 8.0]],
            columns=list('ABCD')))

    adf7 = impute_missing(df1, 'B', "DIP", np.nan)
    assert adf7.equals(
        pd.DataFrame(
            [[3, 2.0, np.nan, 1], [3, 6.0, 2, np.nan], [3, 4.0, np.nan, 8]],
            columns=list('ABCD')))

    adf8 = impute_missing(df2, 'B', "DIP", "")
    assert adf8.equals(
        pd.DataFrame([[3, 2.0, "", 1], [3, 4.0, 2, ""], [3, 3.0, "", 8]],
                     columns=list('ABCD')))

    adf9 = impute_missing(df3, 'B', "DIP", "?")
    assert adf9.equals(
        pd.DataFrame([[3, 2.0, "?", 1], [3, 4.0, 2, "?"], [3, 3.0, "?", 8]],
                     columns=list('ABCD')))
Exemple #2
0
def test_output_matrix():
    '''
    Test that output data frame has expected values when the original data is stored in a matrix
    '''
    dfm1 = impute_missing(np.matrix([[1, 2], [3, np.nan]]), 'b', "CC", np.nan)
    assert dfm1.equals(pd.DataFrame([[1.0, 2.0]], columns=list('ab')))

    dfm2 = impute_missing(np.matrix([[1, 2], [3, ""]]), 'b', "MIP", "")
    dfm2['a'] = pd.to_numeric(dfm2['a'])
    assert dfm2.equals(pd.DataFrame([[1, 2.0], [3, 2.0]], columns=list('ab')))

    dfm3 = impute_missing(np.matrix([[1, 2], [3, "?"]]), 'b', "DIP", "?")
    dfm3['a'] = pd.to_numeric(dfm2['a'])
    assert dfm3.equals(pd.DataFrame([[1, 2.0], [3, 2.0]], columns=list('ab')))
Exemple #3
0
def test_input_types():
    '''
    Check input types of the function
    '''
    with pytest.raises(TypeError):  # column name must be a string
        impute_missing(
            pd.DataFrame([[np.nan, 2, 1], [3, np.nan, 1], [np.nan, np.nan, 5]],
                         columns=list('abc')), 2, "DIP", np.NaN)

    with pytest.raises(
            TypeError):  # the specified column name is not in the data frame
        impute_missing(
            pd.DataFrame([[np.nan, 2, 1], [3, np.nan, 1], [np.nan, np.nan, 5]],
                         columns=list('abc')), "d", "CC", np.nan)

    with pytest.raises(TypeError):  # method is not applicable
        impute_missing(
            pd.DataFrame([[np.nan, 2, 1], [3, np.nan, 1], [np.nan, np.nan, 5]],
                         columns=list('abc')), "b", "multi", np.nan)

    with pytest.raises(
            TypeError
    ):  # missing value format is not supported, expected one of a blank space, a question mark and np.NaN, np.nan, np.NAN
        impute_missing(
            pd.DataFrame([[0, 2, 1], [3, 0, 1], [0, 0, 5]],
                         columns=list('abc')), "b", "MIP", 0)
Exemple #4
0
def test_output_type():
    '''
    Test that output type is a dataframe
    '''
    assert isinstance(
        impute_missing(np.matrix([[1, 2], [3, np.nan], [5, 6]]), 'b', "CC",
                       np.nan), pd.DataFrame)
Exemple #5
0
def no_change():
    """
    Test if there is no change to the dataframe after imputation.
    """
    meds = ["CC", "MIP"]
    feature = 'col1'
    result = compare_model(df, feature, methods=meds, missing_val_char=np.nan)
    a = df[feature].describe()
    test = pd.DataFrame(data=a)

    for method in meds:
        df_after = impute_missing(df, feature, method, "NaN")
        b = df_after[feature].describe()
        b = pd.DataFrame(data=b)
        name = feature + '_after_' + method
        if not pd.DataFrame(data=a).equals(b):
            raise ValueError(
                "The data information does not change after imputation")

        test[name] = b[feature]
Exemple #6
0
def test_compare():
    """
      Unit test for the `compare_model()` function
      It will create a datafame comparing all the statistical information of the dataframe before and after imputation
      and between several methods of imputation, then compare it with the result of compare_model()
      Return error message if the two results are not the same.
    """
    meds = ("CC", "MIP")
    feature = 'col1'
    a = df2[feature].describe()
    test = pd.DataFrame(data=a)

    for method in meds:
        df_after = impute_missing(df2, feature, method, np.nan)
        b = df_after[feature].describe()
        b = pd.DataFrame(data=b)
        name = feature + '_after_' + method
        test[name] = b[feature]

    if not isinstance(test, pd.DataFrame):
        raise TypeError("Output type must be a dataframe")

    if not test.equals(test):
        raise ValueError("The result has some problem")
Exemple #7
0
def test_output_values():
    '''
    test that the specified column of the output data frame has no missing values
    '''
    assert not impute_missing(np.matrix([[1, 2], [3, np.nan]]), 'b', "CC",
                              np.nan)['b'].isnull().any()