Esempio n. 1
0
def test_non_accepted_cat():
    """Check that fill_missing() rejects cat_imp="mean".

    The test expects an AssertionError from the call and fails if the call
    returns normally, so a missing validation cannot slip through unnoticed.
    """
    try:
        fill_missing(X_train,
                     X_test,
                     col_dict,
                     num_imp="median",
                     cat_imp="mean")
    except AssertionError:
        # Expected outcome: the argument was rejected.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing() should raise AssertionError for cat_imp='mean'")
Esempio n. 2
0
def test_bad_col_dict():
    """Check that fill_missing() rejects a column_dict that is a list.

    The test expects an AssertionError from the call and fails if the call
    returns normally.
    """
    try:
        fill_missing(X_train,
                     X_test,
                     column_dict=["cat1", "num1"],
                     num_imp="median",
                     cat_imp="mode")
    except AssertionError:
        # Expected outcome: the list was rejected.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing() should raise AssertionError when column_dict "
            "is a list")
Esempio n. 3
0
def test_bad_dict_keys():
    """Check that fill_missing() rejects a column_dict with an unexpected key.

    The dictionary uses the key 'numerical' where the other tests in this
    file use 'numeric'. The test expects an AssertionError and fails if the
    call returns normally.
    """
    try:
        fill_missing(
            X_train,
            X_test,
            # insert wrong column key
            column_dict={
                'numerical': ['num1'],
                'categorical': ['cat1']
            },
            num_imp="median",
            # "mode" is the categorical method the passing tests use, so the
            # bad key is the only questionable argument in this call.
            cat_imp="mode")
    except AssertionError:
        # Expected outcome: the bad key was rejected.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing() should raise AssertionError for an unexpected "
            "column_dict key")
Esempio n. 4
0
def test_no_name_columns():
    """Check that fill_missing() rejects column names absent from the data.

    Both frames are built from a plain list, so neither has a column called
    'num2' or 'cat1'. The test expects an AssertionError and fails if the
    call returns normally.
    """
    try:
        fill_missing(
            X_train=pd.DataFrame([1.5, 2.5, 3.5, None, 4.5]),
            X_test=pd.DataFrame([1, None, 3, 1, 3]),
            # insert column which is not in the df
            column_dict={
                'numeric': ['num2'],
                'categorical': ['cat1']
            },
            num_imp="median",
            # "mode" is the categorical method the passing tests use, so the
            # unknown columns are the only questionable argument here.
            cat_imp="mode")
    except AssertionError:
        # Expected outcome: the unknown columns were rejected.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing() should raise AssertionError when column_dict "
            "names columns that are not in the dataframes")
Esempio n. 5
0
def test_bad_test_set():
    """Check that fill_missing() rejects an X_test that is a NumPy array.

    X_train is a DataFrame while X_test is np.array(1). The test expects an
    AssertionError and fails if the call returns normally.
    """
    try:
        fill_missing(X_train=pd.DataFrame({
            'cat1': [1, None, 3, 1, 3],
            'num1': [1.5, 2.5, None, 2.0, 2.0]
        }),
                     X_test=np.array(1),
                     column_dict={
                         'numeric': ['num1'],
                         'categorical': ['cat1']
                     },
                     num_imp="median",
                     cat_imp="mode")
    except AssertionError:
        # Expected outcome: the array was rejected.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing() should raise AssertionError when X_test is not "
            "a dataframe")
Esempio n. 6
0
def run_pylaundry():
    """
    Chain categorize, fill_missing, transform_columns and select_features
    on the module-level X_train, X_test and y_train.

    Arguments
    --------
    NA

    Returns
    ------
    Whatever select_features() returns for the transformed training data,
    y_train and n_features=2 (described by the original author as the list
    of final features selected).
    """
    # step 1 - categorize the training columns
    column_types = categorize(df=X_train)

    # step 2 - fill_missing
    imputed = fill_missing(
        X_train, X_test, column_types, num_imp="mean", cat_imp="mode")

    # step 3 - transform_columns
    train_filled, test_filled = imputed['X_train'], imputed['X_test']
    transformed = transform_columns(train_filled, test_filled, column_types)

    # step 4 - feature selection
    return select_features(transformed['X_train'], y_train, n_features=2)
Esempio n. 7
0
def test_diff_columns():
    """Check that fill_missing() rejects train/test frames whose columns differ.

    X_train has columns 'cat1' and 'num2' while X_test has 'cat1' and
    'num1'. The test expects an AssertionError and fails if the call returns
    normally.
    """
    try:
        fill_missing(X_train=pd.DataFrame({
            'cat1': [1, 2, None, 1, 1],
            'num2': [1.5, 2.5, 3.5, None, 4.5]
        }),
                     X_test=pd.DataFrame({
                         'cat1': [1, None, 3, 1, 3],
                         'num1': [1.5, 2.5, None, 2.0, 2.0]
                     }),
                     column_dict={
                         'numeric': ['num1'],
                         'categorical': ['cat1']
                     },
                     num_imp="median",
                     # "mode" is the categorical method the passing tests
                     # use, so the column mismatch is the only questionable
                     # input in this call.
                     cat_imp="mode")
    except AssertionError:
        # Expected outcome: the mismatched frames were rejected.
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing() should raise AssertionError when X_train and "
            "X_test have different columns")
Esempio n. 8
0
def test_mode_cat():
    """Check the value imputed into 'cat1' when cat_imp="mode"."""
    imputed = fill_missing(
        X_train, X_test, col_dict, num_imp="median", cat_imp="mode")
    filled_train = imputed['X_train']
    filled_test = imputed['X_test']

    # The assertions expect the same imputed value, 1, at the checked
    # positions of both the train and the test set.
    assert filled_train["cat1"][2] == 1, (
        "Imputed mode value should be 1 in train set")
    assert filled_test["cat1"][1] == 1, (
        "Imputed mode value should be 1 in test set")
Esempio n. 9
0
def test_mean_num():
    """Check the value imputed into 'num1' when num_imp="mean"."""
    imputed = fill_missing(
        X_train, X_test, col_dict, num_imp="mean", cat_imp="mode")
    filled_train = imputed['X_train']
    filled_test = imputed['X_test']

    # The assertions expect the same imputed value, 3, at the checked
    # positions of both the train and the test set.
    assert filled_train["num1"][3] == 3, (
        "Imputed mean value should be 3.0 in train set")
    assert filled_test["num1"][2] == 3, (
        "Imputed mean value should be 3.0 in test set")
Esempio n. 10
0
def test_output_type():
    """Check the size of fill_missing()'s result and the type of its entries."""
    result = fill_missing(
        X_train, X_test, col_dict, num_imp="mean", cat_imp="mode")
    filled_train = result['X_train']
    filled_test = result['X_test']

    # Two entries are expected, each of them a pandas DataFrame.
    assert len(result) == 2, (
        "Output of fill_missing() should be two dataframes")
    assert isinstance(filled_train, pd.DataFrame), (
        "Training df should be a Pandas DF")
    assert isinstance(filled_test, pd.DataFrame), (
        "Test df should be a Pandas DF")
Esempio n. 11
0
def test_non_numeric_columns():
    """Check mode imputation when the categorical column holds strings."""
    train_frame = pd.DataFrame({
        'cat1': ['a', 'b', None, 'c', 'a'],
        'num1': [1.5, 2.5, 3.5, None, 4.5]
    })
    test_frame = pd.DataFrame({
        'cat1': ['a', 'b', None],
        'num1': [1.5, None, 3.5],
    })
    imputed = fill_missing(X_train=train_frame,
                           X_test=test_frame,
                           column_dict=col_dict,
                           num_imp="mean",
                           cat_imp="mode")
    filled_train = imputed['X_train']
    filled_test = imputed['X_test']

    # 'a' is the most frequent value of 'cat1' in the training frame; the
    # assertions expect it in the missing slot of both frames.
    assert filled_train["cat1"][2] == 'a', (
        "Imputed mode value should be 'a' in train set")
    assert filled_test["cat1"][2] == 'a', (
        "Imputed mode value should be 'a' in test set")
Esempio n. 12
0
def test_median_num():
    """Check the value imputed into 'num1' when num_imp="median"."""
    train_frame = pd.DataFrame({
        'cat1': [1, 2, None, 1, 1],
        'num1': [1.5, 2.5, 3.5, None, 4.5]
    })
    test_frame = pd.DataFrame({
        'cat1': [1, None, 3, 1, 3],
        'num1': [1.5, 2.5, None, 2.0, 2.0]
    })
    columns = {
        'numeric': ['num1'],
        'categorical': ['cat1']
    }
    imputed = fill_missing(X_train=train_frame,
                           X_test=test_frame,
                           column_dict=columns,
                           num_imp="median",
                           cat_imp="mode")
    filled_train = imputed['X_train']
    filled_test = imputed['X_test']

    # The median of the training 'num1' values (1.5, 2.5, 3.5, 4.5) is 3.0;
    # the assertions expect that value in the missing slot of both frames.
    assert filled_train["num1"][3] == 3, (
        "Imputed median value should be 3.0 in train set")
    assert filled_test["num1"][2] == 3, (
        "Imputed median value should be 3.0 in test set")