def test_cross_validate_n_predictors():
    """Check the type and upper bound of `cross_validate_n_predictors`' result.

    Reads the FDA-approved CSV, passes it through `data_dropna` restricted to
    the feature and response columns, then calls
    `cross_validate_n_predictors` with a depth of 2 and 100 estimators.  The
    returned value must be an `int` that does not exceed the number of
    feature columns.
    """
    csv_dir = "./formulation/data/"
    csv_name = 'FDA_APPROVED.csv'

    # Load the raw table and show its last few rows
    frame = pd.read_csv(csv_dir + csv_name)
    print(frame.tail(3))

    feature_cols = ['CLogP', 'HBA', 'HBD', 'PSDA']
    response_col = ['Formulation']

    # The same column list is used both as the columns to keep and as the
    # subset handed to data_dropna
    wanted_cols = feature_cols + response_col
    frame = data_dropna(frame, needed_cols=wanted_cols, subset=wanted_cols)
    print(frame.tail(10))

    # Row counts per response category
    print(frame.groupby('Formulation').count())

    # Split into predictor and response frames
    predictors = frame[feature_cols]
    response = frame[response_col]

    chosen = cross_validate_n_predictors(predictors, response, 2, 100)

    assert isinstance(chosen, int)
    assert chosen <= predictors.shape[1]
def test_cross_validate_grid_search():
    """Check the layout and element types of `cross_validate_grid_search`' result.

    Reads the FDA-approved CSV, passes it through `data_dropna` restricted to
    the feature and response columns, and runs `cross_validate_grid_search`
    over two ranges (`range(1, 5)` and `range(1, 200, 50)`).  The result must
    hold 4 entries of length 2, and every element must be an `int`.
    """
    source_dir = "./formulation/data/"
    source_file = 'FDA_APPROVED.csv'

    # Load the raw table and show its last few rows
    table = pd.read_csv(source_dir + source_file)
    print(table.tail(3))

    predictor_names = ['CLogP', 'HBA', 'HBD', 'PSDA']
    response_names = ['Formulation']

    # The same column list is used both as the columns to keep and as the
    # subset handed to data_dropna
    required = predictor_names + response_names
    table = data_dropna(table, needed_cols=required, subset=required)
    print(table.tail(10))

    # Row counts per response category
    print(table.groupby('Formulation').count())

    # Split into predictor and response frames
    predictors = table[predictor_names]
    response = table[response_names]

    # First range is the depth grid, second the tree-count grid
    grid = [range(1, 5), range(1, 200, 50)]
    results = cross_validate_grid_search(grid, predictors, response)

    # Outer length first, then the length of each of the four entries
    assert len(results) == 4
    for row in range(4):
        assert len(results[row]) == 2

    # Every element, addressed by index, has to be a plain int
    row_count = len(results)
    col_count = len(results[0])
    for row in range(row_count):
        for col in range(col_count):
            assert isinstance(results[row][col], int)
def test_predict():
    """Check that `predict` yields an ndarray with one entry per test row.

    Reads the FDA-approved CSV, passes it through `data_dropna`, splits the
    values 80/20 with a fixed seed, fits a `RandomForestClassifier` on the
    training part and calls `predict` twice on the held-out part: once as
    the array returned by the split and once wrapped in a `pd.DataFrame`.
    """
    csv_dir = "./formulation/data/"
    csv_name = 'FDA_APPROVED.csv'

    # Load the raw table and show its last few rows
    frame = pd.read_csv(csv_dir + csv_name)
    print(frame.tail(3))

    feature_cols = ['CLogP', 'HBA', 'HBD', 'PSDA']
    response_col = ['Formulation']

    # The same column list is used both as the columns to keep and as the
    # subset handed to data_dropna
    wanted_cols = feature_cols + response_col
    frame = data_dropna(frame, needed_cols=wanted_cols, subset=wanted_cols)
    print(frame.tail(10))

    # Row counts per response category
    print(frame.groupby('Formulation').count())

    # Split into predictor and response frames
    predictors = frame[feature_cols]
    response = frame[response_col]

    train_X, test_X, train_y, test_y = train_test_split(
        predictors.values,
        response.values,
        test_size=0.20,
        random_state=42,
    )

    forest = RandomForestClassifier(max_depth=2,
                                    n_estimators=100,
                                    random_state=0)
    # The response is selected with a one-element column list, so it is
    # flattened before fitting
    forest.fit(train_X, train_y.flatten())

    expected_rows = test_y.shape[0]

    def check_output(labels):
        # Shared expectations for both input flavours
        assert isinstance(labels, np.ndarray)
        assert labels.shape[0] == expected_rows

    # Array input
    check_output(predict(forest, test_X))

    # DataFrame input built from the same held-out rows
    test_frame = pd.DataFrame(test_X)
    check_output(predict(forest, test_frame))
# Example #4
# 0
def test_data_dropna():
    """
    Exercise `data_dropna` with invalid and valid arguments.

    Two calls are expected to raise `AssertionError`: one with a plain list
    instead of a `pd.DataFrame`, and one with `INPUTS` / `NEEDED` passed in
    swapped positions.  Three valid calls must each return a `pd.DataFrame`
    without missing values in the inspected columns.
    """

    # `data` should be a pd.DataFrame
    try:
        data_dropna([1, 2, None])
    except AssertionError:
        pass
    else:
        raise Exception("Illegal input test failed!")

    # `subset` contains elements that are not in `needed_cols`
    try:
        data_dropna(DATA, INPUTS, NEEDED)
    except AssertionError:
        pass
    else:
        raise Exception("Illegal input test failed!")

    all_dropped = data_dropna(DATA)
    inputs_dropped = data_dropna(DATA, NEEDED, INPUTS)
    needed_dropped = data_dropna(DATA, NEEDED, NEEDED)

    # Each call has to hand back a DataFrame
    for frame in (all_dropped, inputs_dropped, needed_dropped):
        assert isinstance(frame, pd.DataFrame), \
            "Type error. pd.DataFrame expected"

    # Columns to inspect per result; None means the whole frame
    inspections = (
        (all_dropped, None),
        (inputs_dropped, INPUTS),
        (needed_dropped, None),
    )
    for frame, cols in inspections:
        view = frame if cols is None else frame[cols]
        assert True not in view.isna().to_numpy(), \
            "Error. Missing value not dropped"
# Example #5
# 0
    return best_p


# Script entry point: loads the FDA-approved CSV, filters it through
# data_dropna and prepares predictor/response frames plus two parameter
# ranges.
# NOTE(review): the visible lines never use X, y, max_depth or ntrees;
# presumably the script continues beyond this point -- confirm.
if __name__ == '__main__':

    # Location of the input CSV, relative to the working directory
    data_path = "./formulation/data/"
    data_fname = 'FDA_APPROVED.csv'

    # Read the CSV file and show its last 3 rows
    df = pd.read_csv(data_path + data_fname)
    print(df.tail(3))

    # Pass the frame through data_dropna; the same column list is given as
    # both `needed_cols` and `subset`
    columns = ['CLogP', 'HBA', 'HBD', 'PSDA', 'Formulation']
    df = data_dropna(df, needed_cols=columns, subset=columns)
    print(df.tail(10))

    # Print count of each category
    print(df.groupby('Formulation').count())

    # Prepare predictors and response variable
    features = ['CLogP', 'HBA', 'HBD', 'PSDA']
    X = df[features]

    # Selecting with a one-element list keeps y as a DataFrame
    target = ['Formulation']
    y = df[target]

    # Candidate values: depths 1..4 and tree counts 1, 51, 101, 151
    max_depth = range(1, 5)
    ntrees = range(1, 200, 50)