def test_cross_validate_n_predictors():
    """Check that cross_validate_n_predictors returns a valid predictor count."""
    # Load the FDA-approved formulation dataset.
    frame = pd.read_csv("./formulation/data/" + 'FDA_APPROVED.csv')
    print(frame.tail(3))
    # Drop rows with missing values in the columns the model needs.
    needed = ['CLogP', 'HBA', 'HBD', 'PSDA', 'Formulation']
    frame = data_dropna(frame, needed_cols=needed, subset=needed)
    print(frame.tail(10))
    # Show how many samples fall into each formulation category.
    print(frame.groupby('Formulation').count())
    # Split into predictor matrix and response variable.
    predictors = frame[['CLogP', 'HBA', 'HBD', 'PSDA']]
    response = frame[['Formulation']]
    # max_depth=2, n_estimators=100 — same hyper-parameters as before.
    best_p = cross_validate_n_predictors(predictors, response, 2, 100)
    # The best predictor count must be an int no larger than the feature count.
    assert isinstance(best_p, int)
    assert best_p <= predictors.shape[1]
def test_cross_validate_grid_search():
    """Check the shape and element types of the grid-search result matrix."""
    # Load the FDA-approved formulation dataset.
    frame = pd.read_csv("./formulation/data/" + 'FDA_APPROVED.csv')
    print(frame.tail(3))
    # Drop rows with missing values in the columns the model needs.
    needed = ['CLogP', 'HBA', 'HBD', 'PSDA', 'Formulation']
    frame = data_dropna(frame, needed_cols=needed, subset=needed)
    print(frame.tail(10))
    # Show how many samples fall into each formulation category.
    print(frame.groupby('Formulation').count())
    # Split into predictor matrix and response variable.
    predictors = frame[['CLogP', 'HBA', 'HBD', 'PSDA']]
    response = frame[['Formulation']]
    # Hyper-parameter grid: tree depths 1-4 crossed with several forest sizes.
    grid = [range(1, 5), range(1, 200, 50)]
    results = cross_validate_grid_search(grid, predictors, response)
    # Expect a 4x2 matrix whose entries are all integers.
    assert len(results) == 4
    for row in results:
        assert len(row) == 2
        for cell in row:
            assert isinstance(cell, int)
def test_predict():
    """predict() should return an ndarray for both ndarray and DataFrame input."""
    # Load the FDA-approved formulation dataset.
    frame = pd.read_csv("./formulation/data/" + 'FDA_APPROVED.csv')
    print(frame.tail(3))
    # Drop rows with missing values in the columns the model needs.
    needed = ['CLogP', 'HBA', 'HBD', 'PSDA', 'Formulation']
    frame = data_dropna(frame, needed_cols=needed, subset=needed)
    print(frame.tail(10))
    # Show how many samples fall into each formulation category.
    print(frame.groupby('Formulation').count())
    # Split into predictor matrix and response variable.
    predictors = frame[['CLogP', 'HBA', 'HBD', 'PSDA']]
    response = frame[['Formulation']]
    # Fixed random_state keeps the train/test split deterministic.
    X_trn, X_tst, y_trn, y_tst = train_test_split(
        predictors.values, response.values, test_size=0.20, random_state=42)
    # Fit a small reference forest; fixed seed keeps the test deterministic.
    model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
    model.fit(X_trn, y_trn.flatten())
    # Case 1: ndarray input.
    preds = predict(model, X_tst)
    assert isinstance(preds, np.ndarray)
    assert preds.shape[0] == y_tst.shape[0]
    # Case 2: DataFrame input should behave the same way.
    preds_df = predict(model, pd.DataFrame(X_tst))
    assert isinstance(preds_df, np.ndarray)
    assert preds_df.shape[0] == y_tst.shape[0]
def test_data_dropna():
    """Test case for `data_dropna`"""
    # Illegal input: `data` must be a pd.DataFrame -> expect an AssertionError.
    try:
        data_dropna([1, 2, None])
        raise Exception("Illegal input test failed!")
    except AssertionError:
        pass
    # Illegal input: `subset` holding columns outside `needed_cols`.
    try:
        data_dropna(DATA, INPUTS, NEEDED)
        raise Exception("Illegal input test failed!")
    except AssertionError:
        pass
    # Legal invocations: drop across all columns, inputs only, needed only.
    frames = {
        'all': data_dropna(DATA),
        'inputs': data_dropna(DATA, NEEDED, INPUTS),
        'needed': data_dropna(DATA, NEEDED, NEEDED),
    }
    # Every result must come back as a DataFrame.
    for frame in frames.values():
        assert isinstance(frame, pd.DataFrame), \
            "Type error. pd.DataFrame expected"
    # No NaN may survive in the columns each call was asked to clean.
    assert True not in frames['all'].isna().to_numpy(), \
        "Error. Missing value not dropped"
    assert True not in frames['inputs'][INPUTS].isna().to_numpy(), \
        "Error. Missing value not dropped"
    assert True not in frames['needed'].isna().to_numpy(), \
        "Error. Missing value not dropped"
return best_p if __name__ == '__main__': data_path = "./formulation/data/" data_fname = 'FDA_APPROVED.csv' # Read csv file df = pd.read_csv(data_path + data_fname) print(df.tail(3)) # Dropnan columns = ['CLogP', 'HBA', 'HBD', 'PSDA', 'Formulation'] df = data_dropna(df, needed_cols=columns, subset=columns) print(df.tail(10)) # Print count of each category print(df.groupby('Formulation').count()) # Prepare predictors and response variable features = ['CLogP', 'HBA', 'HBD', 'PSDA'] X = df[features] target = ['Formulation'] y = df[target] max_depth = range(1, 5) ntrees = range(1, 200, 50)