def main():
    """Train and evaluate bradykinesia-score prediction models on the practice visits data set."""
    # Fix the RNG so every run is reproducible
    np.random.seed(0)

    # Load the training set from file
    visits = pd.read_csv("data/all_visits_practice_2.csv")

    # Preliminary data diagnostics
    mL.describe_data(data=visits, describe=True, info=True, value_counts=["ONOFF", "NP3BRADY"],
                     description="PRELIMINARY DATA DIAGNOSTICS:")

    # Encode EVENT_ID to numeric
    mL.clean_data(data=visits, encode_man={"EVENT_ID": {"SC": 0, "V04": 4, "V06": 6, "V10": 10}})

    # Keep only the "off" observations
    visits = visits[visits["ONOFF"] == 0]

    # Drop the class that has only a single sample
    visits = visits[visits.NP3BRADY != 4]

    # Model inputs and prediction target
    feature_names = ["TIME_PASSED", "VISIT_NOW", "CAUDATE_R", "CAUDATE_L",
                     "PUTAMEN_R", "PUTAMEN_L", "SCORE_NOW"]
    outcome = "SCORE_NEXT"

    # Generate new features
    visits = generate_features(data=visits, predictors=feature_names, target=outcome, id_name="PATNO",
                               score_name="NP3BRADY", visit_name="EVENT_ID")

    # Diagnostics after feature generation
    mL.describe_data(data=visits, info=True, describe=True, value_counts=["VISIT_NOW", "SCORE_NEXT"],
                     description="AFTER FEATURE GENERATION:")

    # Univariate feature selection
    mL.describe_data(data=visits, univariate_feature_selection=[feature_names, outcome])

    # Candidate estimators and their display names (kept in lock-step order)
    models = [RandomForestClassifier(n_estimators=1000, min_samples_split=50, min_samples_leaf=2, oob_score=True),
              LogisticRegression(),
              SVC(probability=True),
              GaussianNB(),
              MultinomialNB(),
              BernoulliNB(),
              KNeighborsClassifier(n_neighbors=25),
              GradientBoostingClassifier(n_estimators=10, max_depth=3)]
    model_names = ["Random Forest", "Logistic Regression", "SVM", "Gaussian Naive Bayes",
                   "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "kNN", "Gradient Boosting"]

    # Random-forest grid-search space
    # NOTE(review): assigned but never passed to mL.metrics in this function — appears to be
    # kept for reference only; confirm before removing.
    grid_search_params = [{"n_estimators": [50, 500, 1000],
                           "min_samples_split": [25, 50, 75],
                           "min_samples_leaf": [2, 15, 25, 50]}]

    # Build the weighted soft-voting ensemble and add it as a ninth candidate
    voting_ens = mL.ensemble(algs=models, alg_names=model_names,
                             ensemble_name="Weighted ensemble of RF, LR, SVM, GNB, KNN, and GB",
                             in_ensemble=[True, True, True, True, False, False, True, True],
                             weights=[3, 2, 1, 3, 1, 3], voting="soft")
    models.append(voting_ens["alg"])
    model_names.append(voting_ens["name"])

    # Report metrics for every candidate; the detailed classification report and
    # confusion matrix are requested only for the ensemble (last entry)
    mL.metrics(data=visits, predictors=feature_names, target=outcome, algs=models, alg_names=model_names,
               feature_importances=[True], base_score=[True], oob_score=[True],
               cross_val=[True] * 9,
               split_accuracy=[True] * 9,
               split_classification_report=[False] * 8 + [True],
               split_confusion_matrix=[False] * 8 + [True])
def run(preprocess_data, cohorts, target, score_name, feature_elimination_n, gen_filename, gen_action,
        gen_updrs_subsets, gen_time, gen_future, gen_milestones, gen_milestone_features_values, gen_slopes,
        predictors_filename, predictors_action, feature_importance_n, grid_search_action, grid_search_results,
        print_results, results_filename, prediction_range, range_target, range_target_description, add_predictors,
        drop_predictors):
    """End-to-end PPMI modeling pipeline: preprocess raw visit data (or load the cached
    clean CSV), eliminate features/rows, generate modeling features, optionally grid-search
    and select predictors by feature importance (recursing once with the selected
    predictors), then compute and optionally file the final model metrics.

    Parameter groups (as used by the visible code):
      preprocess_data       -- True: rebuild data/PPMI_Clean_Data.csv from raw files;
                               False: read the cached file.
      cohorts               -- APPRDX cohort labels to include (matched against all_pats.csv).
      target, score_name    -- prediction target column and score column for feature generation.
      feature_elimination_n -- fraction for feature/row elimination; None triggers a search
                               over 0.025..0.975 maximizing feature_row_selection's score.
      gen_*                 -- passed through to generate_features().
      predictors_filename   -- CSV the selected predictors are written to.
      predictors_action     -- True: run predictor optimization, then recurse with the result;
                               False: evaluate using add_predictors as the predictor list.
      feature_importance_n  -- importance threshold for keeping a predictor.
      grid_search_action / grid_search_results -- run grid search / print its results string.
      print_results         -- print progress/metrics instead of writing the results file.
      results_filename      -- CSV (appended to) for the metrics row(s) when not printing.
      prediction_range, range_target, range_target_description -- labeling for the results row.
      add_predictors        -- extra predictors to keep; NOTE(review): this list is mutated
                               (extend) before the recursive call, which is visible to the
                               caller -- confirm that is intended.
      drop_predictors       -- columns excluded from dummy-encoding and from the predictor set.
    """
    # Initialize empty add_predictors
    if add_predictors is None:
        add_predictors = []

    # Data keys: identifier/date columns that must survive column pruning
    data_keys = ["PATNO", "EVENT_ID", "INFODT", "PDDXDT", "SXDT", "BIRTHDT.x", "HAS_PD", target]

    # Target keys: the score column for future/slope targets, else the milestone feature
    # names when milestones are generated, else nothing (chained conditional expression)
    target_keys = [score_name] if gen_future or gen_slopes else [
        x[0] for x in gen_milestone_features_values] if gen_milestones else []

    # Add target keys to data keys
    data_keys.extend(target_keys)

    # TODO: Create data_preprocessing() function for all of this data preprocessing
    if preprocess_data:
        # Create the data frames from files
        # NOTE(review): np.warnings is a deprecated alias for the stdlib warnings module
        # (removed in newer NumPy releases) -- consider importing warnings directly.
        with np.warnings.catch_warnings():
            np.warnings.simplefilter("ignore")
            all_patients = pd.read_csv("data/all_pats.csv")
            all_visits = pd.read_csv("data/all_visits.csv")
            all_updrs = pd.read_csv("data/all_updrs.csv")

        # Enrolled cohorts patients: PATNOs whose APPRDX matches any requested cohort
        # (element-wise OR across the per-cohort boolean arrays) and who are enrolled
        pd_control_patients = all_patients.loc[
            (np.bitwise_or.reduce(np.array([(all_patients["APPRDX"] == cohort) for cohort in cohorts]))) & (
                all_patients["ENROLL_STATUS"] == "Enrolled"), "PATNO"].unique()

        # Data for these patients: visits left-joined with UPDRS totals and patient info
        pd_control_data = all_visits[all_visits["PATNO"].isin(pd_control_patients)].merge(
            all_updrs[["PATNO", "EVENT_ID", "TOTAL"]], on=["PATNO", "EVENT_ID"], how="left").merge(
            all_patients, on="PATNO", how="left", suffixes=["_x", ""])

        # Only include "off" data
        pd_control_data = pd_control_data[pd_control_data["PAG_UPDRS3"] == "NUPDRS3"]

        # NOTE(review): the block below is disabled logic that merged Screening (SC) visit
        # data into Baseline (BL) rows where BL values were null -- retained for reference.
        # # Merge SC data onto BL data
        # sc_bl_merge = pd_control_data[pd_control_data["EVENT_ID"] == "BL"].merge(
        #     pd_control_data[pd_control_data["EVENT_ID"] == "SC"], on="PATNO", how="left", suffixes=["", "_SC_ID"])
        #
        # # Remove SC data that already belongs to BL
        # pd_control_data.loc[pd_control_data["EVENT_ID"] == "BL"] = sc_bl_merge.drop(
        #     [col for col in sc_bl_merge.columns if col[-6:] == "_SC_ID"], axis=1).values
        #
        # # Initiate progress
        # prog = Progress(0, len(pd_control_data["PATNO"].unique()), "Merging Screening Into Baseline", print_results)
        #
        # # Use SC data where BL is null
        # for patient in pd_control_data["PATNO"].unique():
        #     if not pd_control_data[(pd_control_data["PATNO"] == patient) & (pd_control_data["EVENT_ID"] == "SC")].empty:
        #         for column in pd_control_data.keys():
        #             if (pd_control_data.loc[(pd_control_data["PATNO"] == patient) & (
        #                     pd_control_data["EVENT_ID"] == "BL"), column].isnull().values.all()) and (
        #                     pd_control_data.loc[(pd_control_data["PATNO"] == patient) & (
        #                         pd_control_data["EVENT_ID"] == "SC"), column].notnull().values.any()):
        #                 pd_control_data.loc[
        #                     (pd_control_data["PATNO"] == patient) & (pd_control_data["EVENT_ID"] == "BL"), column] = \
        #                     max(pd_control_data.loc[
        #                         (pd_control_data["PATNO"] == patient) & (
        #                             pd_control_data["EVENT_ID"] == "SC"), column].tolist())
        #     # Update progress
        #     prog.update_progress()

        # Remove SC rows
        pd_control_data = pd_control_data[pd_control_data["EVENT_ID"] != "SC"]

        # Drop duplicates based on PATNO and EVENT_ID, keep only first
        pd_control_data = pd_control_data.drop_duplicates(subset=["PATNO", "EVENT_ID"], keep="first")

        # Encode to numeric: auto-encode categorical columns, map visit IDs to month numbers
        mL.clean_data(data=pd_control_data, encode_auto=["HANDED", "PAG_UPDRS3"], encode_man={
            "EVENT_ID": {"BL": 0, "V01": 1, "V02": 2, "V03": 3, "V04": 4, "V05": 5, "V06": 6,
                         "V07": 7, "V08": 8, "V09": 9, "V10": 10, "V11": 11, "V12": 12}})

        # Create HAS_PD column: 1 for the PD-diagnosed cohorts, 0 otherwise
        pd_control_data["HAS_PD"] = 0
        pd_control_data.loc[(pd_control_data["APPRDX"] == "PD") | (pd_control_data["APPRDX"] == "GRPD") | (
            pd_control_data["APPRDX"] == "GCPD"), "HAS_PD"] = 1

        # Convert remaining categorical data to binary columns (dummy-encode every
        # non-numeric column that is not explicitly excluded via drop_predictors)
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        dummy_features = [item for item in pd_control_data.columns.values if item not in list(
            pd_control_data.select_dtypes(include=numerics).columns.values) + drop_predictors]
        pd_control_data = pd.get_dummies(pd_control_data, columns=dummy_features)

        # Controls have missing PDDXDT and SXDT, set to arbitrary date
        pd_control_data.loc[pd_control_data["HAS_PD"] == 0, "PDDXDT"] = pd.to_datetime("1/1/1800")
        pd_control_data.loc[pd_control_data["HAS_PD"] == 0, "SXDT"] = pd.to_datetime("1/1/1800")

        # Cache the cleaned data for subsequent runs with preprocess_data=False
        pd_control_data.to_csv("data/PPMI_Clean_Data.csv", index=False)
    else:
        # Use preprocessed data
        pd_control_data = pd.read_csv("data/PPMI_Clean_Data.csv")

    # Convert to correct dtypes (non-numeric entries become NaN)
    pd_control_data[["PATNO", "EVENT_ID"]] = pd_control_data[["PATNO", "EVENT_ID"]].apply(
        pd.to_numeric, errors="coerce")

    if predictors_action:
        if print_results:
            print("Optimizing Predictors . . .")
        # Drop unused columns
        # NOTE(review): positional axis argument (.drop(column, 1)) is deprecated and
        # removed in pandas 2.0 -- should be .drop(columns=column).
        for column in pd_control_data.keys():
            if (column in drop_predictors) and (column not in data_keys):
                pd_control_data = pd_control_data.drop(column, 1)
    else:
        # Drop unused columns: keep only requested predictors plus the data keys
        # (set intersection discards any requested column absent from the data)
        pd_control_data = pd_control_data[list(
            set(add_predictors + data_keys) & set(pd_control_data.columns.values.tolist()))]

    if print_results:
        # Print number patients and features before feature elimination
        # (patients counted as rows at baseline, i.e. EVENT_ID == 0 after encoding)
        print("BEFORE FEATURE ELIMINATION: Patients: {}, Features: {}".format(
            len(pd_control_data[pd_control_data["EVENT_ID"] == 0]), len(pd_control_data.keys())))

    # NOTE(review): debug dump -- looks like leftover development output; confirm before removing
    pd_control_data.to_csv("TEST.csv")

    # Perform optimal feature elimination: search fractions 0.025 .. 0.975 in steps of
    # 0.025 for the n that maximizes feature_row_selection's score
    if feature_elimination_n is None:
        feature_elimination_n = max([x / 1000 for x in range(25, 1000, 25)],
                                    key=lambda n: feature_row_selection(pd_control_data, n, data_keys, target_keys,
                                                                        True, True))
        if print_results:
            print("\rFeature Elimination N: {}\n".format(feature_elimination_n))

    # Feature/row elimination
    pd_control_data = feature_row_selection(pd_control_data, feature_elimination_n, data_keys, target_keys)

    if (not predictors_action) and print_results:
        # Print number patients and features after feature elimination
        print("AFTER FEATURE ELIMINATION: Patients: {}, Features: {}".format(
            len(pd_control_data[pd_control_data["EVENT_ID"] == 0]), len(pd_control_data.keys())))

    # Select all features in the data set
    all_data_features = list(pd_control_data.columns.values)

    # NOTE(review): another debug dump -- confirm before removing
    pd_control_data.to_csv("testttttt.csv")

    # Generate features (and update all features list)
    train = generate_features(data=pd_control_data, features=all_data_features, filename=gen_filename,
                              action=gen_action, updrs_subsets=gen_updrs_subsets, time=gen_time, future=gen_future,
                              milestones=gen_milestones, slopes=gen_slopes, score_name=score_name,
                              milestone_features_values=gen_milestone_features_values,
                              progress=(not predictors_action) and print_results)

    if (not predictors_action) and print_results:
        # Data diagnostics after feature generation
        mL.describe_data(data=train, describe=True, description="AFTER FEATURE GENERATION:")

    # Parameters for grid search
    grid_search_params = [{"n_estimators": [50, 150, 300, 500, 750, 1000],
                           "min_samples_split": [4, 8, 25, 50, 75, 100],
                           "min_samples_leaf": [2, 8, 15, 25, 50, 75, 100]}]

    # Algs for model -- past grid-search results kept for reference:
    # Grid search (futures): n_estimators=50, min_samples_split=75, min_samples_leaf=50
    # Futures: n_estimators=150, min_samples_split=100, min_samples_leaf=25
    # Grid search (slopes): 'min_samples_split': 75, 'n_estimators': 50, 'min_samples_leaf': 25
    # Futures: 'min_samples_leaf': 100, 'min_samples_split': 25, 'n_estimators': 50
    # Newest Futures: {'n_estimators': 500, 'min_samples_leaf': 2, 'min_samples_split': 4}
    # TRMR: {'n_estimators': 150, 'min_samples_leaf': 2, 'min_samples_split': 8}
    # Slopes: {'n_estimators': 500, 'min_samples_split': 25, 'min_samples_leaf': 2}
    # Regression forest for continuous targets; classification forest for SCORE_SLOPE
    algs = [
        RandomForestRegressor(n_estimators=500, min_samples_split=4, min_samples_leaf=2,
                              oob_score=True) if target != "SCORE_SLOPE" else RandomForestClassifier(
            n_estimators=500, min_samples_split=25, min_samples_leaf=2, oob_score=True),
        LogisticRegression(), SVC(probability=True), GaussianNB(), MultinomialNB(), BernoulliNB(),
        KNeighborsClassifier(n_neighbors=25), GradientBoostingClassifier(n_estimators=10, max_depth=3)]

    # Alg names for model
    alg_names = ["Random Forest", "Logistic Regression", "SVM", "Gaussian Naive Bayes",
                 "Multinomial Naive Bayes", "Bernoulli Naive Bayes", "kNN", "Gradient Boosting"]

    # TODO: Configure ensemble
    # Ensemble (built but currently not appended to algs -- see disabled lines below)
    ens = mL.ensemble(algs=algs, alg_names=alg_names,
                      ensemble_name="Weighted ensemble of RF, LR, SVM, GNB, KNN, and GB",
                      in_ensemble=[True, True, True, True, False, False, True, True],
                      weights=[3, 2, 1, 3, 1, 3], voting="soft")

    # Add ensemble to algs and alg_names
    # algs.append(ens["alg"])
    # alg_names.append(ens["name"])

    if predictors_action:
        # Initialize predictors as all numeric features
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        predictors = list(train.select_dtypes(include=numerics).columns.values)

        # Drop unwanted features from predictors list
        for feature in drop_predictors:
            if feature in predictors:
                predictors.remove(feature)

        # If grid search action, use grid search estimator
        if grid_search_action:
            algs[0] = mL.metrics(data=train, predictors=predictors, target=target, algs=algs, alg_names=alg_names,
                                 scoring="r2" if target != "SCORE_SLOPE" else "accuracy",
                                 grid_search_params=grid_search_params,
                                 output=True)["Grid Search Random Forest"].best_estimator_

        # NOTE(review): debug dump (filename suggests it should be deleted) -- confirm
        train[predictors + ["PATNO"]].to_csv("test_yay_delete.csv")

        # Get feature importances
        feature_importances = mL.metrics(data=train, predictors=predictors, target=target, algs=algs,
                                         alg_names=alg_names, feature_importances=[True], output=True,
                                         description=None)["Feature Importances Random Forest"]

        # Set important features as predictors (importance >= threshold)
        predictors = [x for x, y in feature_importances if y >= feature_importance_n]

        # Use predictors plus added predictors
        # NOTE(review): mutates the caller-supplied list in place -- confirm intended
        add_predictors.extend(predictors)

        # Output predictors to file
        pd.DataFrame({"predictors": predictors}).to_csv(predictors_filename, index=False)

        # Run with new predictors: recurse once with preprocess_data=False and
        # predictors_action=False so the else-branch below evaluates the selection
        run(False, cohorts, target, score_name, feature_elimination_n, gen_filename, gen_action, gen_updrs_subsets,
            gen_time, gen_future, gen_milestones, gen_milestone_features_values, gen_slopes, predictors_filename,
            False, feature_importance_n, grid_search_action, grid_search_results, print_results, results_filename,
            prediction_range, range_target, range_target_description, add_predictors, drop_predictors)
    else:
        # Get predictors from file
        predictors = add_predictors

        # Create file of training data
        train[predictors].to_csv("data/PPMI_train.csv")

        # Grid search
        if grid_search_action or grid_search_results:
            # Compute grid search
            grid_search = mL.metrics(data=train, predictors=predictors, target=target, algs=algs,
                                     alg_names=alg_names,
                                     scoring="r2" if target != "SCORE_SLOPE" else "accuracy",
                                     grid_search_params=grid_search_params, output=True)

            # If grid search action, use grid search estimator
            if grid_search_action:
                algs[0] = grid_search["Grid Search Random Forest"].best_estimator_

        # Univariate feature selection
        # mL.describe_data(data=train, univariate_feature_selection=[predictors, target])

        # Display metrics, including r2 score
        metrics = mL.metrics(data=train, predictors=predictors, target=target, algs=algs, alg_names=alg_names,
                             feature_importances=[True], base_score=[True], oob_score=[True], cross_val=[True],
                             scoring="r2", output=not print_results)
        # feature_dictionary=[data_dictionary, "FEATURE", "DSCR"])

        # Display mean absolute error score
        metrics.update(mL.metrics(data=train, predictors=predictors, target=target, algs=algs, alg_names=alg_names,
                                  cross_val=[True], scoring="mean_absolute_error", description=None,
                                  output=not print_results))

        # Display root mean squared error score
        metrics.update(mL.metrics(data=train, predictors=predictors, target=target, algs=algs, alg_names=alg_names,
                                  cross_val=[True], scoring="root_mean_squared_error", description=None,
                                  output=not print_results))

        # Default the accuracy entry; overwritten below only for classification targets
        metrics["Cross Validation accuracy Random Forest"] = None

        # Metrics for classification
        if target == "SCORE_SLOPE":
            # Display classification accuracy
            metrics.update(mL.metrics(data=train, predictors=predictors, target=target, algs=algs,
                                      alg_names=alg_names, cross_val=[True], scoring="accuracy", description=None,
                                      output=not print_results))

            # Display confusion matrix
            mL.metrics(data=train, predictors=predictors, target=target, algs=algs, alg_names=alg_names,
                       split_confusion_matrix=[True], description=None, output=not print_results)

        # If grid search results, print results
        if grid_search_results:
            print(grid_search["Grid Search String Random Forest"])

        if not print_results:
            # Write results to file: one header row of scores, then one row per
            # remaining (feature, importance) pair; appended without header
            results = pd.DataFrame(
                columns=[prediction_range, "description", "base", "oob", "r2", "mes", "rmse", "accuracy",
                         "features", "importances"])
            results.loc[0, prediction_range] = range_target
            results.loc[0, "description"] = range_target_description
            results.loc[0, "base"] = metrics["Base Score Random Forest"]
            results.loc[0, "oob"] = metrics["OOB Score Random Forest"]
            results.loc[0, "r2"] = metrics["Cross Validation r2 Random Forest"]
            results.loc[0, "mes"] = metrics["Cross Validation mean_absolute_error Random Forest"]
            results.loc[0, "rmse"] = metrics["Cross Validation root_mean_squared_error Random Forest"]
            results.loc[0, "accuracy"] = metrics["Cross Validation accuracy Random Forest"]
            feature_importances = list(metrics["Feature Importances Random Forest"])
            results.loc[0, "features"] = feature_importances[0][0]
            results.loc[0, "importances"] = feature_importances[0][1]
            for feature, importance in feature_importances[1:]:
                index = results.index.max() + 1
                results.loc[index, "features"] = feature
                results.loc[index, "importances"] = importance
            results.to_csv(results_filename, mode="a", header=False, index=False)