def write_nrmodel_data(df, model_group, launch_set_id, cause_id=None):
    """Write input data for the noise reduction model as a CSV.

    Args:
        df: DataFrame to write. When ``cause_id`` is given it must have a
            ``cause_id`` column, which is used to subset the rows.
        model_group: Passed to the output directory template.
        launch_set_id: Passed to the output file template. A value of 0
            triggers removal of previously written files first.
        cause_id: Optional cause to restrict the written rows to.

    NOTE(review): the path templates below look redacted ("FILEPATH" and
    an empty string); they are kept exactly as found.
    """
    nr_dir = CONF.get_directory('nr_process_data')
    by_cause = cause_id is not None

    if by_cause:
        outdir = "".format(
            nrdir=nr_dir, model_group=model_group, cause_id=cause_id
        )
    else:
        outdir = "FILEPATH".format(nrdir=nr_dir, model_group=model_group)
    makedirs_safely(outdir)

    # tests all use launch set id = 0, so any existing stuff should be
    # deleted
    if launch_set_id == 0:
        stale_paths = (
            "FILEPATH".format(outdir, launch_set_id),
            "FILEPATH".format(outdir, launch_set_id),
        )
        for stale_path in stale_paths:
            if os.path.exists(stale_path):
                os.unlink(stale_path)

    write_df = df.loc[df['cause_id'] == cause_id] if by_cause else df
    write_df.to_csv("FILEPATH".format(outdir, launch_set_id), index=False)
def get_limited_use_directory(source, int_cause, inj_garbage):
    """Return the limited-use input directory for a source, creating it.

    Different input directories are used for limited use vs. non-limited
    use data; only the sources listed in ``limited_use_paths`` have one.

    Args:
        source: Source name, looked up in ``limited_use_paths``.
        int_cause: Intermediate cause; becomes the ``mcod/<int_cause>``
            sub-folder.
        inj_garbage: When truthy, an ``inj_garbage`` sub-folder is appended.

    Returns:
        The directory path (created via ``makedirs_safely``), or ``None``
        when ``source`` has no limited-use directory. Previously that case
        logged the message and then failed with ``UnboundLocalError``
        because ``limited_dir`` was never assigned.
    """
    limited_use = "/ihme/limited_use"
    thesis = f"mcod/{int_cause}"
    limited_use_paths = {
        "TWN_MOH": "LIMITED_USE/PROJECT_FOLDERS/GBD/TWN/VR/",
        "MEX_INEGI": "IDENT/PROJECT_FOLDERS/MEX/MULTIPLE_CAUSES_OF_DEATH_INEGI/",
        "BRA_SIM": "LIMITED_USE/PROJECT_FOLDERS/BRA/GBD_FROM_COLLABORATORS/SIM/",
        "USA_NVSS": "LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_1989_2016_Y2019M02D27/1989_2016_CUSTOM_MORTALITY/",
    }

    if source not in limited_use_paths:
        # Nothing to create for this source: report it and let the caller
        # decide what to do instead of crashing on an unbound name.
        print_log_message(f"not using limited use directory for {source}")
        return None

    limited_dir = os.path.join(limited_use, limited_use_paths[source], thesis)
    if inj_garbage:
        limited_dir = os.path.join(limited_dir, "inj_garbage")
    makedirs_safely(limited_dir)
    return limited_dir
def plot_figure(df, int_cause, x_axis="max_depth", y_axis="mean_test_concordance"):
    """Save a faceted scatter plot of random forest grid search results.

    One panel is drawn per (criterion, n_estimators) combination, plotting
    ``y_axis`` against ``x_axis``; the figure is saved as a dated PDF.

    Args:
        df: Grid search results with ``criterion`` and ``n_estimators``
            columns plus the two plotted columns.
        int_cause: Intermediate cause, used in the output file name.
        x_axis: Column plotted on the x axis.
        y_axis: Column plotted on the y axis.
    """
    grid = sns.FacetGrid(df, row="criterion", col="n_estimators")
    grid.map(plt.scatter, x_axis, y_axis)

    plt.subplots_adjust(hspace=0.4, wspace=0.4)
    plt.ylabel(y_axis)
    plt.xlabel(x_axis)

    out_dir = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/"
    makedirs_safely(out_dir)
    out_file = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/{int_cause}_rf_{DATE}.pdf"
    plt.savefig(out_file)
def launch_training_models(self, model_name, short_name, model_param, age_group_id=None):
    """Submit one training job for a classifier / parameter combination.

    Any previous ``grid_results.pkl`` and ``summary_stats.csv`` in the
    model's write directory are removed before the job is submitted.

    Args:
        model_name: Classifier name forwarded to the worker script.
        short_name: Classifier short name; selects the memory request and
            the runtime from ``ModelLauncher.runtime_dict``.
        model_param: Parameter string identifying this model run.
        age_group_id: When truthy, outputs go in an age-specific folder.
    """
    if age_group_id:
        model_dir = f"{self.model_dir}/{age_group_id}"
        write_dir = f"{self.model_dir}/{age_group_id}/{short_name}/model_{model_param}"
        jobname = f"{short_name}_{self.int_cause}_{model_param}_{age_group_id}"
    else:
        model_dir = self.model_dir
        write_dir = f"{self.model_dir}/{short_name}/model_{model_param}"
        jobname = f"{short_name}_{self.icd_features}_{self.int_cause}_{model_param}"

    # memory request in G, by classifier short name
    memory_dict = {
        "rf": 150,
        "multi_nb": 20,
        "bernoulli_nb": 20,
        "complement_nb": 20,
        "gbt": 30,
        "xgb": 40,
        "svm": 40,
        "svm_bag": 20,
        "nn": 350,
    }

    makedirs_safely(write_dir)
    # remove previous model runs
    for stale_output in ("grid_results.pkl", "summary_stats.csv"):
        remove_if_output_exists(write_dir, stale_output)

    params = [
        write_dir, model_dir, model_param, model_name, short_name,
        self.int_cause, self.age_feature, self.dem_feature,
    ]
    worker = "/homes/agesak/thesis/analysis/run_models.py"

    memory = memory_dict[short_name]
    # the random forest for y34 gets a larger request than the default
    if self.int_cause == "y34" and short_name == "rf":
        memory = "250"

    submit_mcod(
        jobname, "python", worker,
        cores=4,
        memory=f"{memory}G",
        params=params,
        verbose=True,
        logging=True,
        jdrive=False,
        queue="long.q",
        runtime=ModelLauncher.runtime_dict[short_name],
    )
def plot_figure(df, int_cause, x_axis="learning_rate", y_axis="mean_test_concordance", maxdepth=None, nestimators=None):
    """Save a faceted scatter plot of xgb grid search results.

    One panel is drawn per (gamma, subsample) combination, plotting
    ``y_axis`` against ``x_axis``; the figure is saved as a dated PDF.

    Args:
        df: Grid search results with ``gamma`` and ``subsample`` columns
            plus the two plotted columns.
        int_cause: Intermediate cause, used in the output file name.
        x_axis: Column plotted on the x axis.
        y_axis: Column plotted on the y axis.
        maxdepth: Only used to label the output file name.
        nestimators: Only used to label the output file name.
    """
    grid = sns.FacetGrid(df, row="gamma", col="subsample")
    grid.map(plt.scatter, x_axis, y_axis)

    plt.subplots_adjust(hspace=0.4, wspace=0.4)
    plt.ylabel(y_axis)
    plt.xlabel(x_axis)

    out_dir = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/"
    makedirs_safely(out_dir)
    out_file = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/{int_cause}_xgb_nestimators_{nestimators}_maxdepth_{maxdepth}_{DATE}.pdf"
    plt.savefig(out_file)
def create_training_data(self, df, age_group_id=None):
    """Split ``df`` with ``create_train_test`` and write the pieces to CSV.

    Writes ``train_df.csv``, ``test_df.csv`` and ``int_cause_df.csv`` into
    the model directory, or into its age-specific sub-folder when
    ``age_group_id`` is truthy.

    Args:
        df: Input data handed to ``create_train_test``.
        age_group_id: Optional age group; selects the sub-folder and is
            forwarded to ``create_train_test``.
    """
    write_dir = (
        f"{self.model_dir}/{age_group_id}" if age_group_id
        else f"{self.model_dir}"
    )
    makedirs_safely(write_dir)

    split_kwargs = dict(
        test=self.test,
        int_cause=self.int_cause,
        icd_feature=self.icd_features,
        age_group_id=age_group_id,
        most_detailed=self.most_detailed_locs,
    )
    train_df, test_df, int_cause_df = create_train_test(df, **split_kwargs)

    print_log_message(f"writing train/test to df for {age_group_id}")
    train_df.to_csv(f"{write_dir}/train_df.csv", index=False)
    test_df.to_csv(f"{write_dir}/test_df.csv", index=False)
    int_cause_df.to_csv(f"{write_dir}/int_cause_df.csv", index=False)
def launch_create_testing_datasets(self, age_group_id=None):
    """Submit the jobs that create the test datasets, batch by batch.

    Dataset numbers 1..``ModelLauncher.num_datasets`` are split with
    ``chunks`` using ``ModelLauncher.numbers``. Every job of a batch is
    submitted with holds on the job ids of the batch before it; the first
    batch has no holds.

    Args:
        age_group_id: When truthy, age-specific dataset and model
            directories are used.
    """
    worker = "/homes/agesak/thesis/analysis/create_test_datasets.py"
    if age_group_id:
        dataset_dir = f"{self.dataset_dir}/{age_group_id}"
        model_dir = f"{self.model_dir}/{age_group_id}"
    else:
        dataset_dir = self.dataset_dir
        model_dir = self.model_dir
    makedirs_safely(dataset_dir)

    batches = list(
        chunks(range(1, ModelLauncher.num_datasets + 1), ModelLauncher.numbers)
    )
    dataset_dict = dict(enumerate(batches))
    holds_dict = {batch: [] for batch in dataset_dict}
    final_batch = len(batches) - 1

    for batch, datasets in dataset_dict.items():
        hold_ids = []
        for dataset_num in datasets:
            params = [
                model_dir,
                dataset_dir,
                dataset_num,
                ModelLauncher.df_size_dict[f"{self.int_cause}"],
                self.age_feature,
                self.dem_feature,
            ]
            jobname = f"{self.int_cause}_{self.icd_features}_dataset_{dataset_num}"
            jid = submit_mcod(
                jobname, "python", worker,
                cores=2,
                memory="12G",
                params=params,
                verbose=True,
                logging=True,
                jdrive=False,
                queue="long.q",
                holds=holds_dict[batch],
            )
            hold_ids.append(jid)

            # once a batch is fully submitted, the next batch waits on it
            batch_is_complete = dataset_num == datasets[-1]
            if batch_is_complete and batch != final_batch:
                holds_dict[batch + 1] = hold_ids
def main(shared_package_id, data_id, copy_version=None, skip_input_data_prep=False, vr_pull_timestamp=None, test=False):
    """Prepare the output folder for a package and fill it with input data.

    The data is either copied from an earlier version (``copy_version``
    given) or produced by ``run_proportions_prep``.

    Args:
        shared_package_id: Package id; part of the output directory and
            forwarded to ``run_proportions_prep``.
        data_id: Forwarded to ``run_proportions_prep``.
        copy_version: When not None, data is copied from this version
            instead of being prepared.
        skip_input_data_prep: Accepted but not read by this function.
        vr_pull_timestamp: Forwarded to ``run_proportions_prep``.
        test: Forwarded to ``run_proportions_prep``.
    """
    outdir = "FILEPATH".format(
        CONF.get_directory('rdp_regressions'), shared_package_id
    )
    makedirs_safely(outdir)
    makedirs_safely("FILEPATH".format(outdir))

    if copy_version is None:
        run_proportions_prep(
            shared_package_id,
            outdir,
            vr_pull_timestamp,
            data_id=data_id,
            test=test,
        )
        return

    # sometimes all you want is a new folder to run a different model
    # on the same data
    # NOTE(review): `version` is not assigned anywhere in this function;
    # unless it is a module-level global this call raises NameError.
    # Possibly it should be `shared_package_id` -- confirm before changing.
    copy_data_from_version(version, copy_version, outdir)
def choose_best_model(int_cause, nb):
    """Create a table with evaluation metrics across all classifiers.

    Reads ``model_metrics_summary.csv`` for the chosen naive Bayes variant
    plus ``nn``, ``rf`` and ``xgb``, suffixes every column except
    ``Evaluation metrics`` with the classifier's short name, outer-merges
    the tables on ``Evaluation metrics`` and writes the result to
    ``<int_cause>_model_summary.csv``.

    Args:
        int_cause: Intermediate cause; part of the input and output paths.
        nb: Short name of the naive Bayes classifier to include.
    """
    metric_col = 'Evaluation metrics'

    # One loop instead of four copy-pasted read/rename stanzas, so the path
    # template and the suffix rule are defined in exactly one place.
    summaries = []
    for short_name in [nb, "nn", "rf", "xgb"]:
        summary = pd.read_csv(
            f"/ihme/cod/prep/mcod/process_data/{int_cause}/thesis/sample_dirichlet/{DATE}/{short_name}/model_metrics_summary.csv"
        )
        suffixed = {
            col: f"{col}_{short_name}"
            for col in summary.columns if col != metric_col
        }
        summaries.append(summary.rename(columns=suffixed))

    df = reduce(
        lambda left, right: pd.merge(
            left, right, on=[metric_col], how='outer'),
        summaries)

    makedirs_safely(f"/home/j/temp/agesak/thesis/model_results/{DATE}/")
    df.to_csv(
        f"/home/j/temp/agesak/thesis/model_results/{DATE}/{int_cause}_model_summary.csv",
        index=False)
def choose_best_naive_bayes(int_cause):
    """Determine which naive Bayes classifier performed the best.

    Reads ``model_metrics_summary.csv`` for ``multi_nb``, ``complement_nb``
    and ``bernoulli_nb``, suffixes every column except
    ``Evaluation metrics`` with the classifier's short name, outer-merges
    the tables on ``Evaluation metrics`` and writes the result to
    ``<int_cause>_naivebayes_summary.csv``.

    Args:
        int_cause: Intermediate cause; part of the input and output paths.

    Returns:
        The name of the column that holds the largest value in the first
        row of the merged table, considering only columns whose name
        contains "Mean".
    """
    metric_col = 'Evaluation metrics'

    # One loop instead of three copy-pasted read/rename stanzas, so the
    # path template and the suffix rule are defined in exactly one place.
    summaries = []
    for short_name in ["multi_nb", "complement_nb", "bernoulli_nb"]:
        summary = pd.read_csv(
            f"/ihme/cod/prep/mcod/process_data/{int_cause}/thesis/sample_dirichlet/{DATE}/{short_name}/model_metrics_summary.csv"
        )
        suffixed = {
            col: f"{col}_{short_name}"
            for col in summary.columns if col != metric_col
        }
        summaries.append(summary.rename(columns=suffixed))

    df = reduce(
        lambda left, right: pd.merge(
            left, right, on=[metric_col], how='outer'),
        summaries)

    makedirs_safely(f"/home/j/temp/agesak/thesis/model_results/{DATE}/")
    df.to_csv(
        f"/home/j/temp/agesak/thesis/model_results/{DATE}/{int_cause}_naivebayes_summary.csv",
        index=False)

    mean_cols = [col for col in df.columns if "Mean" in col]
    best_model = df[mean_cols].idxmax(axis=1).iloc[0]
    return best_model
"""Write the by-cause death tables for the appendix (table 3)."""
import pandas as pd
from cod_prep.claude.claude_io import makedirs_safely

DATE = "2020_05_23_most_detailed"

for int_cause in ["x59", "y34"]:
    thesis_col = f"{int_cause}_deaths_thesis"
    gbd_col = f"{int_cause}_deaths_GBD2019"

    df = pd.read_csv(
        f"/home/j/temp/agesak/thesis/model_results/{DATE}/{DATE}_{int_cause}_nn_predictions.csv")

    # total deaths per cause for both sets of estimates
    df = df.groupby("cause_name", as_index=False).agg(
        {thesis_col: "sum", gbd_col: "sum"})
    # GBD2019 deaths are shown as whole numbers; the GBD2019 share below
    # is computed from these rounded values
    df[gbd_col] = df[gbd_col].round().astype(int)

    # each cause's share of all deaths, formatted as a percentage string
    for prop_col, deaths_col in (
            ("DNN by cause proportion", thesis_col),
            ("GBD2019 by cause proportion", gbd_col)):
        share = (df[deaths_col] / df[deaths_col].sum()) * 100
        df[prop_col] = share.round(3).astype(str) + "%"

    df = df.rename(columns={
        "cause_name": "Cause Name",
        thesis_col: f"{int_cause.upper()} DNN Deaths",
        gbd_col: f"{int_cause.upper()} GBD2019 Deaths",
    })

    makedirs_safely(f"/home/j/temp/agesak/thesis/tables/{DATE}/")
    df.to_csv(
        f"/home/j/temp/agesak/thesis/tables/{DATE}/{int_cause}_cause_table.csv",
        index=False)
def launch_testing_models(self, model_name, short_name, best_model_params, age_group_id=None):
    """Submit prediction jobs of the best model for every test dataset.

    For each dataset number, earlier ``dataset_<n>_summary_stats.csv`` and
    ``dataset_<n>_predictions.csv`` outputs are removed and one job is
    submitted. Jobs of a batch hold on the job ids of the batch before it.

    Args:
        model_name: Classifier name, used in the job name.
        short_name: Classifier short name; selects the memory request and
            the output sub-folder.
        best_model_params: Parameter string of the best model.
        age_group_id: When truthy, age-specific directories are used.
    """
    if age_group_id:
        dataset_dir = f"{self.dataset_dir}/{age_group_id}"
        best_model_dir = f"{self.model_dir}/{age_group_id}/{short_name}/model_{best_model_params}"
        testing_model_dir = f"{dataset_dir}/{short_name}"
    else:
        dataset_dir = self.dataset_dir
        best_model_dir = f"{self.model_dir}/{short_name}/model_{best_model_params}"
        testing_model_dir = f"{self.dataset_dir}/{short_name}"
    makedirs_safely(testing_model_dir)

    worker = "/homes/agesak/thesis/analysis/run_testing_predictions.py"
    # memory request in G, by classifier short name
    memory_dict = {
        "rf": 120,
        "multi_nb": 30,
        "bernoulli_nb": 30,
        "complement_nb": 30,
        "xgb": 40,
        "svm": 40,
        "svm_bag": 20,
        "nn": 50,
    }

    batches = list(
        chunks(range(1, ModelLauncher.num_datasets + 1),
               int(ModelLauncher.num_datasets))
    )
    dataset_dict = dict(enumerate(batches))
    # to just launch a few (in one batch)
    # numbers = [29]
    # dataset_dict = {}
    # dataset_dict[0] = numbers
    holds_dict = {batch: [] for batch in dataset_dict}
    final_batch = len(batches) - 1

    for batch, datasets in dataset_dict.items():
        hold_ids = []
        for dataset_num in datasets:
            for stale_output in (
                    f"dataset_{dataset_num}_summary_stats.csv",
                    f"dataset_{dataset_num}_predictions.csv"):
                remove_if_output_exists(testing_model_dir, stale_output)

            params = [
                best_model_dir, dataset_dir, testing_model_dir,
                best_model_params, self.int_cause, dataset_num,
                self.age_feature, self.dem_feature,
            ]
            jobname = f"{model_name}_{self.int_cause}_predictions_dataset_{dataset_num}_{best_model_params}_{self.icd_features}"

            memory = memory_dict[short_name]
            # the neural network for y34 gets a larger request
            if self.int_cause == "y34" and short_name == "nn":
                memory = 150

            jid = submit_mcod(
                jobname, "python", worker,
                cores=4,
                memory=f"{memory}G",
                params=params,
                verbose=True,
                logging=True,
                jdrive=False,
                queue="long.q",
                holds=holds_dict[batch],
            )
            hold_ids.append(jid)

            # once a batch is fully submitted, the next batch waits on it
            batch_is_complete = dataset_num == datasets[-1]
            if batch_is_complete and batch != final_batch:
                holds_dict[batch + 1] = hold_ids
# merge on 2019 results # df = df.merge(rd, on=["age_group_id", "sex_id", "location_id", "year_id", "cause_id"], how="left") df = df.merge(rd, on=[ "age_group_id", "sex_id", "location_id", "year_id", "cause_id" ], how="outer") df.rename(columns={ "prop": "prop_thesis", f"{int_cause}": f"{int_cause}_deaths_thesis" }, inplace=True) df = pretty_print(df) df = df.fillna(0) makedirs_safely(f"/home/j/temp/agesak/thesis/model_results/{DATE}") # redistribution number and proportions by a/s/y/country df.to_csv( f"/home/j/temp/agesak/thesis/model_results/{DATE}/{DATE}_{int_cause}_{short_name}_predictions.csv", index=False) # evaluation metrics across 500 test datasets for int_cause in ["x59", "y34"]: choose_best_model(int_cause, nb=model_dict[int_cause]) # for aggregating into 1 csv for tableau for int_cause in ["x59", "y34"]: dfs = [] for short_name in ["rf", "nn", "xgb"]: df = pd.read_csv( f"/home/j/temp/agesak/thesis/model_results/{DATE}/{DATE}_{int_cause}_{short_name}_predictions.csv"
def main(data_dir, predicted_test_dir, int_cause, short_name, model_name, age_feature, dem_feature):
    """Refit the best classifier on all observed data and predict the rest.

    Steps:
      1. (Currently commented out for quick runs) summarize evaluation
         metrics across the test datasets.
      2. Refit the classifier on the concatenated train and test data.
      3. Predict on the unobserved data (``int_cause_df.csv``) and write
         the predictions and the fitted model to ``predicted_test_dir``.

    Args:
        data_dir: Directory holding ``train_df.csv``, ``test_df.csv`` and
            ``int_cause_df.csv``.
        predicted_test_dir: Output directory for ``model_predictions.csv``
            and ``model_fit.pkl``.
        int_cause: Intermediate cause; also the name of a column read from
            each input file.
        short_name: Classifier short name; selects the columns of the
            parameter map, and ``"nn"`` switches to the Keras code path.
        model_name: Classifier name. For non-"nn" models it is passed to
            ``eval`` to obtain the class, so it must be a trusted value.
        age_feature: When truthy, ``cause_age_info`` is the feature column.
        dem_feature: When truthy (and ``age_feature`` is not),
            ``dem_info`` is the feature column; otherwise ``cause_info``.
    """
    # determine the model's feature vector
    if age_feature:
        x_col = "cause_age_info"
    elif dem_feature:
        x_col = "dem_info"
    else:
        x_col = "cause_info"

    ## comment out for quick run
    ## summaries = read_in_summary_stats(predicted_test_dir)
    ## comment out for quick run
    ## summarize evaluation metrics across the datasets
    ## aggregate_evaluation_metrics(summaries, predicted_test_dir)

    # read in test df
    test_df = pd.read_csv(
        f"{data_dir}/test_df.csv")[DEM_COLS + ["cause_id", f"{x_col}", f"{int_cause}"]]
    # read in train df
    train_df = pd.read_csv(
        f"{data_dir}/train_df.csv")[DEM_COLS + ["cause_id", f"{x_col}", f"{int_cause}"]]
    print_log_message("read in train and test")
    # concat train/test to refit a model on all the observed data
    df = pd.concat([train_df, test_df], sort=True, ignore_index=True)

    print_log_message("reading in params df")
    param_df = pd.read_csv("/homes/agesak/thesis/maps/parameters.csv")
    # keep only the columns of the parameter map that belong to this model
    param_df = param_df[[x for x in list(param_df) if short_name in x]]
    param_df[f"{short_name}"] = param_df[f"{short_name}"].str.replace(
        "clf__estimator__", "")

    ## comment out for quick run
    ## params = summaries.best_model_params.iloc[0]
    ## add for quick run
    params = get_best_fit(data_dir, short_name)

    # format best params to feed to classifier
    # (a string holds several values joined by "_"; anything else is a
    # single value)
    if isinstance(params, six.string_types):
        best_params = params.split("_")
    else:
        best_params = [params]
    # pair parameter names (first column of param_df) with the values, in
    # order
    param_kwargs = dict(zip(param_df.iloc[:, 0], best_params))

    if short_name == "nn":
        # these feed into create_neural_network
        hidden_nodes_1 = int(param_kwargs["hidden_nodes_1"])
        hidden_layers = int(param_kwargs["hidden_layers"])
        hidden_nodes_2 = int(param_kwargs["hidden_nodes_2"])
        # parameters with clf__ are only fed to keras classifier
        param_kwargs = {k: v for k, v in param_kwargs.items() if "clf__" in k}

    # ensure column dtypes are correct
    measure_dict = {"int": int, "float": float, "str": str}
    # `value` is unused; existing keys are reassigned in place, which does
    # not change the dict's size during iteration
    for key, value in param_kwargs.items():
        dtype = param_df.loc[param_df[
            f"{short_name}"] == key, f"{short_name}_dtype"].iloc[0]
        param_kwargs[key] = measure_dict[dtype](param_kwargs[key])

    # run Neural network separately because classifier
    # takes secondary arguments related to build
    if short_name == "nn":
        param_kwargs = {k.replace("clf__", ""): v for k,
                        v in param_kwargs.items() if "clf__" in k}
        cv = CountVectorizer(lowercase=False, token_pattern=r"(?u)\b\w+\b")
        tf = cv.fit_transform(df[f"{x_col}"])
        print_log_message("converting to dense matrix")
        tf = tf.todense()
        # just hard code classifer name because this only works for keras
        model = KerasClassifier(build_fn=create_neural_network,
                                output_nodes=len(
                                    df.cause_id.unique()),
                                hidden_layers=hidden_layers,
                                hidden_nodes_1=hidden_nodes_1,
                                hidden_nodes_2=hidden_nodes_2, **param_kwargs)
        print_log_message("fitting KerasClassifier")
        # NOTE(review): the same param_kwargs are passed to both the
        # constructor above and fit() here -- confirm this is intended.
        model.fit(tf, df["cause_id"].values, **param_kwargs)
    else:
        # refit all other classifiers
        cv = CountVectorizer(lowercase=False)
        tf = cv.fit_transform(df[f"{x_col}"])
        print_log_message(f"fitting {model_name}")
        # NOTE(review): eval() on model_name executes arbitrary code; only
        # safe while model_name comes from trusted launcher code.
        model = eval(model_name)(**param_kwargs).fit(tf, df["cause_id"])

    # now predict on the unobserved data
    print_log_message("reading in unobserved_df")
    unobserved_df = pd.read_csv(
        f"{data_dir}/int_cause_df.csv")[DEM_COLS + ["cause_id", f"{x_col}", f"{int_cause}"]]
    # reuse the vectorizer fitted on the observed data
    new_counts = cv.transform(unobserved_df[f"{x_col}"])
    if short_name == "nn":
        print_log_message("converting unobserved data to dense matrix")
        new_counts = new_counts.todense()
    unobserved_df["predictions"] = model.predict(new_counts)

    ## add for quick run
    makedirs_safely(predicted_test_dir)
    print_log_message("writing to df")
    # written without index=False, so the row index is included in the CSV
    unobserved_df.to_csv(f"{predicted_test_dir}/model_predictions.csv")
    joblib.dump(
        model, f"{predicted_test_dir}/model_fit.pkl")
    print_log_message("wrote model fit")