Example #1
0
def write_nrmodel_data(df, model_group, launch_set_id, cause_id=None):
    """Write the noise reduction model's input data to a CSV file.

    The output directory is produced by formatting a path template with the
    'nr_process_data' directory, ``model_group`` and (when given)
    ``cause_id``; it is created before anything is written.

    Args:
        df: data to write; must have a 'cause_id' column when ``cause_id``
            is given.
        model_group: value passed to the output directory template.
        launch_set_id: value passed to the output file template. The value
            0 additionally triggers removal of previously written files.
        cause_id: when not None, only rows with this cause id are written.
    """
    by_cause = cause_id is not None
    nr_dir = CONF.get_directory('nr_process_data')
    if by_cause:
        outdir = "".format(
            nrdir=nr_dir, model_group=model_group, cause_id=cause_id
        )
    else:
        outdir = "FILEPATH".format(
            nrdir=nr_dir, model_group=model_group
        )
    makedirs_safely(outdir)

    # launch set id 0 is what the tests use, so leftovers from an earlier
    # test run are removed before writing
    if launch_set_id == 0:
        stale_paths = (
            "FILEPATH".format(outdir, launch_set_id),
            "FILEPATH".format(outdir, launch_set_id),
        )
        for stale_path in stale_paths:
            if os.path.exists(stale_path):
                os.unlink(stale_path)

    write_df = df.loc[df['cause_id'] == cause_id] if by_cause else df
    out_path = "FILEPATH".format(outdir, launch_set_id)
    write_df.to_csv(out_path, index=False)
Example #2
0
def get_limited_use_directory(source, int_cause, inj_garbage):
    """Return (and create) the limited-use input directory for a source.

    Args:
        source: data source name (e.g. "TWN_MOH"). Only the sources listed
            in the mapping below have a limited-use directory.
        int_cause: value interpolated into the trailing "mcod/<int_cause>"
            path component.
        inj_garbage: when truthy, an "inj_garbage" sub-directory is
            appended to the path.

    Returns:
        The directory path (created via ``makedirs_safely``), or None when
        ``source`` has no limited-use directory.
    """
    limited_use = "/ihme/limited_use"
    thesis = f"mcod/{int_cause}"

    limited_use_paths = {
        "TWN_MOH":
        "LIMITED_USE/PROJECT_FOLDERS/GBD/TWN/VR/",
        "MEX_INEGI":
        "IDENT/PROJECT_FOLDERS/MEX/MULTIPLE_CAUSES_OF_DEATH_INEGI/",
        "BRA_SIM":
        "LIMITED_USE/PROJECT_FOLDERS/BRA/GBD_FROM_COLLABORATORS/SIM/",
        "USA_NVSS":
        "LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_1989_2016_Y2019M02D27/1989_2016_CUSTOM_MORTALITY/"
    }

    if source not in limited_use_paths:
        # This path used to fall through to makedirs_safely(limited_dir)
        # with limited_dir unbound, raising UnboundLocalError right after
        # logging that no directory is used. Return None explicitly instead.
        print_log_message(f"not using limited use directory for {source}")
        return None

    limited_dir = os.path.join(limited_use, limited_use_paths[source], thesis)
    if inj_garbage:
        limited_dir = os.path.join(limited_dir, "inj_garbage")

    makedirs_safely(limited_dir)

    return limited_dir
Example #3
0
def plot_figure(df, int_cause, x_axis="max_depth", y_axis="mean_test_concordance"):
    """Save a grid of scatter plots of ``y_axis`` against ``x_axis`` as a PDF.

    The grid has one row per "criterion" value and one column per
    "n_estimators" value in ``df``. The output directory is created first.
    """
    out_dir = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/"
    out_file = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/{int_cause}_rf_{DATE}.pdf"

    grid = sns.FacetGrid(df, row="criterion", col="n_estimators")
    grid.map(plt.scatter, x_axis, y_axis)
    plt.subplots_adjust(hspace=0.4, wspace=0.4)
    plt.ylabel(y_axis)
    plt.xlabel(x_axis)

    makedirs_safely(out_dir)
    plt.savefig(out_file)
Example #4
0
    def launch_training_models(self,
                               model_name,
                               short_name,
                               model_param,
                               age_group_id=None):
        """Submit the run_models.py worker for one model / parameter set.

        The output directory for the run is created and any existing
        "grid_results.pkl" / "summary_stats.csv" in it are removed before
        the job is submitted through ``submit_mcod``.

        Args:
            model_name: passed through to the worker.
            short_name: short model key; selects the memory request and the
                runtime (``ModelLauncher.runtime_dict``) and is part of the
                output path and job name.
            model_param: parameter identifier; part of the output path and
                job name.
            age_group_id: when truthy, paths are nested under this age
                group's sub-directory of ``self.model_dir``.
        """
        if age_group_id:
            model_dir = f"{self.model_dir}/{age_group_id}"
            write_dir = f"{self.model_dir}/{age_group_id}/{short_name}/model_{model_param}"
            jobname = f"{short_name}_{self.int_cause}_{model_param}_{age_group_id}"
        else:
            model_dir = self.model_dir
            write_dir = f"{self.model_dir}/{short_name}/model_{model_param}"
            jobname = f"{short_name}_{self.icd_features}_{self.int_cause}_{model_param}"

        # memory request per model type; "G" is appended at submission
        memory_by_model = {
            "rf": 150,
            "multi_nb": 20,
            "bernoulli_nb": 20,
            "complement_nb": 20,
            "gbt": 30,
            "xgb": 40,
            "svm": 40,
            "svm_bag": 20,
            "nn": 350
        }

        makedirs_safely(write_dir)
        # clear the outputs of any previous run of this model
        for stale_output in ("grid_results.pkl", "summary_stats.csv"):
            remove_if_output_exists(write_dir, stale_output)

        params = [
            write_dir, model_dir, model_param, model_name, short_name,
            self.int_cause, self.age_feature, self.dem_feature
        ]
        worker = "/homes/agesak/thesis/analysis/run_models.py"

        memory = memory_by_model[short_name]
        # random forest on y34 gets a larger request than the default
        if self.int_cause == "y34" and short_name == "rf":
            memory = "250"

        submit_mcod(jobname,
                    "python",
                    worker,
                    cores=4,
                    memory=f"{memory}G",
                    params=params,
                    verbose=True,
                    logging=True,
                    jdrive=False,
                    queue="long.q",
                    runtime=ModelLauncher.runtime_dict[short_name])
Example #5
0
def plot_figure(df,
                int_cause,
                x_axis="learning_rate",
                y_axis="mean_test_concordance",
                maxdepth=None,
                nestimators=None):
    """Save a grid of scatter plots of ``y_axis`` against ``x_axis`` as a PDF.

    The grid has one row per "gamma" value and one column per "subsample"
    value in ``df``. ``maxdepth`` and ``nestimators`` are only used to build
    the output file name. The output directory is created first.
    """
    out_dir = f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/"
    out_file = (
        f"/home/j/temp/agesak/thesis/gridsearch_plots/{DATE}/{int_cause}_xgb_nestimators_{nestimators}_maxdepth_{maxdepth}_{DATE}.pdf"
    )

    grid = sns.FacetGrid(df, row="gamma", col="subsample")
    grid.map(plt.scatter, x_axis, y_axis)
    plt.subplots_adjust(hspace=0.4, wspace=0.4)
    plt.ylabel(y_axis)
    plt.xlabel(x_axis)

    makedirs_safely(out_dir)
    plt.savefig(out_file)
Example #6
0
 def create_training_data(self, df, age_group_id=None):
     """Split ``df`` with ``create_train_test`` and write the pieces to CSV.

     Three files are written without an index: train_df.csv, test_df.csv
     and int_cause_df.csv. They go to ``self.model_dir``, or to its
     ``age_group_id`` sub-directory when ``age_group_id`` is truthy.
     """
     write_dir = (f"{self.model_dir}/{age_group_id}"
                  if age_group_id else f"{self.model_dir}")
     makedirs_safely(write_dir)

     split_kwargs = dict(
         test=self.test,
         int_cause=self.int_cause,
         icd_feature=self.icd_features,
         age_group_id=age_group_id,
         most_detailed=self.most_detailed_locs)
     train_df, test_df, int_cause_df = create_train_test(df, **split_kwargs)

     print_log_message(f"writing train/test to df for {age_group_id}")
     outputs = (
         (train_df, f"{write_dir}/train_df.csv"),
         (test_df, f"{write_dir}/test_df.csv"),
         (int_cause_df, f"{write_dir}/int_cause_df.csv"),
     )
     for frame, out_path in outputs:
         frame.to_csv(out_path, index=False)
Example #7
0
    def launch_create_testing_datasets(self, age_group_id=None):
        """Submit one create_test_datasets.py job per test dataset, in batches.

        Dataset numbers 1..``ModelLauncher.num_datasets`` are split with
        ``chunks`` into batches of ``ModelLauncher.numbers``. Every job of a
        batch is submitted with holds on the job ids of the previous batch;
        the first batch has no holds.

        Args:
            age_group_id: when truthy, the dataset and model directories are
                the ``age_group_id`` sub-directories of ``self.dataset_dir``
                and ``self.model_dir``.
        """
        worker = "/homes/agesak/thesis/analysis/create_test_datasets.py"
        if age_group_id:
            dataset_dir = f"{self.dataset_dir}/{age_group_id}"
            model_dir = f"{self.model_dir}/{age_group_id}"
        else:
            dataset_dir, model_dir = self.dataset_dir, self.model_dir
        makedirs_safely(dataset_dir)

        batches = list(
            chunks(range(1, ModelLauncher.num_datasets + 1),
                   ModelLauncher.numbers))
        final_batch = len(batches) - 1
        # batch index -> job ids that the batch's jobs must wait for
        holds_dict = {batch: [] for batch in range(len(batches))}

        for batch, datasets in enumerate(batches):
            hold_ids = []
            for dataset_num in datasets:
                params = [
                    model_dir, dataset_dir, dataset_num,
                    ModelLauncher.df_size_dict[f"{self.int_cause}"],
                    self.age_feature, self.dem_feature
                ]
                jobname = f"{self.int_cause}_{self.icd_features}_dataset_{dataset_num}"
                jid = submit_mcod(jobname,
                                  "python",
                                  worker,
                                  cores=2,
                                  memory="12G",
                                  params=params,
                                  verbose=True,
                                  logging=True,
                                  jdrive=False,
                                  queue="long.q",
                                  holds=holds_dict[batch])
                hold_ids.append(jid)
                # once the batch is fully submitted, make the next batch
                # wait on all of its job ids
                if dataset_num == datasets[-1] and batch != final_batch:
                    holds_dict[batch + 1] = hold_ids
def main(shared_package_id, data_id, copy_version=None,
         skip_input_data_prep=False, vr_pull_timestamp=None,
         test=False):
    """Prepare the output folder for a package and fill it with input data.

    The folder is created under the 'rdp_regressions' directory. When
    ``copy_version`` is given, data is copied from that version; otherwise
    ``run_proportions_prep`` is run for the package.

    ``skip_input_data_prep`` is accepted but not used in this function.
    """
    rdp_reg_dir = CONF.get_directory('rdp_regressions')
    outdir = "FILEPATH".format(rdp_reg_dir, shared_package_id)

    makedirs_safely(outdir)
    makedirs_safely("FILEPATH".format(outdir))

    if copy_version is None:
        run_proportions_prep(
            shared_package_id, outdir,
            vr_pull_timestamp, data_id=data_id, test=test
        )
        return

    # sometimes all that is wanted is a new folder to run a different model
    # on the same data
    # NOTE(review): `version` is not defined in this function; unless it is
    # a module-level name this call raises NameError - confirm the intended
    # value.
    copy_data_from_version(version, copy_version, outdir)
Example #9
0
def choose_best_model(int_cause, nb):
    """Write one table of evaluation metrics across all classifiers.

    Reads model_metrics_summary.csv for ``nb``, "nn", "rf" and "xgb",
    suffixes every column except 'Evaluation metrics' with the classifier's
    short name, outer-merges the tables on 'Evaluation metrics' and writes
    the result to ``{int_cause}_model_summary.csv``.
    """
    metric_col = 'Evaluation metrics'

    def add_suffix(frame, suffix):
        # tag every column but the merge key with the classifier name
        return frame.rename(
            columns=lambda col: col if col == metric_col else col + suffix)

    summaries = []
    for short_name in [nb, "nn", "rf", "xgb"]:
        metrics = pd.read_csv(
            f"/ihme/cod/prep/mcod/process_data/{int_cause}/thesis/sample_dirichlet/{DATE}/{short_name}/model_metrics_summary.csv"
        )
        summaries.append(add_suffix(metrics, f"_{short_name}"))

    df = summaries[0]
    for other in summaries[1:]:
        df = pd.merge(df, other, on=[metric_col], how='outer')

    out_dir = f"/home/j/temp/agesak/thesis/model_results/{DATE}/"
    makedirs_safely(out_dir)
    df.to_csv(
        f"/home/j/temp/agesak/thesis/model_results/{DATE}/{int_cause}_model_summary.csv",
        index=False)
Example #10
0
def choose_best_naive_bayes(int_cause):
    """Compare the naive bayes classifiers and return the best column name.

    Reads model_metrics_summary.csv for "multi_nb", "complement_nb" and
    "bernoulli_nb", suffixes every column except 'Evaluation metrics' with
    the classifier's short name, outer-merges the tables on
    'Evaluation metrics' and writes ``{int_cause}_naivebayes_summary.csv``.

    Returns:
        Among the merged columns whose name contains "Mean", the name of
        the one holding the largest value in the first row.
    """
    metric_col = 'Evaluation metrics'

    def add_suffix(frame, suffix):
        # tag every column but the merge key with the classifier name
        return frame.rename(
            columns=lambda col: col if col == metric_col else col + suffix)

    summaries = []
    for short_name in ["multi_nb", "complement_nb", "bernoulli_nb"]:
        metrics = pd.read_csv(
            f"/ihme/cod/prep/mcod/process_data/{int_cause}/thesis/sample_dirichlet/{DATE}/{short_name}/model_metrics_summary.csv"
        )
        summaries.append(add_suffix(metrics, f"_{short_name}"))

    df = summaries[0]
    for other in summaries[1:]:
        df = pd.merge(df, other, on=[metric_col], how='outer')

    out_dir = f"/home/j/temp/agesak/thesis/model_results/{DATE}/"
    makedirs_safely(out_dir)
    df.to_csv(
        f"/home/j/temp/agesak/thesis/model_results/{DATE}/{int_cause}_naivebayes_summary.csv",
        index=False)

    mean_cols = [col for col in list(df) if "Mean" in col]
    best_model = df[mean_cols].idxmax(axis=1).iloc[0]
    return best_model
Example #11
0
"""plots by cause for appendix (table 3)"""

import pandas as pd
from cod_prep.claude.claude_io import makedirs_safely

DATE = "2020_05_23_most_detailed"

for int_cause in ["x59", "y34"]:
    thesis_col = f"{int_cause}_deaths_thesis"
    gbd_col = f"{int_cause}_deaths_GBD2019"

    df = pd.read_csv(
        f"/home/j/temp/agesak/thesis/model_results/{DATE}/{DATE}_{int_cause}_nn_predictions.csv")

    # total deaths per cause for each of the two death columns
    df = df.groupby("cause_name", as_index=False).agg(
        {thesis_col: "sum", gbd_col: "sum"})
    df[gbd_col] = df[gbd_col].round().astype(int)

    # each cause's share of all deaths, as a percentage string with three
    # decimals (the GBD2019 share uses the rounded counts from above)
    dnn_pct = (df[thesis_col] / df[thesis_col].sum()) * 100
    gbd_pct = (df[gbd_col] / df[gbd_col].sum()) * 100
    df["DNN by cause proportion"] = dnn_pct.round(3).astype(str) + "%"
    df["GBD2019 by cause proportion"] = gbd_pct.round(3).astype(str) + "%"

    # presentation-ready column headers
    df = df.rename(columns={
        "cause_name": "Cause Name",
        thesis_col: f"{int_cause.upper()} DNN Deaths",
        gbd_col: f"{int_cause.upper()} GBD2019 Deaths",
    })

    makedirs_safely(f"/home/j/temp/agesak/thesis/tables/{DATE}/")
    df.to_csv(
        f"/home/j/temp/agesak/thesis/tables/{DATE}/{int_cause}_cause_table.csv", index=False)
Example #12
0
    def launch_testing_models(self,
                              model_name,
                              short_name,
                              best_model_params,
                              age_group_id=None):
        """Submit one run_testing_predictions.py job per test dataset.

        Dataset numbers 1..``ModelLauncher.num_datasets`` are split with
        ``chunks`` using a chunk size of ``ModelLauncher.num_datasets``.
        Every job of a batch is submitted with holds on the job ids of the
        previous batch. Before each submission that dataset's existing
        summary-stats and predictions files are removed from the testing
        directory.

        Args:
            model_name: used in the job name.
            short_name: short model key; selects the memory request and
                names the testing sub-directory.
            best_model_params: parameter identifier of the fitted model to
                use; part of the model path and job name.
            age_group_id: when truthy, model and dataset directories are the
                ``age_group_id`` sub-directories.
        """
        if age_group_id:
            best_model_dir = f"{self.model_dir}/{age_group_id}/{short_name}/model_{best_model_params}"
            dataset_dir = f"{self.dataset_dir}/{age_group_id}"
        else:
            best_model_dir = f"{self.model_dir}/{short_name}/model_{best_model_params}"
            dataset_dir = self.dataset_dir
        testing_model_dir = f"{dataset_dir}/{short_name}"

        makedirs_safely(testing_model_dir)
        worker = "/homes/agesak/thesis/analysis/run_testing_predictions.py"
        # memory request per model type; "G" is appended at submission
        memory_by_model = {
            "rf": 120,
            "multi_nb": 30,
            "bernoulli_nb": 30,
            "complement_nb": 30,
            "xgb": 40,
            "svm": 40,
            "svm_bag": 20,
            "nn": 50
        }

        batches = list(
            chunks(range(1, ModelLauncher.num_datasets + 1),
                   int(ModelLauncher.num_datasets)))
        final_batch = len(batches) - 1
        # batch index -> job ids that the batch's jobs must wait for
        holds_dict = {batch: [] for batch in range(len(batches))}

        for batch, datasets in enumerate(batches):
            hold_ids = []
            for dataset_num in datasets:
                # drop this dataset's outputs from any previous run
                stale_outputs = (
                    f"dataset_{dataset_num}_summary_stats.csv",
                    f"dataset_{dataset_num}_predictions.csv",
                )
                for stale_output in stale_outputs:
                    remove_if_output_exists(testing_model_dir, stale_output)

                params = [
                    best_model_dir, dataset_dir, testing_model_dir,
                    best_model_params, self.int_cause, dataset_num,
                    self.age_feature, self.dem_feature
                ]
                jobname = f"{model_name}_{self.int_cause}_predictions_dataset_{dataset_num}_{best_model_params}_{self.icd_features}"

                memory = memory_by_model[short_name]
                # the neural network on y34 gets a larger request
                if self.int_cause == "y34" and short_name == "nn":
                    memory = 150

                jid = submit_mcod(jobname,
                                  "python",
                                  worker,
                                  cores=4,
                                  memory=f"{memory}G",
                                  params=params,
                                  verbose=True,
                                  logging=True,
                                  jdrive=False,
                                  queue="long.q",
                                  holds=holds_dict[batch])
                hold_ids.append(jid)
                # once the batch is fully submitted, make the next batch
                # wait on all of its job ids
                if dataset_num == datasets[-1] and batch != final_batch:
                    holds_dict[batch + 1] = hold_ids
Example #13
0
        # merge on 2019 results
        # df = df.merge(rd, on=["age_group_id", "sex_id", "location_id", "year_id", "cause_id"], how="left")
        df = df.merge(rd,
                      on=[
                          "age_group_id", "sex_id", "location_id", "year_id",
                          "cause_id"
                      ],
                      how="outer")
        df.rename(columns={
            "prop": "prop_thesis",
            f"{int_cause}": f"{int_cause}_deaths_thesis"
        },
                  inplace=True)
        df = pretty_print(df)
        df = df.fillna(0)
        makedirs_safely(f"/home/j/temp/agesak/thesis/model_results/{DATE}")
        # redistribution number and proportions by a/s/y/country
        df.to_csv(
            f"/home/j/temp/agesak/thesis/model_results/{DATE}/{DATE}_{int_cause}_{short_name}_predictions.csv",
            index=False)

# Build the cross-classifier evaluation-metrics table (metrics across the
# 500 test datasets) for each cause, passing the cause's model_dict entry
# as `nb`.
for int_cause in ("x59", "y34"):
    choose_best_model(int_cause, nb=model_dict[int_cause])

# for aggregating into 1 csv for tableau
for int_cause in ["x59", "y34"]:
    dfs = []
    for short_name in ["rf", "nn", "xgb"]:
        df = pd.read_csv(
            f"/home/j/temp/agesak/thesis/model_results/{DATE}/{DATE}_{int_cause}_{short_name}_predictions.csv"
Example #14
0
def main(data_dir, predicted_test_dir, int_cause, short_name,
         model_name, age_feature, dem_feature):
    """Refit the chosen classifier on all observed data and predict on the
    unobserved data.

    In this "quick run" variant the steps that summarize evaluation metrics
    across the test datasets are commented out, and the best parameters
    come from ``get_best_fit`` instead.

    Args:
        data_dir: directory holding test_df.csv, train_df.csv and
            int_cause_df.csv; also passed to ``get_best_fit``.
        predicted_test_dir: output directory (created if needed) that
            receives model_predictions.csv and model_fit.pkl.
        int_cause: name of a column read from each input CSV.
        short_name: short model key; selects the matching columns of
            parameters.csv, and "nn" selects the Keras code path.
        model_name: for non-"nn" models, an expression that ``eval`` turns
            into the classifier class.
        age_feature: when truthy, "cause_age_info" is the feature column.
        dem_feature: when truthy (and ``age_feature`` is falsy), "dem_info"
            is the feature column; otherwise "cause_info" is used.
    """

    # determine the model's feature vector
    if age_feature:
        x_col = "cause_age_info"
    elif dem_feature:
        x_col = "dem_info"
    else:
        x_col = "cause_info"

    ## comment out for quick run
    ## summaries = read_in_summary_stats(predicted_test_dir)

    ## comment out for quick run
    ## summarize evaluation metrics across the datasets
    ## aggregate_evaluation_metrics(summaries, predicted_test_dir)

    # read in test df (only the demographic, cause, feature and int_cause
    # columns are kept)
    test_df = pd.read_csv(
        f"{data_dir}/test_df.csv")[DEM_COLS + ["cause_id",
                                               f"{x_col}",
                                               f"{int_cause}"]]
    # read in train df (same column subset)
    train_df = pd.read_csv(
        f"{data_dir}/train_df.csv")[DEM_COLS + ["cause_id",
                                                f"{x_col}",
                                                f"{int_cause}"]]
    print_log_message("read in train and test")
    # concat train/test to refit a model on all the observed data
    df = pd.concat([train_df, test_df], sort=True, ignore_index=True)

    print_log_message("reading in params df")
    param_df = pd.read_csv("/homes/agesak/thesis/maps/parameters.csv")
    # keep only the columns whose name contains this model's short name
    param_df = param_df[[x for x in list(param_df) if short_name in x]]
    # strip the "clf__estimator__" prefix from the parameter names
    param_df[f"{short_name}"] = param_df[f"{short_name}"].str.replace(
        "clf__estimator__", "")
    ## comment out for quick run
    ## params = summaries.best_model_params.iloc[0]
    ## add for quick run
    params = get_best_fit(data_dir, short_name)

    # format best params to feed to classifier: a string is split on "_"
    # into one value per parameter, anything else is a single value
    if isinstance(params, six.string_types):
        best_params = params.split("_")
    else:
        best_params = [params]

    # pair parameter names (first column of param_df) with the best values,
    # in order
    param_kwargs = dict(zip(param_df.iloc[:, 0], best_params))
    if short_name == "nn":
        # these feed into create_neural_network
        hidden_nodes_1 = int(param_kwargs["hidden_nodes_1"])
        hidden_layers = int(param_kwargs["hidden_layers"])
        hidden_nodes_2 = int(param_kwargs["hidden_nodes_2"])
        # parameters with clf__ are only fed to keras classifier
        param_kwargs = {k: v for k, v in param_kwargs.items() if "clf__" in k}

    # ensure column dtypes are correct: cast each value with the type named
    # in the "<short_name>_dtype" column of param_df
    measure_dict = {"int": int, "float": float, "str": str}
    for key, value in param_kwargs.items():
        dtype = param_df.loc[param_df[
            f"{short_name}"] == key, f"{short_name}_dtype"].iloc[0]
        param_kwargs[key] = measure_dict[dtype](param_kwargs[key])

    # run Neural network separately because classifier
    # takes secondary arguments related to build
    if short_name == "nn":
        # drop the "clf__" prefix so the names can be passed as keywords
        param_kwargs = {k.replace("clf__", ""): v for k,
                        v in param_kwargs.items() if "clf__" in k}
        cv = CountVectorizer(lowercase=False, token_pattern=r"(?u)\b\w+\b")
        tf = cv.fit_transform(df[f"{x_col}"])
        print_log_message("converting to dense matrix")
        tf = tf.todense()
        # just hard code classifier name because this only works for keras
        model = KerasClassifier(build_fn=create_neural_network,
                                output_nodes=len(
                                    df.cause_id.unique()),
                                hidden_layers=hidden_layers,
                                hidden_nodes_1=hidden_nodes_1,
                                hidden_nodes_2=hidden_nodes_2, **param_kwargs)
        print_log_message("fitting KerasClassifier")
        # the same keyword arguments are passed to both the constructor
        # above and fit() here
        model.fit(tf, df["cause_id"].values, **param_kwargs)
    else:
        # refit all other classifiers
        cv = CountVectorizer(lowercase=False)
        tf = cv.fit_transform(df[f"{x_col}"])
        print_log_message(f"fitting {model_name}")
        # NOTE(review): eval() executes model_name as code; this is only
        # safe if model_name always comes from a trusted caller - confirm.
        model = eval(model_name)(**param_kwargs).fit(tf, df["cause_id"])

    # now predict on the unobserved data
    print_log_message("reading in unobserved_df")

    unobserved_df = pd.read_csv(
        f"{data_dir}/int_cause_df.csv")[DEM_COLS + ["cause_id",
                                                    f"{x_col}",
                                                    f"{int_cause}"]]
    # reuse the vectorizer fitted on the observed data
    new_counts = cv.transform(unobserved_df[f"{x_col}"])
    if short_name == "nn":
        print_log_message("converting unobserved data to dense matrix")
        new_counts = new_counts.todense()
    unobserved_df["predictions"] = model.predict(new_counts)

    ## add for quick run
    makedirs_safely(predicted_test_dir)

    print_log_message("writing to df")
    # note: written with the DataFrame index (no index=False here)
    unobserved_df.to_csv(f"{predicted_test_dir}/model_predictions.csv")
    joblib.dump(
        model, f"{predicted_test_dir}/model_fit.pkl")
    print_log_message("wrote model fit")