Example #1
def create_neural_network(output_nodes, hidden_layers, hidden_nodes_1,
                          hidden_nodes_2):
    if hidden_nodes_2:
        hidden_nodes = [hidden_nodes_1, hidden_nodes_2]
    else:
        hidden_nodes = [hidden_nodes_1]
    node_dict = dict(zip(range(0, hidden_layers), hidden_nodes))
    model = Sequential()
    for layer in range(0, hidden_layers):
        print_log_message(f"adding layer {layer}")
        model.add(Dense(node_dict[layer], activation="relu"))
    model.add(Dense(output_nodes, activation="softmax"))
    print_log_message("compiling model")
    model.compile(optimizer="adam", loss="categorical_crossentropy")
    return model
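
This build function is later wrapped in a scikit-learn-compatible Keras wrapper (see Examples #13 and #16). A minimal usage sketch, assuming the legacy keras.wrappers.scikit_learn API and placeholder arrays X and y (both hypothetical, for illustration only):

import numpy as np
from keras.wrappers.scikit_learn import KerasClassifier  # assumption: legacy Keras wrapper

# placeholder data for illustration only
X = np.random.rand(200, 50)             # dense feature matrix
y = np.random.randint(0, 4, size=200)   # integer cause labels

clf = KerasClassifier(build_fn=create_neural_network,
                      output_nodes=len(np.unique(y)),
                      hidden_layers=2,
                      hidden_nodes_1=64,
                      hidden_nodes_2=32,
                      epochs=5,
                      batch_size=32)
clf.fit(X, y)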
Example #2
    def get_computed_dataframe(self):
        """Return computations."""
        split_type = self.needs_splitting()
        if not split_type:
            print_log_message("No location splitting required.")
            self.diag_df = None
            return self.df

        assert 'sample_size' in self.df.columns

        start_deaths = (self.df.sample_size * self.df.cf).sum()

        if split_type == "UKR":
            env_wide = self.prep_envelope(split_type)

            df = self.adjust_ukr(env_wide, split_type)

            df = pd.concat([df, self.df], ignore_index=True)

            end_deaths = (df.sample_size * df.cf).sum()
            assert np.isclose((end_deaths / start_deaths), 2, atol=0.05)
        else:
            df_list = []
            for split_type in ["urban", "rural"]:
                env_wide = self.prep_envelope(split_type)

                orig_id = self.split_ids[split_type][0]
                new_id = self.split_ids[split_type][1]

                df_ap = self.df.loc[self.df['location_id'] == orig_id]

                df = self.adjust_ap_telangana(orig_id, new_id, env_wide, df_ap)

                df_list.append(df)

            ap_ids = [self.split_ids["urban"][0]
                      ] + [self.split_ids["rural"][0]]
            df_no_ap = self.df.loc[~(self.df['location_id'].isin(ap_ids))]
            df_list.append(df_no_ap)

            df = pd.concat(df_list, ignore_index=True)

            end_deaths = (df.sample_size * df.cf).sum()
            assert np.isclose(start_deaths, end_deaths, rtol=0.001)

        df = df[self.orig_cols]

        return df
Example #3
def create_train_test(df, test, int_cause, icd_feature, age_group_id,
                      most_detailed):
    """Create train/test datasets.

    If running tests, randomly sample from all locations so models don't
    take forever to run.
    """
    locs = get_location_metadata(gbd_round_id=6, location_set_id=35)

    # identify column corresponding to ICD attributes of interest
    icd_col = f"{icd_feature}_cause_info"

    keep_cols = DEM_COLS + [icd_col, int_cause
                            ] + [x for x in list(df) if "multiple_cause" in x]

    df = df.loc[(df.age_group_id != 283) & (df.age_group_id != 160)]
    df = df[keep_cols]
    df = create_age_bins(df, AGG_AGES)
    df = drop_age_restricted_cols(df)
    if not most_detailed:
        print_log_message("aggregating to country level")
        df = get_country_names(df)
    if age_group_id:
        print_log_message(f"subsetting to just age group id {age_group_id}")
        df = df.loc[df["age_group_id"] == age_group_id]
        print_log_message(f"resulting df is {len(df)} rows")
    df["cause_age_info"] = df[[icd_col, "age_group_id"
                               ]].astype(str).apply(lambda x: " ".join(x),
                                                    axis=1)
    df["dem_info"] = df[[
        icd_col, "location_id", "sex_id", "year_id", "age_group_id"
    ]].astype(str).apply(lambda x: " ".join(x), axis=1)

    garbage_df = df.query(f"cause_id==743 & {int_cause}==1")
    df = df.query(f"cause_id!=743 & {int_cause}!=1")

    if test:
        print_log_message(
            "THIS IS A TEST... only using 7000 rows from each loc")
        df = df.merge(locs[["location_id", "parent_id", "level"]],
                      on="location_id",
                      how="left")
        # map subnationals to parent so
        # random sampling will be at country level
        df["location_id"] = np.where(df["level"] > 3, df["parent_id"],
                                     df["location_id"])
        df.drop(columns=["parent_id", "level"], inplace=True)
        # get a random sample from each location
        # bc full dataset takes forever to run
        dfs = []
        for loc in list(df.location_id.unique()):
            subdf = df.query(f"location_id=={loc}")
            random_df = subdf.sample(n=7000, replace=False)
            dfs.append(random_df)
        df = pd.concat(dfs, ignore_index=True, sort=True)

    # split train 75%, test 25%
    train_df, test_df = train_test_split(df, test_size=0.25)

    return train_df, test_df, garbage_df
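
The per-location sampling loop above can also be written with pandas' grouped sampling; a shorter equivalent sketch, assuming pandas >= 1.1 and that every location has at least 7000 rows (sample raises an error otherwise, just like the loop):

# one random sample of 7000 rows per location_id, same constraint as the loop
df = (df.groupby("location_id")
        .sample(n=7000, replace=False)
        .reset_index(drop=True))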
Example #4
    def create_training_data(self, df, age_group_id=None):
        if age_group_id:
            write_dir = f"{self.model_dir}/{age_group_id}"
        else:
            write_dir = f"{self.model_dir}"
        makedirs_safely(write_dir)
        train_df, test_df, int_cause_df = create_train_test(
            df,
            test=self.test,
            int_cause=self.int_cause,
            icd_feature=self.icd_features,
            age_group_id=age_group_id,
            most_detailed=self.most_detailed_locs)
        print_log_message(f"writing train/test to df for {age_group_id}")
        train_df.to_csv(f"{write_dir}/train_df.csv", index=False)
        test_df.to_csv(f"{write_dir}/test_df.csv", index=False)
        int_cause_df.to_csv(f"{write_dir}/int_cause_df.csv", index=False)
Example #5
def create_test_datatsets(test_df, dirichlet_dict, write_dir, dataset_num,
                          df_size, age_feature, dem_feature):
    """Generate a test dataset of same length as the original test dataset
    Arguments:
        test_df: the actual test dataframe
        dirichlet_dict: dictionary mapping each cause id in actual test data
                        to its respective proportion in the test data
                        (generated from a Dirichlet distribution)
        write_dir: a directory to write each dataset to
        dataset_num: which dataset (of the 500) to create
        df_size: from the ModelLauncher, the desired size of the generated test
                 data (should be same size as the actual test data)
        age_feature: (Bool) - Do you want to include age as a feature?
        dem_feature: (Bool) - Do you want to include all demographic cols
                            (age, sex, year, and location) as features?
    """
    # create df of desired length
    df = pd.DataFrame({"cause_id": [np.NaN] * df_size})
    dfs = []
    # loop through each cause and generate rows with
    # multiple cause and demographic information
    for cause in dirichlet_dict.keys():
        # proportion from dirichlet dictates how many
        # rows are assigned to a given cause
        subdf = df.sample(frac=dirichlet_dict[cause],
                          replace=True).assign(cause_id=cause)
        print_log_message(f"generating multiple cause rows for {cause}")
        mcause_df = generate_multiple_cause_rows(subdf, test_df, cause,
                                                 age_feature, dem_feature)
        dfs.append(mcause_df)

    # if rerunning, remove previous dataset information
    remove_if_output_exists(write_dir, f"dataset_{dataset_num}.csv")
    remove_if_output_exists(
        write_dir, f"dataset_{dataset_num}_dirichlet_distribution.pkl")

    dfs = pd.concat(dfs, sort=True, ignore_index=True)
    print_log_message(f"writing dataset {dataset_num} to a df")
    # write generated test dataset to csv
    dfs.to_csv(f"{write_dir}/dataset_{dataset_num}.csv", index=False)
    # save randomly generated dirichlet distribution
    # in case need to exactly replicate
    joblib.dump(
        dirichlet_dict,
        f"{write_dir}/dataset_{dataset_num}_dirichlet_distribution.pkl")
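
The function above assumes dirichlet_dict already maps every cause_id in the actual test data to a proportion drawn from a Dirichlet distribution. A sketch of how such a dictionary might be generated, assuming a flat Dirichlet prior over the observed causes (the real pipeline may parameterize the prior differently):

import numpy as np

# hypothetical construction of dirichlet_dict from the observed test data
causes = sorted(test_df["cause_id"].unique())
proportions = np.random.dirichlet(alpha=np.ones(len(causes)))
dirichlet_dict = dict(zip(causes, proportions))  # values sum to 1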
Example #6
    def __init__(self, code_system_id, cause_set_version_id):
        self.code_system_id = code_system_id

        misc.print_log_message("Getting metadata")
        CONF = Configurator()
        self.art_path = CONF.get_resource("age_restriction_targets")

        self.age_df = ages.get_cod_ages(**self.standard_cache_options)

        self.cause_meta_df = causes.get_current_cause_hierarchy(
            cause_set_version_id=cause_set_version_id,
            **self.standard_cache_options)

        if code_system_id in self.allowed_code_system_ids:
            misc.print_log_message(
                "Creating age restriction mapping dataframe")
            invalid_ages_df = self.get_invalid_ages_df()
            misc.print_log_message("Creating for {}".format({
                1: "ICD10",
                6: "ICD9_detail"
            }[code_system_id]))
            art_df = self.read_age_restriction_targets_df(code_system_id)
            self.art_mapping_df = self.get_age_restriction_target_mapping_df(
                art_df, invalid_ages_df)
        else:
            self.art_mapping_df = None
Example #7
def drop_age_restricted_cols(df):
    start = len(df)
    age_meta_df = get_ages(force_rerun=False, block_rerun=True)
    # secret causes in restrictions
    cause_meta_df = get_current_cause_hierarchy(cause_set_id=4,
                                                **{
                                                    'block_rerun': True,
                                                    'force_rerun': False
                                                })
    restrict_df = pd.read_csv(
        "/homes/agesak/thesis/maps/injuries_overrides.csv")
    restrict_df = add_cause_metadata(restrict_df,
                                     add_cols='cause_id',
                                     merge_col='acause',
                                     cause_meta_df=cause_meta_df)
    restrict_df["age_start_group"] = restrict_df["age_start_group"].fillna(0)

    orig_cols = df.columns
    df = add_age_metadata(
        df,
        add_cols=['age_group_years_start', 'age_group_years_end'],
        age_meta_df=age_meta_df)

    df = df.merge(restrict_df, on='cause_id', how='left')

    # age_group_years_end is exclusive: the 0-14 age group has age_group_years_end 15
    too_young = df["age_group_years_end"] <= df["age_start_group"]
    too_old = df["age_group_years_start"] > df["age_end_group"]

    df = df[~(too_young | too_old)]
    df = df[orig_cols]
    end = len(df)
    print_log_message(
        f"dropping {start - end} rows that violate age restrictions")

    return df
Example #8
def read_in_data(int_cause, inj_garbage=False, code_system_id=None):
    """Read in and append all MCoD data"""
    print_log_message("reading in not limited use data")
    if inj_garbage:
        print_log_message(
            "reading in formatted df with only nonX59/Y34 garbage codes as UCOD"
        )
        subdirs = f"{int_cause}/thesis/inj_garbage"
    else:
        subdirs = f"{int_cause}/thesis"
    # it's not good that the sources are hard-coded
    if code_system_id != 6:
        # COL and ITA don't have ICD 9
        udf = get_mcause_data(phase="format_map",
                              source=["COL_DANE", "ITA_ISTAT"],
                              sub_dirs=subdirs,
                              data_type_id=9,
                              code_system_id=code_system_id,
                              assert_all_available=True,
                              verbose=True,
                              **BLOCK_RERUN)
    else:
        udf = pd.DataFrame()

    print_log_message("reading in limited use data")
    datasets = get_datasets(
        **{
            "force_rerun": True,
            "block_rerun": False,
            "source": MCauseLauncher.limited_sources,
            "code_system_id": code_system_id
        })
    limited_metadata = datasets.apply(
        lambda x: str(x['nid']) + "_" + str(x['extract_type_id']),
        axis=1).values

    dfs = []
    for source in MCauseLauncher.limited_sources:
        limited_dir = get_limited_use_directory(source, int_cause, inj_garbage)
        csvfiles = glob.glob(os.path.join(limited_dir, "*.csv"))
        for file in csvfiles:
            if any(meta in file for meta in limited_metadata):
                df = pd.read_csv(file)
                dfs.append(df)
    ldf = pd.concat(dfs, ignore_index=True, sort=True)
    df = pd.concat([udf, ldf], sort=True, ignore_index=True)

    return df
Example #9
def main(model_param, model_name, write_dir, train_dir, int_cause, short_name,
         age_feature, dem_feature):
    """Run the gridsearch pipeline for a given classifier.
    * Note: this script is parallelized by parameter set
      (to allow for feasible run times), so each gridsearch object
      is fed only one set of model parameters, but this is done
      over a range of parameter sets.
    Arguments:
        model_param: (str) - a single set of model parameters for
                     a given classifier
        model_name: the classifier name as defined by SciKit Learn
        write_dir: a directory to write the model object and summary to
        train_dir: a directory where the training dataset lives
        int_cause: the injuries garbage code of interest
        short_name: the abbreviated name for each classifier
                    defined in the ModelLauncher
        age_feature: (Bool) - Do you want to include age as a feature?
        dem_feature: (Bool) - Do you want to include all demographic cols
                            (age, sex, year, and location) as features?
    """
    # determine the model's feature vector
    if age_feature:
        x_col = "cause_age_info"
    elif dem_feature:
        x_col = "dem_info"
    else:
        x_col = "cause_info"

    print_log_message("reading in data")
    model_df = pd.read_csv(f"{train_dir}/train_df.csv")[[
        "cause_id", f"{x_col}", f"{int_cause}"
    ]]
    print_log_message("formatting parameters")
    model_params = format_gridsearch_params(short_name, model_param)

    print_log_message("running pipeline")
    results, grid_results = run_pipeline(model_name, short_name, model_df,
                                         model_params, write_dir, int_cause,
                                         age_feature, dem_feature)
    results.to_csv(f"{write_dir}/summary_stats.csv", index=False)
    joblib.dump(grid_results, f"{write_dir}/grid_results.pkl")
Example #10
    def get_computed_dataframe(self):
        """Main method to execute computations and return result.

        Notes:
        UNDECIDED HOW TO DO THIS WITHOUT ALL YEARS IN MEMORY LIKE STATA HAD

        Potential solutions:
        1. Don't do this at all, just correct ANY cause-age-sex-location-year
            that exceeds the global reference rate
              - this would potentially change results slightly, but does not
                seem unreasonable, and in fact seems more correct

        2. Prime HIV correction by assembling the list ahead of time
              - might take a long time and need to be rerun every time, which
                would essentially double the required time for this step
              - advantage is that it mimics last year's results without needing
                any additional years of data
              - could eliminate some of the problems with this method by
                running it very infrequently instead of every time
                the data changes

        3. Take a 'source' argument in the class and pull the other data that
            we pulled last year to pool years necessary to generate this list

        4. Run HIV correction with all the data for a 'source' altogether, like
            the Stata code did, but still update versions based on nid-year

        FOR NOW: Follow method 1 and expect to test the similarity later
        """
        keep_cols = self.df.columns

        if not self.country_needs_correction():
            print_log_message("Country doesn't need hiv correction")
            self.diag_df = None
            return self.df

        print_log_message("Getting rates df")
        rates_df = self.get_rates_df(self.cause_meta_df)
        if self.correct_garbage:
            df = add_code_metadata(self.df,
                                   add_cols=['value'],
                                   code_system_id=self.code_system_id,
                                   force_rerun=False,
                                   block_rerun=True,
                                   cache_dir=self.cache_dir)
            df = self.identify_sepsis_gc(df, self.code_system_id)
            df = self.identify_injury_gc(df, self.code_system_id)
            df = self.identify_hivrd_gc(df, self.code_system_id)
            # do a groupby to collapse down to cause_id level for next steps
            group_cols = [
                x for x in keep_cols if x not in ['code_id', 'deaths']
            ]
            df_by_code = df.copy()
            df_by_cause = df.groupby(group_cols,
                                     as_index=False)['deaths'].sum()
        else:
            df_by_cause = self.df
        df = add_population(df_by_cause, pop_df=self.pop_df)
        print_log_message("Flagging correct dem groups for "
                          "{0} rows of data".format(len(df)))
        df = flag_correct_dem_groups(df,
                                     self.code_system_id,
                                     self.cause_meta_df,
                                     self.loc_meta_df,
                                     self.age_meta_df,
                                     rates_df,
                                     self.reference_ages,
                                     self.move_gc_age_restrictions,
                                     self.value_cols,
                                     self.pop_col,
                                     self.cause_selections_path,
                                     correct_garbage=self.correct_garbage)
        cause_to_targets_map = self.get_cause_to_targets_map(
            self.cause_meta_df)
        print_log_message("Identifying positive excess")
        df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                      self.reference_ages, self.loc_meta_df,
                                      self.cause_meta_df, self.value_cols,
                                      self.pop_col, self.correct_garbage)
        if self.correct_garbage:
            df = self.calculate_garbage_positive_excess(
                df, df_by_code, group_cols)
            print_log_message("Moving excess to target")
            df = move_excess_to_target(df, self.value_cols,
                                       cause_to_targets_map,
                                       self.correct_garbage)
            computed_df = assign_code_to_created_target_deaths(
                df, self.code_system_id, self.cause_meta_df)
        else:
            print_log_message("Moving excess to target")
            computed_df = move_excess_to_target(df, self.value_cols,
                                                cause_to_targets_map,
                                                self.correct_garbage)
        self.diag_df = computed_df
        return computed_df[keep_cols]
Example #11
    def launch(self):
        if self.phase == "train_test":
            df = read_in_data(int_cause=self.int_cause,
                              code_system_id=self.code_system_id)
            if self.by_age:
                for age_group_id in ModelLauncher.agg_ages:
                    print_log_message(f"working on age: {age_group_id}")
                    self.create_training_data(df, age_group_id)
            else:
                self.create_training_data(df)

        if self.phase == "launch_training_model":
            for short_name in self.model_types:
                model_name = ModelLauncher.model_dict[short_name]
                if model_name in [
                        "MultinomialNB", "BernoulliNB", "ComplementNB"
                ]:
                    params = naive_bayes_params(short_name)
                else:
                    get_params = getattr(
                        import_module("thesis_utils.modeling"),
                        f"{short_name}_params")
                    params = get_params(short_name)
                print_log_message(f"launching {model_name}")
                print_log_message(f"{len(params)} sets of model parameters")
                for parameter in params:
                    param = format_argparse_params(
                        parameter, ModelLauncher.param_dict[short_name])
                    if self.by_age:
                        for age_group_id in ModelLauncher.agg_ages:
                            print_log_message(
                                f"launching models for age: {age_group_id}")
                            self.launch_training_models(
                                model_name, short_name, param, age_group_id)
                    else:
                        self.launch_training_models(model_name, short_name,
                                                    param)

        if self.phase == "create_test_datasets":
            if self.by_age:
                for age_group_id in ModelLauncher.agg_ages:
                    print_log_message(f"working on age: {age_group_id}")
                    self.launch_create_testing_datasets(age_group_id)
            else:
                self.launch_create_testing_datasets()

        if self.phase == "launch_testing_models":
            for short_name in self.model_types:
                model_name = ModelLauncher.model_dict[short_name]
                # get parameters of best model fit for given model
                if self.by_age:
                    for age_group_id in ModelLauncher.agg_ages:
                        best_model_params = self.get_best_model(
                            short_name, age_group_id)
                        self.launch_testing_models(model_name, short_name,
                                                   best_model_params,
                                                   age_group_id)
                else:
                    best_model_params = self.get_best_model(short_name,
                                                            age_group_id=None)
                    self.launch_testing_models(model_name,
                                               short_name,
                                               best_model_params,
                                               age_group_id=None)

        if self.phase == "launch_int_cause_predictions":
            for short_name in self.model_types:
                if self.by_age:
                    for age_group_id in ModelLauncher.agg_ages:
                        self.launch_int_cause_predictions(
                            short_name=short_name, age_group_id=age_group_id)
                else:
                    self.launch_int_cause_predictions(short_name=short_name,
                                                      age_group_id=None)
Example #12

# will only need to run this once ever tbh
# for int_cause in ["x59", "y34"]:
#     rd = format_gbd_results(int_cause)
#     rd = pretty_print(rd)
#     rd.to_csv(f"/home/j/temp/agesak/thesis/model_results/{int_cause}_gbd_2019.csv", index=False)

model_dict = {"x59": "", "y34": ""}

# inconsistency here with short name for naive bayes
# here short name is "nb" until the best naive bayes
# is identified (then short name will be either
# multi_nb, bernoulli_nb, or complement_nb like normal)
for int_cause in ["x59", "y34"]:
    print_log_message(f"working on {int_cause}")
    for short_name in ["rf", "nb", "xgb", "nn"]:
        if short_name == "nb":
            update_model_dict(int_cause)
            # get the short name associated with the best naive bayes model
            short_name = model_dict[int_cause]
        print_log_message(f"working on {short_name}")
        df = format_classifier_results(int_cause, short_name)
        rd = format_gbd_results(int_cause)
        rd.rename(columns={
            "prop": "prop_GBD2019",
            f"{int_cause}": f"{int_cause}_deaths_GBD2019"
        },
                  inplace=True)
        # merge on 2019 results
        # df = df.merge(rd, on=["age_group_id", "sex_id", "location_id", "year_id", "cause_id"], how="left")
Example #13
def main(data_dir, predicted_test_dir, int_cause, short_name,
         model_name, age_feature, dem_feature):
    """Summarize evaluation metrics across 500 test datasets
       Refit the classifier on all observed data
       Predict on the unobserved data
    """

    # determine the model's feature vector
    if age_feature:
        x_col = "cause_age_info"
    elif dem_feature:
        x_col = "dem_info"
    else:
        x_col = "cause_info"

    ## comment out for quick run
    ## summaries = read_in_summary_stats(predicted_test_dir)

    ## comment out for quick run
    ## summarize evaluation metrics across the datasets
    ## aggregate_evaluation_metrics(summaries, predicted_test_dir)

    # read in test df
    test_df = pd.read_csv(
        f"{data_dir}/test_df.csv")[DEM_COLS + ["cause_id",
                                               f"{x_col}",
                                               f"{int_cause}"]]
    # read in train df
    train_df = pd.read_csv(
        f"{data_dir}/train_df.csv")[DEM_COLS + ["cause_id",
                                                f"{x_col}",
                                                f"{int_cause}"]]
    print_log_message("read in train and test")
    # concat train/test to refit a model on all the observed data
    df = pd.concat([train_df, test_df], sort=True, ignore_index=True)

    print_log_message("reading in params df")
    param_df = pd.read_csv("/homes/agesak/thesis/maps/parameters.csv")
    param_df = param_df[[x for x in list(param_df) if short_name in x]]
    param_df[f"{short_name}"] = param_df[f"{short_name}"].str.replace(
        "clf__estimator__", "")
    ## comment out for quick run
    ## params = summaries.best_model_params.iloc[0]
    ## add for quick run
    params = get_best_fit(data_dir, short_name)

    # format best params to feed to classifier
    if isinstance(params, six.string_types):
        best_params = params.split("_")
    else:
        best_params = [params]

    param_kwargs = dict(zip(param_df.iloc[:, 0], best_params))
    if short_name == "nn":
        # these feed into create_neural_network
        hidden_nodes_1 = int(param_kwargs["hidden_nodes_1"])
        hidden_layers = int(param_kwargs["hidden_layers"])
        hidden_nodes_2 = int(param_kwargs["hidden_nodes_2"])
        # parameters with clf__ are only fed to keras classifier
        param_kwargs = {k: v for k, v in param_kwargs.items() if "clf__" in k}

    # ensure column dtypes are correct
    measure_dict = {"int": int, "float": float, "str": str}
    for key, value in param_kwargs.items():
        dtype = param_df.loc[param_df[
            f"{short_name}"] == key, f"{short_name}_dtype"].iloc[0]
        param_kwargs[key] = measure_dict[dtype](param_kwargs[key])

    # run Neural network separately because classifier
    # takes secondary arguments related to build
    if short_name == "nn":
        param_kwargs = {k.replace("clf__", ""): v for k,
                        v in param_kwargs.items() if "clf__" in k}
        cv = CountVectorizer(lowercase=False, token_pattern=r"(?u)\b\w+\b")
        tf = cv.fit_transform(df[f"{x_col}"])
        print_log_message("converting to dense matrix")
        tf = tf.todense()
        # just hard-code the classifier name because this only works for Keras
        model = KerasClassifier(build_fn=create_neural_network,
                                output_nodes=len(
                                    df.cause_id.unique()),
                                hidden_layers=hidden_layers,
                                hidden_nodes_1=hidden_nodes_1,
                                hidden_nodes_2=hidden_nodes_2, **param_kwargs)
        print_log_message("fitting KerasClassifier")
        model.fit(tf, df["cause_id"].values, **param_kwargs)
    else:
        # refit all other classifiers
        cv = CountVectorizer(lowercase=False)
        tf = cv.fit_transform(df[f"{x_col}"])
        print_log_message(f"fitting {model_name}")
        model = eval(model_name)(**param_kwargs).fit(tf, df["cause_id"])

    # now predict on the unobserved data
    print_log_message("reading in unobserved_df")

    unobserved_df = pd.read_csv(
        f"{data_dir}/int_cause_df.csv")[DEM_COLS + ["cause_id",
                                                    f"{x_col}",
                                                    f"{int_cause}"]]
    new_counts = cv.transform(unobserved_df[f"{x_col}"])
    if short_name == "nn":
        print_log_message("converting unobserved data to dense matrix")
        new_counts = new_counts.todense()
    unobserved_df["predictions"] = model.predict(new_counts)

    ## add for quick run
    makedirs_safely(predicted_test_dir)

    print_log_message("writing to df")
    unobserved_df.to_csv(f"{predicted_test_dir}/model_predictions.csv")
    joblib.dump(
        model, f"{predicted_test_dir}/model_fit.pkl")
    print_log_message("wrote model fit")
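
A small design note on the refit step above: the CountVectorizer and the classifier are kept separate, so prediction requires calling cv.transform by hand. Bundling both in a Pipeline (as run_pipeline in Example #16 already does for the grid search) would let the refit object consume raw text directly. A sketch for the non-neural-network branch, reusing the names from this function:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# hypothetical: wrap vectorizer + classifier so predict() takes raw text
refit_pipeline = Pipeline([
    ("bow", CountVectorizer(lowercase=False)),
    ("clf", eval(model_name)(**param_kwargs)),
])
refit_pipeline.fit(df[x_col], df["cause_id"])
unobserved_df["predictions"] = refit_pipeline.predict(unobserved_df[x_col])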
Example #14
def main(best_model_dir, dataset_dir, testing_model_dir, best_model_params,
         int_cause, dataset_num, age_feature, dem_feature):
    """Predict on each test dataset
    Arguments:
        best_model_dir: directory that houses model object of best model
        dataset_dir: directory that houses the generated test datasets
                     (used for all classifiers)
        testing_model_dir: classifier-specific directory to write predictions
        best_model_params: (str) - the best model parameters
                           ex. formatted as param_param_param
        int_cause: the injuries garbage code of interest
        dataset_num: which dataset (of the 500) to work on
        age_feature: (Bool) - Do you want to include age as a feature?
        dem_feature: (Bool) - Do you want to include all demographic cols
                              (age, sex, year, and location) as features?
    """
    # determine the model's feature vector
    if age_feature:
        x_col = "cause_age_info"
    elif dem_feature:
        x_col = "dem_info"
    else:
        x_col = "cause_info"

    # read in model object of best models
    print_log_message("reading in grid results object")
    grid_results = joblib.load(f"{best_model_dir}/grid_results.pkl")

    # read in test dataset
    print_log_message("reading in data")
    dataset = pd.read_csv(f"{dataset_dir}/dataset_{dataset_num}.csv")

    # predict on test dataset
    print_log_message("predicting")
    dataset["predicted"] = grid_results.predict(dataset[f"{x_col}"])

    # determine values of evaluation metrics
    macro_precision = precision_score(y_true=dataset.cause_id,
                                      y_pred=dataset.predicted,
                                      average="macro")
    micro_precision = precision_score(y_true=dataset.cause_id,
                                      y_pred=dataset.predicted,
                                      average="micro")
    macro_recall = recall_score(y_true=dataset.cause_id,
                                y_pred=dataset.predicted, average="macro")
    micro_recall = recall_score(y_true=dataset.cause_id,
                                y_pred=dataset.predicted, average="micro")
    accuracy = accuracy_score(y_true=dataset.cause_id,
                              y_pred=dataset.predicted)
    cccsmfa = calculate_cccsmfa(y_true=dataset.cause_id,
                                y_pred=dataset.predicted)
    concordance = calculate_concordance(y_true=dataset.cause_id,
                                        y_pred=dataset.predicted,
                                        int_cause=int_cause)

    # save information about each prediction
    df = pd.DataFrame({"Concordance": [concordance],
                       "CCCSMFA": [cccsmfa],
                       "Macro Recall": [macro_recall],
                       "Micro Recall": [micro_recall],
                       "Macro Precision": [macro_precision],
                       "Micro Precision": [micro_precision],
                       "Accuracy": [accuracy],
                       "best_model_params": [best_model_params]})
    print_log_message("writing dfs")
    df.to_csv(
        f"{testing_model_dir}/dataset_{dataset_num}_summary_stats.csv",
        index=False)
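
calculate_cccsmfa is not shown in these examples; for reference, a sketch of the chance-corrected CSMF accuracy as it is commonly defined in the verbal autopsy literature (an illustration only, not necessarily this project's exact implementation):

import numpy as np
import pandas as pd

def cccsmfa_sketch(y_true, y_pred):
    """Chance-corrected CSMF accuracy; assumes every predicted cause
    also appears in y_true (illustrative sketch)."""
    true_csmf = pd.Series(y_true).value_counts(normalize=True)
    pred_csmf = (pd.Series(y_pred).value_counts(normalize=True)
                 .reindex(true_csmf.index, fill_value=0))
    # CSMF accuracy: 1 - sum(|pred - true|) / (2 * (1 - min(true)))
    csmf_acc = 1 - (pred_csmf - true_csmf).abs().sum() / (2 * (1 - true_csmf.min()))
    # chance correction: random cause assignment scores about 1 - 1/e
    return (csmf_acc - (1 - np.exp(-1))) / (1 - (1 - np.exp(-1)))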
Example #15
    def get_computed_dataframe(self):

        keep_cols = self.df.columns

        if not self.country_needs_correction():
            print_log_message("Country doesn't need hiv correction")
            self.diag_df = None
            return self.df

        print_log_message("Getting rates df")
        rates_df = self.get_rates_df(self.cause_meta_df)
        if self.correct_garbage:
            df = add_code_metadata(self.df,
                                   add_cols=['value'],
                                   code_system_id=self.code_system_id,
                                   force_rerun=False,
                                   block_rerun=True,
                                   cache_dir=self.cache_dir)
            df = self.identify_sepsis_gc(df, self.code_system_id)
            df = self.identify_injury_gc(df, self.code_system_id)
            df = self.identify_hivrd_gc(df, self.code_system_id)
            group_cols = [
                x for x in keep_cols if x not in ['code_id', 'deaths']
            ]
            df_by_code = df.copy()
            df_by_cause = df.groupby(group_cols,
                                     as_index=False)['deaths'].sum()
        else:
            df_by_cause = self.df
        df = add_population(df_by_cause, pop_df=self.pop_df)
        print_log_message("Flagging correct dem groups for "
                          "{0} rows of data".format(len(df)))
        df = flag_correct_dem_groups(df,
                                     self.code_system_id,
                                     self.cause_meta_df,
                                     self.loc_meta_df,
                                     self.age_meta_df,
                                     rates_df,
                                     self.reference_ages,
                                     self.move_gc_age_restrictions,
                                     self.value_cols,
                                     self.pop_col,
                                     self.cause_selections_path,
                                     correct_garbage=self.correct_garbage)
        cause_to_targets_map = self.get_cause_to_targets_map(
            self.cause_meta_df)
        print_log_message("Identifying positive excess")
        df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                      self.reference_ages, self.loc_meta_df,
                                      self.cause_meta_df, self.value_cols,
                                      self.pop_col, self.correct_garbage)
        if self.correct_garbage:
            df = self.calculate_garbage_positive_excess(
                df, df_by_code, group_cols)
            print_log_message("Moving excess to target")
            df = move_excess_to_target(df, self.value_cols,
                                       cause_to_targets_map,
                                       self.correct_garbage)
            computed_df = assign_code_to_created_target_deaths(
                df, self.code_system_id, self.cause_meta_df)
        else:
            print_log_message("Moving excess to target")
            computed_df = move_excess_to_target(df, self.value_cols,
                                                cause_to_targets_map,
                                                self.correct_garbage)
        self.diag_df = computed_df
        return computed_df[keep_cols]
Example #16
def run_pipeline(model, short_name, model_df, model_params, write_dir,
                 int_cause, age_feature, dem_feature):

    n_jobs_dict = {
        "nn": 2,
        "rf": -1,
        "xgb": -1,
        "bernoulli_nb": -1,
        "multi_nb": -1,
        "complement_nb": -1
    }

    if short_name == "svm_bag":
        model = {
            'model': BaggingClassifier,
            'kwargs': {
                'base_estimator': eval(model)()
            },
            'parameters': model_params
        }

        # create pipeline with bagging classifier
        pipeline = Pipeline([
            # token_pattern allows single-character tokens
            ("bow",
             CountVectorizer(lowercase=False, token_pattern=r"(?u)\b\w+\b")),
            ('name', model['model'](**model['kwargs']))
        ])

        cv_params = model['parameters']
    elif short_name == "nn":

        hidden_layers = int(model_params["hidden_layers"])
        print_log_message(
            "deleting hidden layers from keras gridsearch params")
        del model_params["hidden_layers"]

        hidden_nodes_1 = int(model_params["hidden_nodes_1"])
        print_log_message(
            "deleting hidden nodes 1 from keras gridsearch params")
        del model_params["hidden_nodes_1"]

        if hidden_layers > 1:
            hidden_nodes_2 = int(model_params["hidden_nodes_2"])
            print_log_message(
                "deleting hidden nodes 2 from keras gridsearch params")
        else:
            hidden_nodes_2 = None
        del model_params["hidden_nodes_2"]

        pipeline = Pipeline([
            ("bow", CountVectorizer(lowercase=False)),
            ("dense",
             FunctionTransformer(lambda x: x.todense(), accept_sparse=True)),
            ("clf",
             KerasClassifier(build_fn=create_neural_network,
                             output_nodes=len(model_df.cause_id.unique()),
                             hidden_layers=hidden_layers,
                             hidden_nodes_1=hidden_nodes_1,
                             hidden_nodes_2=hidden_nodes_2))
        ])

        cv_params = model_params.copy()
        print_log_message(f"cv_params are {cv_params}")

    else:
        pipeline = Pipeline([("bow", CountVectorizer(lowercase=False)),
                             ("clf", ClfSwitcher())])

        model_params.update({"clf__estimator": [eval(model)()]})
        cv_params = model_params.copy()

    scorer_list = create_custom_scorers(int_cause)

    scoring = {
        "macro_precision": scorer_list[0],
        "micro_precision": scorer_list[1],
        "macro_recall": scorer_list[2],
        "micro_recall": scorer_list[3],
        "accuracy": scorer_list[4],
        "cccsmfa": scorer_list[5],
        "concordance": scorer_list[6]
    }

    print_log_message("creating gridsearch object")
    gscv = GridSearchCV(pipeline,
                        cv_params,
                        cv=5,
                        scoring=scoring,
                        n_jobs=n_jobs_dict[short_name],
                        pre_dispatch=6,
                        refit="concordance",
                        verbose=6)

    print_log_message("fitting model")
    if age_feature:
        grid_results = gscv.fit(model_df["cause_age_info"],
                                model_df["cause_id"])
    elif dem_feature:
        grid_results = gscv.fit(model_df["dem_info"], model_df["cause_id"])
    else:
        grid_results = gscv.fit(model_df["cause_info"], model_df["cause_id"])

    print_log_message("saving model results")
    results = pd.DataFrame.from_dict(grid_results.cv_results_)
    return results, grid_results
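
ClfSwitcher is a project helper not shown in these examples; the grid entry "clf__estimator": [eval(model)()] implies it follows the common estimator-switcher pattern: a thin BaseEstimator wrapper whose estimator parameter GridSearchCV can swap out. A hypothetical minimal version:

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier

class ClfSwitcher(BaseEstimator):
    """Hypothetical sketch: delegate to whichever estimator GridSearchCV
    plugs in via the clf__estimator parameter."""

    def __init__(self, estimator=SGDClassifier()):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)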
Example #17
    def get_computed_dataframe(self):
        """Return computations."""
        split_type = self.needs_splitting()
        if not split_type:
            print_log_message("No location splitting required.")
            self.diag_df = None
            return self.df

        # quick check that sample size has been created
        assert 'sample_size' in self.df.columns

        # grab total deaths of incoming data
        start_deaths = (self.df.sample_size * self.df.cf).sum()

        if split_type == "UKR":
            # prep envelope df
            env_wide = self.prep_envelope(split_type)

            # split data
            df = self.adjust_ukr(env_wide, split_type)

            # append Ukraine w/o Crimea/Sevastopol to incoming data
            # national data not modeled, but needed for CodViz
            df = pd.concat([df, self.df], ignore_index=True)

            # expected result is national data + national w/o Crimea/Sevastopol
            # plus Crimea, plus Sevastopol
            end_deaths = (df.sample_size * df.cf).sum()
            assert np.isclose((end_deaths / start_deaths), 2, atol=0.05)
        else:
            df_list = []
            for split_type in ["urban", "rural"]:
                # prep envelope df
                env_wide = self.prep_envelope(split_type)

                # get location_ids to split
                orig_id = self.split_ids[split_type][0]
                new_id = self.split_ids[split_type][1]

                # separate out Andhra Pradesh data
                df_ap = self.df.loc[self.df['location_id'] == orig_id]

                # do location splitting
                df = self.adjust_ap_telangana(orig_id, new_id, env_wide, df_ap)

                df_list.append(df)

            # remove ap from incoming data
            ap_ids = [self.split_ids["urban"][0]
                      ] + [self.split_ids["rural"][0]]
            df_no_ap = self.df.loc[~(self.df['location_id'].isin(ap_ids))]
            df_list.append(df_no_ap)

            df = pd.concat(df_list, ignore_index=True)

            # compare start and end deaths
            end_deaths = (df.sample_size * df.cf).sum()
            assert np.isclose(start_deaths, end_deaths, rtol=0.001)

        # make sure not to add any new columns
        df = df[self.orig_cols]

        return df