def get_acause(model_version_id, connection): call = ''' SELECT cause_id FROM cod.model_version WHERE model_version_id = {model_version_id} '''.format(model_version_id=model_version_id) cause_id = query(call, connection)["cause_id"][0] call = ''' SELECT acause from shared.cause WHERE cause_id = {cause_id} '''.format(cause_id=cause_id) acause = query(call, connection)["acause"][0] return acause
def new_covariates(old_model_version_id, new_model_version_id, db_connection): ''' (int, int) -> None Given two integers representing a previous model version ID used for CODEm and a new model, update the new model to use the newest version of the covariates by inputting those values in the production database. ''' call = ''' SELECT * FROM cod.model_covariate where model_version_id = {model} '''.format(model=old_model_version_id) df = db_connect.query(call, db_connection) df["model_version_id"] = new_model_version_id df["covariate_model_version_id"] = \ df.covariate_model_version_id.map(lambda x: get_latest_covariate(x, db_connection)) drop_cols = [ "model_covariate_id", "date_inserted", "inserted_by", "last_updated", "last_updated_by", "last_updated_action" ] df.drop(drop_cols, axis=1, inplace=True) db_connect.write_df_to_sql(df, db="cod", table="model_covariate", connection=db_connection, creds=None)
def update_status(status, model_version_id, connection): ''' :param status: int the target status, e.g. 0 for pending, 1 for success, 7 for failure :param model_version_id: int which model to update the status of :param connection: string the database (server) to connect to :return: None ''' call = ''' UPDATE cod.model_version SET status = {status} WHERE model_version_id = {mvid} '''.format(status=status, mvid=model_version_id) query(call, connection)
def get_acause(model_version_id, db_connection): """ :param model_version_id: integer representing a codem model version id :return: cause Given a model version id returns the corresponding cause. """ call = ''' SELECT cause_id FROM cod.model_version WHERE model_version_id = {model_version_id} '''.format(model_version_id=model_version_id) cause_id = np.array(query(call, db_connection))[0, 0] call = ''' SELECT acause FROM shared.cause WHERE cause_id = {cause_id} '''.format(cause_id=cause_id) acause = np.array(query(call, db_connection))[0, 0] return acause
def location_ids_for_covariate(covariate_model_version_id, db_connection): call = ''' SELECT DISTINCT location_id FROM covariate.model WHERE model_version_id = {covariate_model_version_id} ORDER BY location_id '''.format(covariate_model_version_id=covariate_model_version_id) locations = np.array(query(call, db_connection))[:, 0] return locations
def covariate_id_to_short_name(covariate_model_version_id, db_connection): call = ''' SELECT covariate_name_short FROM shared.covariate WHERE covariate_id IN (SELECT covariate_id from covariate.data_version WHERE data_version_id IN (SELECT data_version_id FROM covariate.model_version WHERE model_version_id = {covariate_model_version_id})) '''.format(covariate_model_version_id=covariate_model_version_id) covariate_short_name = np.array(query(call, db_connection))[0, 0] return covariate_short_name
def set_rerun_models(list_of_models, gbd_round_id, db_connection, desc=None): """ set of codem models based on past model version id :param list_of_models: list like list like object of model version ids :param desc: str string to use for description name """ models_string = "(" + ",".join([str(l) for l in set(list_of_models)]) + ")" branch = "master" drop_cols = [ "model_version_id", "date_inserted", "inserted_by", "last_updated", "last_updated_by", "last_updated_action" ] null_cols = [ "pv_rmse_in", "pv_rmse_out", "pv_coverage_in", "pv_coverage_out", "pv_trend_in", "pv_trend_out", "pv_psi", "best_start", "best_end", ] call = ''' SELECT * FROM cod.model_version WHERE model_version_id IN {ms} '''.format(ms=models_string) df = db_connect.query(call, db_connection) df["description"] = desc df["code_version"] = '{"branch": "%s", "commit": "%s"}' % ( branch, get_current_commit(branch)) df[["status", "is_best"]] = 0 df["previous_model_version_id"] = df["model_version_id"] df["location_set_version_id"] = current_location_set_id( gbd_round_id, db_connection) df["cause_set_version_id"] = current_cause_set_id(gbd_round_id, db_connection) df["gbd_round_id"] = gbd_round_id df["locations_exclude"] = get_excluded_locations(df) # for right now replace oldest age group 21 with new oldest age group # we probably want to change this in the future df.loc[df.age_end == 21, "age_end"] = 235 df[null_cols] = np.NaN df.drop(drop_cols, axis=1, inplace=True) models = db_connect.write_df_to_sql(df, db="cod", table="model_version", connection=db_connection, creds=None, return_key=True) return models
def cause_list(db_connection): """ Pull the cause list from the database. """ call = ''' SELECT cause_id, cause_name FROM shared.cause; ''' data = query(call, db_connection) data = data.rename(columns={"cause_name": "Cause"}) return data
def age_list(db_connection): """ Pull the age list from the database. """ call = ''' SELECT age_group_id, age_group_alternative_name as age_name FROM shared.age_group; ''' data = query(call, db_connection) return data
def get_latest_covariate(model_version_id, acause, db_connection, gbd_round_id): ''' Given a covariate model version id find the best model version id for that covariate. :param model_version_id: int model version id to look up and create duplicate of :param gbd_round_id: int gbd round id of the CURRENT model run :return: int new model version id ''' covariate_id = get_covariate_id(model_version_id) # really sloppy stuff for covariate deprecation if covariate_id == 1: covariate_id = 2 if covariate_id == 41: if "nutrition" not in acause: covariate_id = 911 else: covariate_id = 914 if covariate_id == 78: if "nutrition" not in acause: covariate_id = 917 else: covariate_id = 920 if covariate_id == 103: covariate_id = 105 if covariate_id == 104: covariate_id = 105 if covariate_id == 136: if "nutrition" not in acause: covariate_id = 938 else: covariate_id = 941 if covariate_id == 153: if "nutrition" not in acause: covariate_id = 965 else: covariate_id = 968 if covariate_id in [9, 52, 81, 95, 150, 971]: covariate_id = None if covariate_id is not None: call = ''' SELECT model_version_id FROM covariate.model_version WHERE data_version_id IN (SELECT data_version_id FROM covariate.data_version WHERE covariate_id={cid}) AND is_best=1 AND gbd_round_id = {gbd} '''.format(cid=covariate_id, gbd=gbd_round_id) new_model = db_connect.query(call, 'DATABASE')["model_version_id"][0] else: new_model = np.nan return new_model
def get_codem_version(model_version_id, db_connection): ''' (str) -> str, str Given a model version id gets the branch and commit of the code ''' call = "SELECT code_version FROM cod.model_version WHERE model_version_id = {0}" result = json.loads( db_connect.query(call.format(model_version_id), connection=db_connection)["code_version"][0]) return result["branch"], result["commit"]
def show_model_status(model_version_id, db_conn="INSERT_DATABASE_NAME"): """ Pulls the logs from the DB of a model run. """ call = ''' SELECT model_version_log_entry, date_inserted FROM cod.model_version_log WHERE model_version_id = {} ORDER BY date_inserted DESC '''.format(model_version_id) df = db_connect.query(call, db_conn) return df
def get_model_covariates(model_version_id, db_connection): """ :param model_version_id: :return: """ call = ''' SELECT covariate_model_version_id FROM cod.model_covariate WHERE model_version_id = "{model_version_id}" '''.format(model_version_id=model_version_id) covariate_ids = np.array(query(call, db_connection))[:, 0] return covariate_ids
def get_modeled_locations(self): """ """ loc_ids = tuple(self.df.location_id.unique()) call = """ SELECT location_name FROM shared.location WHERE location_id in {} """.format(loc_ids) loc_names = db_connect.query(call, self.db_connection).values.tolist() return pd.DataFrame({"id": loc_ids, "name": loc_names})
def get_submodels(model_version_id, db_connection): """ :param model_version_id: integer representing a codem model version id :return: Pandas Data frame with submodels and corresponding information """ call = ''' SELECT submodel_version_id, rank, weight, submodel_type_id, submodel_dep_id FROM cod.submodel_version WHERE model_version_id = {model_version_id} '''.format(model_version_id=model_version_id) df = query(call, db_connection) return df
def get_modeler(acause, connection): ''' string -> string Look up the modeler for the cause being modeled in the cod.modeler table ''' call = """ SELECT m.username FROM cod.modeler m INNER JOIN shared.cause c ON m.cause_id = c.cause_id WHERE c.acause = '{acause}' """.format(acause=acause) modeler = query(call, connection) return modeler.ix[0, 0].split(', ')
def get_broken_models(db_connection): ''' () -> 1d-array Gets all the runs of codem that that are on the production database that failed the run and returns their model version id numbers in an array. ''' call = ''' SELECT model_version_id FROM cod.model_version WHERE description = "2015 full run test 6/10" AND pv_rmse_in IS NULL; ''' broken_models = np.array(query(call, db_connection)) return sorted(broken_models[:, 0])
def all_submodel_covs(model_version_id, db_connection): """ :param model_version_id: integer representing a codem model version id :return: Pandas Data frame with submodels, covariates, and corresponding information """ submodels = get_submodels(model_version_id, db_connection) covs = pd.concat([submodel_covs(x) for x in submodels.submodel_version_id], axis=0).reset_index(drop=True) df = covs.merge(submodels, how="left") df = df.sort(["rank", "covariate_name_short"]) call = ''' SELECT submodel_type_id, submodel_type_name FROM cod.submodel_type; ''' df2 = query(call, db_connection) df = df.merge(df2, how="left") call = ''' SELECT submodel_dep_id, submodel_dep_name FROM cod.submodel_dep; ''' df2 = query(call, db_connection) df = df.merge(df2, how="left") df.drop(["submodel_type_id", "submodel_dep_id"], inplace=True, axis=1) df = df.sort(["rank", "covariate_name_short"]) df["approximate_draws"] = np.round(df.weight.values * 1000.) return df
def current_cause_set_id(gbd_round_id, db_connection): """ Looks up the latest 2015 cause set id for codem :return: int latest 2015 cause set id """ call = ''' SELECT cause_set_version_id FROM shared.cause_set_version_active WHERE cause_set_id = 4 # referes to the set of causes specific to codem AND gbd_round_id = {}; '''.format(gbd_round_id) cause_set_version_id = \ db_connect.query(call, db_connection)["cause_set_version_id"][0] return cause_set_version_id
def current_location_set_id(gbd_round_id, db_connection): """ Looks up the latest 2015 location set id for codem :return: int latest 2015 location set id """ call = ''' SELECT location_set_version_id FROM shared.location_set_version_active WHERE location_set_id = 35 # this location set id refres specifically to the hierarchy that codem uses AND gbd_round_id = {}; '''.format(gbd_round_id) location_set_version_id = \ db_connect.query(call, db_connection)["location_set_version_id"][0] return location_set_version_id
def get_model_metadata(model_version_ids, db): model_list = ', '.join([str(model) for model in model_version_ids]) call = ''' SELECT mv.model_version_id, mv.model_version_type_id, mv.sex_id, mv.cause_id, sc.cause_name, mv.age_start, mv.age_end FROM cod.model_version mv INNER JOIN shared.cause sc ON mv.cause_id = sc.cause_id WHERE mv.model_version_id IN ({mvids}) '''.format(mvids=model_list) print call df = db_connect.query(call, db) return df
def best_model_covs(acause, db_connection, year=2015): """ :param acause: string representing a valid acause :return: Pandas Data frame with submodels, covariates, and corresponding information Given a string that represents a valid acause in the GBD heirarchy retruns the best models submodel covariates as well as metadata regarding those models. """ call = ''' SELECT model_version_id FROM cod.model_version WHERE cause_id=(SELECT cause_id FROM shared.cause WHERE acause="{acause}") AND date_inserted > '{year}-01-01' ORDER BY pv_rmse_out limit 1 '''.format(acause=acause, year=year) model_version_id = np.array(query(call, db_connection))[0, 0] return all_submodel_covs(model_version_id, db_connection)
def get_broken_models(db_connection, description="relaunch new CoD data"): """ () -> 1d-array Gets all the runs of codem that that are on the production database that failed the 2015 run and returns their model version id numbers in an array. """ call = ''' SELECT model_version_id FROM cod.model_version WHERE (description = "{description}" OR description LIKE "%%{description}:%%") AND pv_rmse_in IS NULL; '''.format(description=description) broken_models = np.array(query(call, db_connection)) return sorted(broken_models[:, 0])
def check_model_count(model_version_id, db_connection, broken=False): ''' (int) -> int Returns the numne ''' base = "PATH_TO_CV_SELECTED" if broken: number = len(import_json(base)["cf_vars"]) + len( import_json(base)["rate_vars"]) * 2 else: call = ''' SELECT submodel_version_id FROM cod.submodel_version WHERE model_version_id={model_version_id}; '''.format(model_version_id=model_version_id) df = query(call, db_connection) number = df.shape[0] return number
def get_covariate_id(model_version_id): """Given a covariate model version ID find the covariate ID.""" call = ''' SELECT DISTINCT sc.covariate_id, sc.covariate_name FROM covariate.model_version cmv INNER JOIN covariate.data_version cdv ON cdv.data_version_id = cmv.data_version_id INNER JOIN shared.covariate sc ON sc.covariate_id = cdv.covariate_id WHERE cmv.model_version_id = {mvid} '''.format(mvid=model_version_id) covariate_id = db_connect.query(call, 'DATABASE')["covariate_id"][0] return covariate_id
def get_acause(model_version_id, db_connection): """ Get acause from CODEm model version ID :param model_version_id: :return: str with acause """ call = ''' SELECT sc.acause AS acause FROM shared.cause sc INNER JOIN cod.model_version mv ON sc.cause_id = mv.cause_id WHERE mv.model_version_id = {mvid} '''.format(mvid=model_version_id) acause = db_connect.query(call, db_connection)["acause"][0] return acause
def set_new_covariates(models, db_connection): """ Sets the covariates for all the new models using their prior selected covariates :param models: list of int list of models to add covariates for :param db_connection: str db to connect to """ model_str = ', '.join([str(x) for x in models]) call = ''' SELECT model_version_id, previous_model_version_id FROM cod.model_version WHERE model_version_id IN ({model_str}); '''.format(model_str=model_str) df = db_connect.query(call, db_connection) for i in range(df.shape[0]): new_covariates(df["previous_model_version_id"][i], df["model_version_id"][i], db_connection)
def submodel_covs(submodel_version_id, db_connection): """ :param submodel_version_id: integer representing a codem submodel version id :return: Pandas data frame with information on submodel covariates Given a submodel version id returns the covariates that were used in the construction of that model. """ call = ''' SELECT covariate_name_short FROM shared.covariate WHERE covariate_id IN (SELECT covariate_id from covariate.data_version WHERE data_version_id IN (SELECT data_version_id FROM covariate.model_version WHERE model_version_id IN (SELECT covariate_model_version_id FROM cod.submodel_version_covariate WHERE submodel_version_id={submodel_version_id}))) '''.format(submodel_version_id=submodel_version_id) df = query(call, db_connection) df["submodel_version_id"] = submodel_version_id return df
def remove_covariates(df, covariates, db_connection): ''' (df, list) -> df Removes the rows containing the input covariates from the input df. The covariates in the df are stored as covariate model version ids, and the input covariates should be in the covariate_name_short form ''' call = '''SELECT mv.model_version_id FROM covariate.model_version mv INNER JOIN covariate.data_version dv ON mv.data_version_id = dv.data_version_id INNER JOIN shared.covariate sc ON dv.covariate_id = sc.covariate_id WHERE sc.covariate_name_short = "{}"''' for covariate in covariates: cov_mvids = db_connect.query(call.format(covariate), db_connection) cov_mvids = cov_mvids.model_version_id.values df = df[~df.covariate_model_version_id.isin(cov_mvids)] return df
def get_latest_covariate(model_version_id, db_connection): ''' Given a covariate model version id find the best model version id for that covariate. :param model_version_id: int model version id to look up and create duplicate of :return: int new model version id ''' call = ''' SELECT model_version_id FROM covariate.model_version WHERE data_version_id IN (SELECT data_version_id FROM covariate.data_version WHERE covariate_id=(SELECT covariate_id FROM covariate.data_version WHERE data_version_id =(SELECT data_version_id FROM covariate.model_version WHERE model_version_id={model}))) AND is_best=1; '''.format(model=model_version_id) new_model = db_connect.query(call, db_connection)["model_version_id"][0] return new_model