def log_branch(model_version_id, db_connection, branch_name=None): """ Logs which branch a model is running on. Only should log if it wasn't submitted via CodViz. If it already has a branch, check to make sure it's running in the right environment. :param model_version_id: (int) model version ID :param db_connection: (str) database connection :param branch_name: (str) optional branch name to use for testing :return: """ if branch_name is None: branch_name = get_current_branch() logged_branch = get_logged_branch(model_version_id, db_connection) if logged_branch is None: current_version = '{"branch": "' + branch_name + '"}' update_call = f"UPDATE cod.model_version SET code_version = '{current_version}' " \ f"WHERE model_version_id = {model_version_id}" db_connect.query(update_call, connection=db_connection) else: logger.info(f"This model is running under {branch_name}.") if logged_branch != branch_name: raise RuntimeError( f"This model is running under {branch_name} but it should be" f" running under {logged_branch} according to cod.model_version table." ) return branch_name
def change_model_status(model_version_id, status, db_connection): if model_version_id is None: raise ValueError("Cannot pass a model version ID of None.") call = """ UPDATE cod.model_version SET status = {} WHERE model_version_id = {} """.format(status, model_version_id) db_connect.query(call, db_connection)
def time_stamp(self, procedure): ''' (self, str) -> None Write to the database when a particular part of the procedure has started so that it can be read by the CodViz tool. This enables users to see what stage a model is currently at (i.e. covariate selection, linear model building, space time smoothing, gaussian process, etc.). ''' phrase = "%s started." % procedure call = QS.status_write.format(self.model_version_id, phrase) db_connect.query(call, self.db_connection) self.add_warning(phrase)
def write_submodel(model_version_id, submodel_type_id, submodel_dep_id, weight, rank, db_connection): """ (int, int, int, float, int) -> int Write a submodel to the table and get the id back """ call = QS.submodel_query_str.format(model_version_id, submodel_type_id, submodel_dep_id, weight, rank) db_connect.query(call, db_connection) call = QS.submodel_get_id.format(model_version_id, rank) submodel_id_df = db_connect.query(call, db_connection) submodel_id = submodel_id_df["submodel_version_id"][0] return submodel_id
def get_old_covariates(old_model_version_id, db_connection, gbd_round_id, decomp_step_id, delete_covs=None): """ Get existing covariate data frame for old model version. :param old_model_version_id: int :param db_connection: str :param gbd_round_id: int :param decomp_step_id: int :return: """ call = '''SELECT * FROM cod.model_covariate where model_version_id = {model} '''.format(model=old_model_version_id) df = db_connect.query(call, db_connection) df["covariate_id"] = df.covariate_model_version_id.map( lambda x: get_covariate_id(x)) if delete_covs is not None: df = df.loc[~df.covariate_id.isin(delete_covs)] df["covariate_model_version_id"] = df.covariate_id.map( lambda x: get_latest_covariate(x, gbd_round_id, decomp_step_id)) df.drop([ "model_covariate_id", "date_inserted", "inserted_by", "last_updated", "last_updated_by", "last_updated_action" ], axis=1, inplace=True) return df
def get_central_run_models(central_run, wave, gbd_round_id, decomp_step_id, db_connection, status=None): """ Gets the models run during the central run, looking by description. Used to see if models are still pending or finished/failed. :param central_run: :param wave: :param gbd_round_id: :param decomp_step_id: (int) :param status: list of int :param db_connection: (str) :return: """ call = ''' SELECT model_version_id, status, description FROM cod.model_version WHERE gbd_round_id = {gbd} AND description like "central codem run {run}, wave {wave}%%" AND decomp_step_id = {decomp_step_id} '''.format(gbd=gbd_round_id, run=central_run, wave=wave, decomp_step_id=decomp_step_id) if status is not None: call = call + ' AND status IN ({})'.format(', '.join( [str(x) for x in status])) mvids = query(call, db_connection) return mvids
def get_submodel_summary(model_version_id, db_connection): """ Retrieves the summary submodel rank table for a particular model. """ logger.info("Making submodel summary table.") call = QS.submodel_summary_query.format(model_version_id) df = db_connect.query(call, db_connection) return df
def get_modeler(cause_id, gbd_round_id, db_connection): """ Gets the modelers from the cod.modeler table. """ call = f'''SELECT username FROM cod.modeler WHERE cause_id = '{cause_id}' AND gbd_round_id = {gbd_round_id}''' modeler = db_connect.query(call, db_connection) return modeler.ix[0, 0].split(', ')
def get_covariate_metadata(model_version_id, db_connection): """ integer -> Pandas data frame Given an integer that represents a valid model ID number, will return a pandas data frame which contains the covariate model ID's for that model as well as the metadata needed for covariate selection. Note: this was re-worked to call the covariate prior information from the cod database (either dev or prod) and then covariate names from ADDRESS because the model versions must be merged with many tables only available in the covariate database. """ df = db_connect.query(QS.metaQueryStr.format(mvid=model_version_id), db_connection) models = df.covariate_model_id.values.tolist() names = db_connect.query( QS.covNameQueryStr.format(', '.join((str(x) for x in models))), 'ADDRESS') df = df.merge(names, on='covariate_model_id') return df
def get_run_id(process_version_id): """ Query the mortality database to get the run_id associated :param process_version_id: (int) process version ID for mortality DB :return: (int) run_id """ call = "SELECT run_id FROM mortality.process_version WHERE proc_version_id = {}" run_id = db_connect.query(call.format(process_version_id), 'ADDRESS')['run_id'][0] return run_id
def get_refresh_id(decomp_step_id, db_connection): """ Function to pull the best refresh ID for a given decomp step. :param decomp_step_id: (int) decomp step ID :param db_connection: (str) database to connect to :return: """ call = ''' SELECT refresh_id FROM cod.decomp_refresh_version WHERE decomp_step_id = {} AND is_best = 1'''.format(decomp_step_id) refresh_id = db_connect.query(call, db_connection)['refresh_id'][0] return refresh_id
def get_cod_data(cause_id, sex, start_year, start_age, end_age, location_set_version_id, refresh_id, outlier_decomp_step_id, db_connection, model_version_id, gbd_round_id, outlier_model_version_id): """ strings indicating model parameters -> Pandas Data Frame Given a list of model parameters will query from the COD database and return a pandas data frame. The data frame contains the base variables used in the CODEm process. Also will call the outlier stored procedure in the database to save model outliers if """ logger.info( f"Querying cod data for refresh {refresh_id} and decomp {outlier_decomp_step_id} outliers." ) if not exists_in_outlier_history(model_version_id=model_version_id, connection=db_connection): if model_version_id in outlier_model_version_id: logger.info( f"Running the outlier stored procedure for decomp_step_id {outlier_decomp_step_id}" ) save_model_outliers(model_version_id=model_version_id, gbd_round_id=gbd_round_id, decomp_step_id=outlier_decomp_step_id, connection=db_connection) else: for out in outlier_model_version_id: logger.info( f"Running the outlier stored procedure to copy outliers from" f"{out} to {model_version_id}") copy_model_outliers(old_model_version_id=out, new_model_version_id=model_version_id, connection=db_connection) else: logger.warning( "The outlier model version already exists in the table, therefore" "we aren't copying it over.") pass logger.info(f"Querying cod data for refresh {refresh_id}.") call = QS.codQueryStr.format(c=cause_id, s=sex, sy=start_year, sa=start_age, ea=end_age, loc_set_id=location_set_version_id, rv=refresh_id, model_version_id=model_version_id) df = db_connect.query(call, db_connection) df['national'] = df['national'].map(lambda x: x == 1).astype(int) return df
def outlier_decomp_step_from_decomp_step(decomp_step_id, db_connection): """ Get the outlier step associated with decomp step ID passed. :param decomp_step_id: (int) :param db_connection: (str) :return: """ call = ("SELECT outlier_decomp_step_id AS o " "FROM cod.decomp_outlier_step " f"WHERE decomp_step_id = {decomp_step_id}") outlier_decomp_step_id = db_connect.query(call, db_connection)['o'][0] return outlier_decomp_step_id
def current_cause_set_id(gbd_round_id, db_connection): """ Looks up the latest 2015 cause set id for codem :return: int latest 2015 cause set id """ call = ''' SELECT cause_set_version_id FROM shared.cause_set_version_active WHERE cause_set_id = 4 # referes to the set of causes specific to codem AND gbd_round_id = {}; '''.format(gbd_round_id) cause_set_version_id = \ db_connect.query(call, db_connection)["cause_set_version_id"][0] return cause_set_version_id
def current_location_set_id(gbd_round_id, db_connection): """ Looks up the latest 2015 location set id for codem :return: int latest 2015 location set id """ call = ''' SELECT location_set_version_id FROM shared.location_set_version_active WHERE location_set_id = 35 # this location set id refres specifically to the hierarchy that codem uses AND gbd_round_id = {}; '''.format(gbd_round_id) location_set_version_id = \ db_connect.query(call, db_connection)["location_set_version_id"][0] return location_set_version_id
def get_logged_branch(model_version_id, db_connection): """ Get the logged branch in the database. :param model_version_id: :param db_connection: :return: """ call = f"SELECT code_version FROM cod.model_version WHERE model_version_id = {model_version_id}" logged_version = db_connect.query( call, connection=db_connection)['code_version'][0] if logged_version is None or logged_version == 'null': branch = None else: branch = json.loads(logged_version)['branch'] return branch
def get_mods_info(models, db_connection): """ gets the cause_id, cause_name, modeler, inserted_by, model_version_type, and sex_id for each model in 'models' from the database 'db' :param models: list of ints :param db_connection: string """ call = ''' SELECT cmv.cause_id, sc.cause_name, sc.acause, cmv.model_version_id, cmv.sex_id, cmv.model_version_type_id, cmv.age_start, cmv.age_end, cmv.description, cmv.status, c.model_version_log_entry, c.date_inserted FROM cod.model_version cmv INNER JOIN (SELECT cml.model_version_log_entry, cml.model_version_id, cml.date_inserted FROM cod.model_version_log cml JOIN (SELECT model_version_id, MAX(date_inserted) ins FROM cod.model_version_log GROUP BY model_version_id) l ON l.model_version_id = cml.model_version_id AND l.ins = cml.date_inserted) c ON c.model_version_id = cmv.model_version_id INNER JOIN shared.cause sc ON sc.cause_id = cmv.cause_id WHERE cmv.model_version_id IN ( {} ) '''.format(', '.join([str(x) for x in models])) df = query(call, db_connection) return df
def exists_in_outlier_history(model_version_id, connection): """ Check to see if this model version already exists in the outlier history table. :param model_version_id: (int) :param connection: (str) :return: """ logger.info( f"Checking to make sure that {model_version_id} does not exist in the outlier history table." ) call = f"SELECT COUNT(*) AS count FROM cod.outlier_history WHERE model_version_id = {model_version_id}" count = db_connect.query(call, connection=connection)['count'][0] if count: logger.info( f"The model version {model_version_id} already exists in the outlier history table." ) return count
def get_covariate_id(model_version_id): """Given a covariate model version ID find the covariate ID.""" logger.info( "Getting covariate ID for model version {}".format(model_version_id)) call = ''' SELECT DISTINCT sc.covariate_id, sc.covariate_name FROM covariate.model_version cmv INNER JOIN shared.covariate sc ON sc.covariate_id = cmv.covariate_id WHERE cmv.model_version_id = {mvid} '''.format(mvid=model_version_id) covariate_id = db_connect.query(call, 'ADDRESS')["covariate_id"][0] return covariate_id
def get_latest_covariate(covariate_id, gbd_round_id, decomp_step_id): """ Given a covariate ID find the best covariate_model_version_id for that covariate. :param covariate_id: int covariate id to look up :param gbd_round_id: int gbd round id of the CURRENT model run :param decomp_step_id: int decomposition step ID :return: int new model version id """ if not is_valid_covariate(covariate_id): new_model = np.nan elif covariate_id is not None: call = ''' SELECT mv.model_version_id FROM covariate.model_version mv JOIN covariate.decomp_model_version dmv ON mv.model_version_id = dmv.model_version_id WHERE mv.covariate_id = {cid} AND mv.gbd_round_id = {gbd} AND dmv.decomp_step_id = {dsid} AND dmv.is_best = 1 '''.format(cid=covariate_id, gbd=gbd_round_id, dsid=decomp_step_id) try: new_model = db_connect.query(call, 'ADDRESS')["model_version_id"][0] except IndexError: logger.info( "There is NO best covariate for GBD round ID {} for covariate ID {}" "and decomp step {}".format(gbd_round_id, covariate_id, decomp_step_id)) raise RuntimeError( "Cannot run model because there is no best covariate ID {} for " "GBD round ID {} and decomp step {}".format( covariate_id, gbd_round_id, decomp_step_id)) else: new_model = np.nan return new_model
def check_if_new_cause_for_step_3(cause_id, gbd_round_id, db_connection): """ Check if something is a new cause being modeled by CODEm during the methods change step for decomp (decomp_step_id=3) :param cause_id: (int) :param gbd_round_id (int) :param db_connection (str) :return: (bool) """ call = (f'SELECT COUNT(*) AS count ' f'FROM cod.model_version ' f'WHERE cause_id = {cause_id} ' f'AND gbd_round_id = {gbd_round_id} ' f'AND model_version_type_id IN (0, 1, 2) ' f'AND decomp_step_id IN (1, 2)') count = db_connect.query(call, db_connection)['count'][0] return count == 0
def create_age_df(db_connection): """ None -> Pandas data frame Creates a Pandas data frame with two columns, all the age groups currently used in analysis at IHME as noted by the data base as well as a column with the code used for the aggregate group. """ # this age_group_set_id is currently specific to gbd 2016 call = """ SELECT age_group_id as all_ages FROM shared.age_group_set_list WHERE age_group_set_id = 12 AND is_estimate = 1; """ age_df_22 = db_connect.query(call, db_connection) age_df_27 = age_df_22.copy(deep=True) age_df_22['age'] = 22 age_df_27['age'] = 27 return pd.concat([age_df_22, age_df_27], ignore_index=True)
def get_hybrids_to_launch(central_run, decomp_step_id, gbd_round_id, sex_id, db_connection='ADDRESS'): """ Get all hybrids that need to be run :param central_run: (int) :param decomp_step_id: (int) :param gbd_round_id: (int) :param sex_id: (int) only this because we want to run multiple workflows at same time :param db_connection: (str) :return: """ call = f''' SELECT MAX(cmv.model_version_id) AS model_version_id, cmv.cause_id, cmv.sex_id, cmv.model_version_type_id, cmv.age_start, cmv.age_end FROM cod.model_version cmv WHERE cmv.model_version_id NOT IN ( SELECT cmr.child_id FROM cod.model_version_relation cmr INNER JOIN cod.model_version cmvv ON cmvv.model_version_id = cmr.parent_id WHERE cmvv.status = 1 ) AND cmv.description LIKE "central codem run {central_run}%%" AND cmv.gbd_round_id = {gbd_round_id} AND cmv.decomp_step_id = {decomp_step_id} AND cmv.status = 1 AND cmv.sex_id = {sex_id} GROUP BY cmv.cause_id, cmv.sex_id, cmv.model_version_type_id, cmv.age_start, cmv.age_end ''' df = query(call, db_connection) df = df.pivot_table(index=['cause_id', 'sex_id', 'age_start', 'age_end'], columns=['model_version_type_id'], values=['model_version_id']) df.columns = df.columns.droplevel() df.reset_index(inplace=True) df.columns = [ 'cause_id', 'sex_id', 'age_start', 'age_end', 'global', 'data_rich' ] valid_df = df.loc[(~df['global'].isnull()) & (~df['data_rich'].isnull())] return valid_df
def check_sex_restrictions(cause_id, sex_id, gbd_round_id=5, db_connection='ADDRESS'): sex = {1: 'male', 2: 'female'}[sex_id] call = ''' SELECT {} FROM shared.cause_hierarchy_history ch INNER JOIN shared.cause_set_version_active ca ON ca.cause_set_version_id = ch.cause_set_version_id WHERE cause_id = {} AND ca.gbd_round_id = {} AND ca.cause_set_id = 3 '''.format(sex, cause_id, gbd_round_id) valid = query(call, db_connection)[sex].iloc[0] if valid is None: valid = 0 return bool(valid)
def submodel_rank(self): """ Pandas data frame Returns a dataframe of the submodels for the model version id, ranked by weight. """ call = '''SELECT submodel_version_id, submodel_type_id, submodel_dep_id, weight, rank FROM cod.submodel_version WHERE model_version_id = {} ''' call = call.format(self.model_version_id) df = db_connect.query(call, self.db_connection) return df.sort("rank")
def get_location_info(location_set_version_id, standard_location_set_version_id, db_connection): """ list -> Pandas Data Frame Given a list of country ID numbers will query from the mortality database and return a pandas data frame. The data frame contains columns for location, super region and region ID. """ logger.info("Querying location information.") call = QS.locQueryStr.format( loc_set_ver_id=location_set_version_id, s_loc_set_ver_id=standard_location_set_version_id) df = db_connect.query(call, db_connection) df["path_to_top_parent"] = \ df["path_to_top_parent"].map(lambda x: ",".join((x[2:]).split(",")[:3])) arr = np.array(list(df.path_to_top_parent.map(lambda x: x.split(",")))) df2 = pd.DataFrame(arr.astype(int), columns=["super_region", "region", "country_id"]) return pd.concat( [df[["location_id", "is_estimate", "standard_location"]], df2], axis=1)
def check_model_attribute(model_version_id, model_attribute_name, model_attribute, db_connection='ADDRESS'): """ Checks that the specific model version is truly associated with the model attribute that is specified. :param model_version_id: :param model_attribute_name: :param model_attribute: :param db_connection: :return: """ call = ''' SELECT {} FROM cod.model_version WHERE model_version_id = {} '''.format(model_attribute_name, model_version_id) if not query( call, db_connection)[model_attribute_name].iloc[0] == model_attribute: raise ValueError( 'The model attribute for {} in model_version {} does not match up!' .format(model_attribute_name, model_version_id))
def get_best_process_version_id(decomp_step_id, process_id, gbd_round_id, db_connection): """ Get the best process version from the mortality database. :param decomp_step_id: (int) decomp step :param process_id: (int) process ID for envelope or population :param gbd_round_id: (int) gbd round ID :param db_connection: (int) database connection :return: (int) process version ID """ call = ''' SELECT mdp.proc_version_id FROM mortality.decomp_process_version mdp INNER JOIN mortality.process_version mp ON mp.proc_version_id = mdp.proc_version_id WHERE mdp.decomp_step_id = {} AND mdp.is_best = 1 AND mp.gbd_round_id = {} AND mp.process_id = {} '''.format(decomp_step_id, gbd_round_id, process_id) process_version_id = db_connect.query(call, db_connection)['proc_version_id'][0] return process_version_id
def get_runtime(df, db_connection='ADDRESS', covariates=False): """ Given a dataframe including model_version_ids, return the same dataframe with columns including the runtime associated with each model (both in seconds and as days/hours/minutes Select covariates=True to just get the runtime of covariate selection """ logger.info("Calculating runtime.") call = ''' SELECT min(date_inserted) as start, max(date_inserted) as end FROM cod.model_version_log WHERE model_version_id = {mvid} ''' if covariates: call += ''' AND model_version_log_entry IN ('Running covariate selection started.', 'Running KO process started.')''' else: pass df['total_seconds'] = 0 df['days_hrs_mins'] = '' # for each model version id, calculate the runtime and store it in the df for index, row in df.iterrows(): model_version_id = row['model_version_id'] times = query(call.format(mvid=model_version_id), db_connection) start_time = datetime.strptime(str(times.ix[0, 'start']), '%Y-%m-%d %H:%M:%S') end_time = datetime.strptime(str(times.ix[0, 'end']), '%Y-%m-%d %H:%M:%S') delta = end_time - start_time df.set_value(index, 'total_seconds', delta.total_seconds()) df.set_value(index, 'days_hrs_mins', days_hours_minutes(delta)) return df
def set_new_covariates(models, db_connection, gbd_round_id, decomp_step_id, additional_covariates=None, delete_covariates=None): """ Sets the covariates for all the new models using their prior selected covariates :param models: list of int list of models to add covariates for :param db_connection: str db to connect to :param gbd_round_id: int gbd round ID :param decomp_step_id: int decomp step ID :param additional_covariates: dict dictionary of additional covariates to add, and any non-default features that they need. :param delete_covariates: list of ints list of covariate IDs to delete from the model """ logger.info("Setting new covariates.") call = '''SELECT model_version_id, previous_model_version_id FROM cod.model_version WHERE model_version_id IN ({model_str}); '''.format(model_str=', '.join([str(x) for x in models])) df = db_connect.query(call, db_connection) for i in range(df.shape[0]): new_covariates(df['previous_model_version_id'][i], df['model_version_id'][i], db_connection, gbd_round_id=gbd_round_id, decomp_step_id=decomp_step_id, add_covs=additional_covariates, delete_covs=delete_covariates)