def all_submodel_covs(model_version_id, db_connection): """ :param model_version_id: integer representing a codem model version id :return: Pandas Data frame with submodels, covariates, and other info """ submodels = get_submodels(model_version_id, db_connection) covs = pd.concat([ submodel_covs(x, db_connection) for x in submodels.submodel_version_id ], axis=0).reset_index(drop=True) df = covs.merge(submodels, how="left") df = df.sort_values(["rank", "covariate_name_short"]) call = ''' SELECT submodel_type_id, submodel_type_name FROM cod.submodel_type; ''' df2 = db_connect.query(call, db_connection) df = df.merge(df2, how="left") call = ''' SELECT submodel_dep_id, submodel_dep_name FROM cod.submodel_dep; ''' df2 = db_connect.query(call, db_connection) df = df.merge(df2, how="left") df.drop(["submodel_type_id", "submodel_dep_id"], inplace=True, axis=1) df = df.sort_values(["rank", "covariate_name_short"]) df["approximate_draws"] = np.round(df.weight.values * 1000.) return df
def submodel_covs(submodel_version_id, db_connection): """ :param submodel_version_id: integer representing a codem submodel version id :return: Pandas data frame with information on submodel covariates Given a submodel version id returns the covariates that were used in the construction of that model. Note: this function was re-worked with the database migrations for covariates so that it can pull covariate names via the covariate database but submodel version information from the cod database. """ call = ''' SELECT covariate_model_version_id AS ids FROM cod.submodel_version_covariate WHERE submodel_version_id = {} '''.format(submodel_version_id) covs = db_connect.query(call, db_connection).ids.values.tolist() df = db_connect.query( covNameQueryStr.format(", ".join(str(x) for x in covs)), 'modeling-covariates-db') df.rename(columns={'stub': 'covariate_name_short'}, inplace=True) df.drop('covariate_model_id', axis=1, inplace=True) df['submodel_version_id'] = submodel_version_id return df return df
def getModelParams(model_version_id, db_connection, update=False): ''' integer -> dictionary Given an integer that indicates a valid model version id the function will return a dictionary with keys indicating the model parameters start age, end age, sex, start year, cause, and whether to run covariate selection or not. "update" indicates whether during the querying process the database should be updated to running during the querying process, default is False. True should be used when running CODEm. ''' call = "SELECT * FROM cod.model_version WHERE model_version_id = {0}" df = db_connect.query(call.format(model_version_id), db_connection) model = {k: df[k][0] for k in df.columns} model["start_year"] = 1980 call = "SELECT acause FROM shared.cause WHERE cause_id = {0}" aC = db_connect.query(call.format(model["cause_id"]), db_connection)["acause"][0] model["acause"] = aC call = "SELECT gbd_round FROM shared.gbd_round WHERE gbd_round_id = {0}" gbd_round = int( db_connect.query(call.format(model["gbd_round_id"]), db_connection)["gbd_round"][0]) model["gbd_round"] = gbd_round call = """UPDATE cod.model_version SET status = 0 WHERE model_version_id = {0}""" if update: db_connect.query(call.format(model_version_id), db_connection) return model
def time_stamp(self, procedure): ''' (self, str) -> None Write to the database when a particular part of the procedure has started so that it can be read by the CodViz tool. This enables users to see what stage a model is currently at (i.e. covariate selection, linear model building, space time smoothing, gaussian process, etc.). ''' phrase = "%s started." % procedure call = QS.status_write.format(self.model_version_id, phrase) db_connect.query(call, self.db_connection) self.add_warning(phrase)
def write_submodel(model_version_id, submodel_type_id, submodel_dep_id, weight, rank, db_connection): ''' (int, int, int, float, int) -> int Write a submodel to the table and get the id back ''' call = QS.submodel_query_str.format(model_version_id, submodel_type_id, submodel_dep_id, weight, rank) db_connect.query(call, db_connection) call = QS.submodel_get_id.format(model_version_id, rank) submodel_id_df = db_connect.query(call, db_connection) submodel_id = submodel_id_df["submodel_version_id"][0] return submodel_id
def codQuery(cause_id, sex, start_year, start_age, end_age, location_set_version_id, db_connection, model_version_id, gbd_round_id, outlier_save): ''' strings indicating model parameters -> Pandas Data Frame Given a list of model parameters will query from the COD database and return a pandas data frame. The data frame contains the base variables used in the CODEm process. Also will call the outlier stored procedure in the database to save model outliers if outlier_save is marked as True. ''' call = QS.codQueryStr.format(c=cause_id, s=sex, sy=start_year, sa=start_age, ea=end_age, loc_set_id=location_set_version_id) df = db_connect.query(call, db_connection) if outlier_save == True: outlierSPROC(model_version_id, gbd_round_id, db_connection) df['national'] = df['national'].map(lambda x: x == 1).astype(int) return df
def get_submodel_summary(model_version_id, db_connection): ''' (int) -> data_frame Retrieves the summary submodel rank table for a particular model. ''' call = QS.submodel_summary_query.format(model_version_id) df = db_connect.query(call, db_connection) return df
def covMetaData(model_version_id, db_connection): ''' integer -> Pandas data frame Given an integer that represents a valid model ID number, will return a pandas data frame which contains the covariate model ID's for that model as well as the metadata needed for covariate selection. ''' call = QS.metaQueryStr.format(model_version_id) df = db_connect.query(call, db_connection) return df
def get_submodels(model_version_id, db_connection): """ :param model_version_id: integer representing a codem model version id :return: Pandas Data frame with submodels and corresponding information """ call = ''' SELECT submodel_version_id, rank, weight, submodel_type_id, submodel_dep_id FROM cod.submodel_version WHERE model_version_id = {model_version_id} '''.format(model_version_id=model_version_id) df = db_connect.query(call, db_connection) return df
def covMetaData(model_version_id, db_connection): ''' integer -> Pandas data frame Given an integer that represents a valid model ID number, will return a pandas data frame which contains the covariate model ID's for that model as well as the metadata needed for covariate selection. Note: this was re-worked to call the covariate prior information from the cod database (either dev or prod) and then covariate names from modeling-covariate-db because the model versions must be merged with many tables only available in the covariate database. ''' df = db_connect.query(QS.metaQueryStr.format(mvid=model_version_id), db_connection) models = df.covariate_model_id.values.tolist() names = db_connect.query( QS.covNameQueryStr.format(", ".join(str(x) for x in models)), 'DATABASE') df = df.merge(names, on='covariate_model_id') return df
def get_modeler(self): ''' (self) -> string Look up the modeler for the cause being modeled in the cod.modeler table ''' call = """ SELECT m.username FROM cod.modeler m INNER JOIN shared.cause c ON m.cause_id = c.cause_id WHERE c.acause = '{acause}' AND m.gbd_round_id = {gbd} """.format(acause=self.acause, gbd=self.gbd_round_id) modeler = db_connect.query(call, self.db_connection) return modeler.ix[0, 0].split(', ')
def codQuery(cause_id, sex, start_year, start_age, end_age, location_set_version_id, db_connection): ''' Strings indicating model parameters -> Pandas Data Frame Given a list of model parameters will query from the COD database and return a pandas data frame. The data frame contains the base variables used in the CODEm process. ''' call = QS.codQueryStr.format(c=cause_id, s=sex, sy=start_year, sa=start_age, ea=end_age, loc_set_id=location_set_version_id) df = db_connect.query(call, db_connection) df['national'] = df['national'].map(lambda x: x == 1).astype(int) return df
def covQuery(covID, location_set_version_id): ''' integer -> Pandas data frame Given an integer which represents a valid covariate ID will return a data frame which contains a unique value for each country, year, age group. This data may be aggregated in some form as well. ''' call = QS.cvQueryStr.format(mvid=covID, loc_set_id=location_set_version_id) df = db_connect.query(call, 'DATABASE') if df.shape[0] == 0: err_log = "There appears to be an error with covariate id {0}" sys.stderr.write(err_log.format(covID)) sys.exit() df = df.rename(columns={"mean_value": df["name"][0]}) return df
def locQuery(location_set_version_id, db_connection): ''' list -> Pandas Data Frame Given a list of country ID numbers will query from the mortality database and return a pandas data frame. The data frame contains columns for location, super region and region ID. ''' call = QS.locQueryStr.format(loc_set_ver_id=location_set_version_id) df = db_connect.query(call, db_connection) df["path_to_top_parent"] = \ df["path_to_top_parent"].map(lambda x: ",".join((x[2:]).split(",")[:3])) arr = np.array(list(df.path_to_top_parent.map(lambda x: x.split(",")))) df2 = pd.DataFrame(arr.astype(int), columns=["super_region", "region", "country_id"]) return pd.concat([df["location_id"], df2], axis=1)
def createAgeDF(db_connection): ''' None -> Pandas data frame Creates a Pandas data frame with two columns, all the age groups currently used in analysis at IHME as noted by the data base as well as a column with the code used for the aggregate group. ''' # this age_group_set_id is currently specific to gbd 2016 call = """ SELECT age_group_id as all_ages FROM shared.age_group_set_list WHERE age_group_set_id = 12 AND is_estimate = 1; """ ageDF = db_connect.query(call, db_connection) ageDF['age'] = 22 return ageDF
def acause_from_id(model_version_id, db_connection): """ Given a valid model version id returns the acause associated with it. :param model_version_id: int valid model version id :return: str string representing an acause """ call = ''' SELECT acause FROM shared.cause WHERE cause_id = (SELECT cause_id FROM cod.model_version WHERE model_version_id = {}) '''.format(model_version_id) acause = db_connect.query(call, db_connection)["acause"][0] return acause
def submodel_rank(self): """ Pandas data frame Returns a dataframe of the submodels for the model version id, ranked by weight. """ call = '''SELECT submodel_version_id, submodel_type_id, submodel_dep_id, weight, rank FROM cod.submodel_version WHERE model_version_id = {} ''' call = call.format(self.model_version_id) df = db_connect.query(call, self.db_connection) return df.sort("rank")
def submodel_covs(submodel_version_id, db_connection): """ :param submodel_version_id: integer representing a codem submodel version id :return: Pandas data frame with information on submodel covariates Given a submodel version id returns the covariates that were used in the construction of that model. """ call = ''' SELECT covariate_name_short FROM shared.covariate WHERE covariate_id IN ( SELECT covariate_id FROM covariate.data_version WHERE data_version_id IN ( SELECT data_version_id FROM covariate.model_version WHERE model_version_id IN ( SELECT covariate_model_version_id FROM cod.submodel_version_covariate WHERE submodel_version_id={submodel_version_id}))) '''.format(submodel_version_id=submodel_version_id) df = db_connect.query(call, db_connection) df["submodel_version_id"] = submodel_version_id return df
def get_codem_run_time(model_version_id, db_connection): call = QS.codem_run_time.format(model_version_id=model_version_id) df = db_connect.query(call, db_connection) return float(df["time"][0])
def write_submodel_covariate(submodel_id, list_of_covariate_ids, db_connection): for cov in list_of_covariate_ids: call = QS.submodel_cov_write_str.format(submodel_id, cov) db_connect.query(call, db_connection)
def write_model_pv(tag, value, model_version_id, db_connection): call = QS.pv_write.format(tag, value, model_version_id) db_connect.query(call, db_connection)
def model_version_type_id(model_version_id, db_connection): call = QS.model_version_type_id.format(mvid=model_version_id) m_type_id = db_connect.query(call, db_connection)["model_version_type_id"][0] return m_type_id
def model_date(model_version_id, db_connection): call = QS.model_date.format(mvid=model_version_id) date = db_connect.query(call, db_connection)["model_date"][0] return date
def cause_name_from_id(cause_id, db_connection): call = QS.cause_name.format(cause_id=cause_id) cause_name = db_connect.query(call, db_connection)["cause_name"][0] return cause_name