def get_locations(): ''' Description: get list of locations to iterate through for every part of the maternal custom process, down to one level of subnationals Args: None Output: (list) location_ids ''' enginer = dbapis.engine_factory() loc_set_version_query = ''' SELECT location_set_version_id FROM shared.location_set_version WHERE location_set_id = 35 AND end_date IS NULL''' location_set_version = query_tools.query_2_df( loc_set_version_query, engine=enginer.engines["cod_prod"]).ix[ 0, 'location_set_version_id'] query = ('call shared.view_location_hierarchy_history(%s)' % location_set_version) locations_df = query_tools.query_2_df(query, engine=enginer.engines["cod_prod"]) locations = (locations_df[locations_df['most_detailed'] == 1][ 'location_id'].tolist()) return locations
def create_tables(gbd_env): code_version = str(subprocess.check_output(["git", "rev-parse", "HEAD"])) query = ('SELECT MAX(kit_version_id) AS kit_vers ' 'FROM gbd.kit_version ' 'WHERE kit_id = 2') vers_df = query_tools.query_2_df(query, engine=enginer.engines[gbd_env]) kit_id = vers_df.loc[0, 'kit_vers'] kit_id = int(kit_id) + 1 query = ('CALL gbd.new_gbd_process_version(3, 12, "MMR Upload", "%s", ' 'NULL, %s)' % (code_version.rstrip('\n'), kit_id)) gbd_process = query_tools.query_2_df(query, engine=enginer.engines[gbd_env]) return gbd_process
def get_bundle_id_from_crosswalk( crosswalk_version_id: Optional[int], session: orm.Session ) -> Optional[int]: """ Gets a bundle version ID from a crosswalk version ID. Args: crosswalk_version_id: the crosswalk version ID linked to a bundle ID session: an epi database session Returns: Bundle ID linked to the passed crosswalk version ID. Raises: ValueError: if crosswalk version ID does not exist or is linked to a bundle other than the bundle in the config """ if not crosswalk_version_id: return None bundle_id_df = query_tools.query_2_df( queries.GET_BUNDLE_FROM_CROSSWALK, session=session, parameters={'crosswalk_version_id': crosswalk_version_id} ) if bundle_id_df.empty: raise ValueError( 'Could not retrieve a bundle associated with crosswalk version ' f'{crosswalk_version_id}' ) return bundle_id_df.at[0, columns.BUNDLE_ID]
def get_gbd_covariate_ids( gbd_covariates: List[str], session: orm.Session ) -> pd.DataFrame: """ Gets all GBD covariate IDs and covariate short names. Args: gbd_covariates: List of GBD covariate short names for which to pull covariate IDs session: session with the epi database server Returns: Dataframe containing GBD covariate IDs and covariate short names. Raises: ValueError: if a covariate short name was specified that does not exist in the database """ covariate_df = query_tools.query_2_df(queries.GET_COVARIATE_IDS, session) covariates = set(covariate_df[columns.COVARIATE_NAME_SHORT].unique()) missing_covariates = set(gbd_covariates) - set(covariates) if missing_covariates: raise ValueError( f'Invalid {parameters.GBD_COVARIATES}: ' f'{", ".join(sorted(missing_covariates))}' ) return covariate_df
def test_insert_defaults(covariate_db, cov_conn_def): """Insert a new row into the covariate.model_version without a best_start, best_end, best_user, best_description column to check the default value.""" covariate_db.add_conn_def(cov_conn_def) sesh = get_session(cov_conn_def) insert_q = """ INSERT INTO covariate.model_version (model_version_id, covariate_id, description, code_version, status, is_best, gbd_round_id) VALUES ('1', '1', 'testing defaults', 'version 1', '1', '0', '5'); """ exec_query(insert_q, session=sesh) sesh.commit() select_q = """ SELECT * FROM covariate.model_version WHERE model_version_id = 1; """ res = query_2_df(select_q, session=sesh) assert not res.empty assert not res.at[0, 'best_user'] assert not res.at[0, 'best_description'] assert not res.at[0, 'best_start'] assert not res.at[0, 'best_end'] # we need to close the session or the next test will hang sesh.close()
def test_insert_new_row(covariate_db, cov_conn_def): """Tests the addition of a single row to the covariate.model table.""" initialize_and_fill_covariate_db(covariate_db, cov_conn_def) sesh = get_session(cov_conn_def) insert_q = """ INSERT INTO covariate.model (model_version_id, year_id, location_id, sex_id, age_group_id, mean_value, upper_value, lower_value) VALUES ('1', '2017', '1', '2', '22', '10.4', '11.2', '9.6'), ('1', '2016', '1', '2', '22', '10.5', '11.4', '9.4'), ('1', '2015', '1', '2', '22', '10.6', '11.6', '9.2'), ('1', '2014', '1', '2', '22', '10.7', '11.8', '9'), ('1', '2013', '1', '2', '22', '10.8', '12', '8.8'); """ exec_query(insert_q, session=sesh) sesh.commit() select_q = """ SELECT * FROM covariate.model WHERE model_version_id = 1; """ res = query_2_df(select_q, session=sesh) assert not res.empty # We added five entries associated with model_version_id 1 assert len(res) == 5 # We have to close the session or the next test will hang sesh.close()
def get_locations(self, location_set_id): query = ('SELECT location_id, most_detailed FROM shared.' 'location_hierarchy_history WHERE location_set_version_id=(' 'SELECT location_set_version_id FROM shared.location_set_' 'version WHERE location_set_id = %s AND end_date IS NULL) ' 'AND most_detailed = 1' % location_set_id) loc_df = query_tools.query_2_df( query, engine=self.enginer.engines["cod_prod"]) return loc_df
def _verify_no_data_present(table, schema, model_version_id, session): qry = f""" SELECT count(*) as count from {schema}.{table} WHERE model_version_id = :model_version_id""" count = query_tools.query_2_df( qry, session=session, parameters={'model_version_id': model_version_id})['count'].iat[0] return count == 0
def get_model_version_ids(sesh): """Return a list of model_version_ids. Arguments: sesh (sqlalchemy.orm.session.Session): connection to our database server. """ q = """ SELECT model_version_id FROM covariate.model_version_new ORDER BY model_version_id """ res = query_2_df(q, session=sesh) return res.model_version_id.tolist()
def data_rich_data_poor(self, df): '''Splits a given dataframe into two dataframes, based on data rich or data poor, and returns the two dfs''' query = ('SELECT location_id, parent_id FROM shared.' 'location_hierarchy_history WHERE location_set_version_id=(' 'SELECT location_set_version_id FROM shared.location_set_' 'version WHERE location_set_id = 43 AND end_date IS NULL)') loc_df = query_tools.query_2_df( query, engine=self.enginer.engines["cod_prod"]) all = df.merge(loc_df, on='location_id', how='inner') data_rich = all.query("parent_id==44640") data_rich.drop('parent_id', axis=1, inplace=True) data_poor = all.query("parent_id==44641") data_poor.drop('parent_id', axis=1, inplace=True) return data_rich, data_poor
def get_asfr(self): query = ('SELECT ' 'model.location_id, model.year_id, model.age_group_id, ' 'model.sex_id, model.mean_value AS asfr FROM covariate.model ' 'JOIN covariate.model_version ON model.model_version_id=' 'model_version.model_version_id JOIN covariate.data_version ' 'ON model_version.data_version_id=data_version.' 'data_version_id JOIN shared.covariate ON data_version.' 'covariate_id=covariate.covariate_id ' 'AND covariate.last_updated_action!="DELETE" AND is_best=1 ' 'AND covariate.covariate_id= 13 AND model.age_group_id ' 'BETWEEN 7 AND 15 AND model.year_id = %s' % self.year_id) asfr = query_tools.query_2_df(query, engine=self.enginer.engines["cov_prod"]) asfr['sex_id'] = 2 loc_df = self.get_locations(35) asfr = asfr.merge(loc_df, on='location_id', how='inner') asfr.drop('most_detailed', axis=1, inplace=True) return asfr
def execute_select(query, conn_def='epi', params=None): '''Run a sql query, return a dataframe. Uses ENVIRONMENT_NAME environment variable to determine prod/dev database server. Args: query(str): sql statement to execute conn_def(str, 'epi'): one of 'epi' or 'cod' Returns: Pandas Dataframe ''' engine = get_engine(conn_def=conn_def, env=settings['env_variables']['ENVIRONMENT_NAME']) engine.dispose() Session = sessionmaker(bind=engine) sesh = Session() try: df = qt.query_2_df(query, session=sesh, parameters=params) finally: sesh.close() return df
def test_insert_new_row(covariate_db, cov_conn_def): """Inserts a new row into the model_version table and queries it to make sure that the model version ids match.""" covariate_db.add_conn_def(cov_conn_def) sesh = get_session(cov_conn_def) dummy_mvid = 1 insert_q = """ INSERT INTO covariate.model_version (model_version_id, covariate_id, description, code_version, status, is_best, gbd_round_id) VALUES ('{mvid}', '1', 'best dummy round 5', 'vers 1', '1', '0', '5'); """.format(mvid=dummy_mvid) exec_query(insert_q, session=sesh) sesh.commit() select_q = """ SELECT * FROM covariate.model_version WHERE model_version_id = {mvid}; """.format(mvid=dummy_mvid) res = query_2_df(select_q, session=sesh) assert not res.empty assert res.at[0, 'model_version_id'] == dummy_mvid # we need to close the session or the next test will hang sesh.close()
only_subnats = adjust_df[adjust_df.subnationals == 1] only_subnats.rename(columns={'location_id': 'parent_id'}, inplace=True) query = '''SELECT location_id, parent_id FROM shared.location_hierarchy_history WHERE location_set_id = 35 AND location_set_version_id = (SELECT location_set_version_id FROM shared.location_set_version WHERE location_set_id = 35 AND end_date IS NULL)''' lhh = query_tools.query_2_df(query, engine=enginer.engines['cod_prod']) only_subnats = only_subnats.merge(lhh, on='parent_id', how='inner') only_subnats.drop('parent_id', axis=1, inplace=True) adjust_df = adjust_df[adjust_df.subnationals == 0] adjust_df = pd.concat([adjust_df, only_subnats]) # get original codem envelope env_fname = '%s/deaths_female.h5' % envelope_dir rlog.log('envelope is %s' % env_fname) env = pd.read_hdf('%s' % env_fname, 'data') # we only want maternal age groups and most-detailed locations env = env[(env.age_group_id.isin(range(7, 16))) & (env.location_id.isin(locs))] # get prop from Late dismod model query = ('''SELECT
from db_tools import dbapis, query_tools import subprocess import time # create enginer enginer = dbapis.engine_factory() query = (''' SELECT * FROM cod.model_version WHERE cause_id = 366 AND model_version_type_id = 3 AND is_best = 1''') while True: codem_df = query_tools.query_2_df(query, engine=enginer.engines["cod_prod"]) if len(codem_df) == 1: break else: print "CODEm model still running" time.sleep(600) call = ('qsub -cwd -P proj_custom_models -o FILEPATH ' '-e FILEPATH -pe multi_slot 10 ' '-N master_maternal cluster_shell.sh 00_master_maternal.py') subprocess.call(call, shell=True)
'FROM ' 'gbd.gbd_process_version_metadata gpvm ' 'JOIN ' 'gbd.gbd_process_version USING (gbd_process_version_id) ' 'JOIN ' 'gbd.compare_version_output USING (compare_version_id) ' 'WHERE ' 'compare_version_id = (SELECT ' 'compare_version_id ' 'FROM ' 'gbd.compare_version ' 'WHERE ' 'compare_version_status_id = 1 ' 'AND gbd_round_id = 3) ' 'AND gpvm.metadata_type_id = 5') model_vers = query_tools.query_2_df( query, engine=enginer.engines["gbd_prod"]).loc[0, 'daly_id'] # load dalynator draws for the appropriate cause dalynator_dir = '/ihme/centralcomp/dalynator/%s/draws/hdfs/' % model_vers files = [] for root, dirnames, filenames in os.walk('%s' % dalynator_dir): for filename in fnmatch.filter(filenames, '*%s.h5' % year_id): files.append(os.path.join(root, filename)) def read_file(f): return pd.read_hdf(f, 'data', where=[("'cause_id'==%d & 'measure_id'==1" "& 'metric_id'==1 & 'sex_id'==2" "& 'rei_id'==0") % cause_id]) draw_list = []
try: from db_tools import dbapis, query_tools import db_process_upload import maternal_fns except: sys.path.append(str(os.getcwd()).rstrip('/mmr')) from db_tools import dbapis, query_tools import db_process_upload import maternal_fns enginer = dbapis.engine_factory() query = ('SELECT cause_id FROM shared.cause_hierarchy_history ' 'WHERE cause_set_id = 8 AND cause_set_version_id = ' '(SELECT cause_set_version_id FROM shared.cause_set_version ' 'WHERE cause_set_id = 8 and end_date IS NULL)') causes = (query_tools.query_2_df( query, engine=enginer.engines["cod_prod"])['cause_id'].tolist()) # set out directory date_regex = re.compile('\W') date_unformatted = str(datetime.now())[0:13] date_str = date_regex.sub('_', date_unformatted) out_dir = '/ihme/centralcomp/maternal_mortality/mmr/%s' % date_str arc_out_dir = '%s/multi_year' % out_dir mmr_out_dir = '%s/single_year' % out_dir if not os.path.exists('%s' % out_dir): os.makedirs('%s' % out_dir) if not os.path.exists('%s' % arc_out_dir): os.makedirs('%s' % arc_out_dir) if not os.path.exists('%s' % mmr_out_dir): os.makedirs('%s' % mmr_out_dir)
c = read_json(credential_path) return c[key]['user'], c[key]['password'] user, password = get_credentials( "save_results_cod", credential_path=('/ihme/code/central_comp/secrets/' 'db_credentials.json')) enginer.define_engine(engine_name='cod_prod', server_name='cod', default_schema='cod', envr='prod', user=user, password=password, replace=True) try: unmark = query_tools.query_2_df(unmark_sql_statement, engine=enginer.engines['cod_prod']) except: print "caught the 'no rows returned' bug error of sqlalchemy" # ------------------------------------------------------------------------- ############################################################## # 03: GET FINAL DEATH COUNTS # For each subcause, multiply the cause # fractions from step 1 with the envelope from the late correction of codem, # giving estimates of death counts for each subtype. ############################################################## # wait for cause fractions to be saved to the database if it was run this round maternal_fns.wait('adjustment_save_results', 300)
def get_best_date(enginer, step, dep_type, dep_id=""): ''' Description: Queries the database for the best_start date of the most recent model version for the given cause_id/modelable_entity_id/process. Also pulls the most recent date of timestamped files. See dependency_map.csv for context. Args: enginer: connection to the db step: step of the process we're on dep_type: "cause_id" or "modelable_entity_id" dep_id: the modelable_entity_id, cod_correct id, or codem_id for which you want to get the best_start date. NOTE: dep_id REQUIRED for dismod process Output: (datetime) best start date ''' if dep_type == 'cause_id': query = ''' SELECT best_start from cod.model_version where best_end IS NULL AND best_start IS NOT NULL AND best_start > "2015-01-01 00:00:01" AND cause_id = %s''' % dep_id most_recent_best_df = query_tools.query_2_df( query, engine=enginer.engines["cod_prod"]) if len(most_recent_best_df) == 0: most_recent_best = datetime(1800, 01, 01, 00, 00, 00) else: most_recent_best = (most_recent_best_df.ix[0, 'best_start']. to_datetime()) elif dep_type == 'filename': fdict = {} for filename in os.listdir(dep_id): if 'pafs_draws' in filename: if 'hdf' not in filename and not filename.startswith('.'): d = filename.split('_')[2:5] time = ' 00:00:00' d[2] = d[2].rstrip(".csv") ord_date = date(year=int(d[0]), month=int(d[1]), day=int(d[2])).toordinal() fdict[ord_date] = d ord_dates = fdict.keys() ord_dates.sort() most_recent_str = str( fdict[ord_dates[-1]] ).replace("['", "").replace("']", "").replace("', '", "-") + time most_recent_best = datetime.strptime( most_recent_str, "%Y-%m-%d %H:%M:%S") elif dep_type == 'modelable_entity_id': query = ''' SELECT best_start from epi.model_version where is_best = 1 AND best_end IS NULL AND modelable_entity_id = %s''' % dep_id most_recent_best_df = query_tools.query_2_df( query, engine=enginer.engines["epi_prod"]) if len(most_recent_best_df) == 0: most_recent_best = datetime(1800, 01, 01, 00, 00, 00) else: most_recent_best = (most_recent_best_df.ix[0, 'best_start']. to_datetime()) elif dep_type == 'process': query = ''' SELECT best_start FROM cod.output_version WHERE is_best = 1 AND env_version = (SELECT MAX(env_version) FROM cod.output_version)''' most_recent_best_df = query_tools.query_2_df( query, engine=enginer.engines["cod_prod"]) if len(most_recent_best_df) == 0: most_recent_best = datetime(1800, 01, 01, 00, 00, 00) elif most_recent_best_df.ix[0, 'best_start'] is None: most_recent_best = datetime(1800, 01, 01, 00, 00, 00) else: most_recent_best = (most_recent_best_df.ix[0, 'best_start']. to_datetime()) else: raise ValueError( '''Dep_type must be "cause_id", "modelable_entity_id", or "process"''' ) return most_recent_best
def get_model_vers(process, model_id=None, step=None): ''' Description: Queries the database for the best model_version for the given model_id. Can do this for Dismod, Codem, or Codcorrect outputs. Args: 1. (str) process ('dismod', 'codem', or 'codcorrect') 2. id (model_id for dismod, cause_id for codem, or none for codcorrect) Output: (int) best model_version ''' enginer = dbapis.engine_factory() enginer.servers["gbd"] = {"prod": "modeling-gbd-db.ihme.washington.edu"} enginer.define_engine(strConnection) if model_id is not None: model_id = int(model_id) if process == 'dismod': if model_id is None: raise ValueError('Must specify a me_id') else: query = '''SELECT model_version_id from epi.model_version WHERE is_best = 1 AND best_end IS NULL AND modelable_entity_id = %d''' % model_id model_vers_df = query_tools.query_2_df( query, engine=enginer.engines['epi_prod']) if len(model_vers_df) > 0: model_vers = model_vers_df.ix[0, 'model_version_id'] else: model_vers = None elif process == 'codem': if model_id is None: raise ValueError('Must specify a cause_id') if step == 2: query = '''SELECT MAX(model_version_id) as model_version_id from cod.model_version where best_start IS NOT NULL AND best_start > '2015-01-01 00:00:01' AND cause_id = %d and model_version_type_id = 3''' % model_id else: query = '''SELECT model_version_id from cod.model_version where best_end IS NULL AND best_start IS NOT NULL AND best_start > '2015-01-01 00:00:01' AND cause_id = %d''' % model_id model_vers = query_tools.query_2_df(query, engine=enginer.engines[ "cod_prod"]).ix[0, 'model_version_id'] else: query = ('SELECT ' 'distinct(val) AS daly_id ' 'FROM ' 'gbd.gbd_process_version_metadata gpvm ' 'JOIN ' 'gbd.gbd_process_version USING (gbd_process_version_id) ' 'JOIN ' 'gbd.compare_version_output USING (compare_version_id) ' 'WHERE ' 'compare_version_id = (SELECT ' 'compare_version_id ' 'FROM ' 'gbd.compare_version ' 'WHERE ' 'compare_version_status_id = 1 ' 'AND gbd_round_id = 3) ' 'AND gpvm.metadata_type_id = 5') model_vers = query_tools.query_2_df( query, engine=enginer.engines["gbd_prod"]).loc[0, 'daly_id'] return model_vers
def __init__(self, cluster_dir, year_id, input_me, output_me, infertile_me): Base.__init__(self, cluster_dir, year_id, input_me, output_me) self.infertile_me = infertile_me k_cols, i_cols, d_cols = self.keep_cols() # get incidence draws draws = self.get_draws() # create new incidence asfr = self.get_asfr() new_inc = self.get_new_incidence(draws, asfr) self.output(new_inc, output_me, 6) # create new prevalence duration = self.create_draws(0.01918, 0.0137, 0.0274) new_prev = self.mul_draws(new_inc, duration) self.output(new_prev, output_me, 5) # create incidence of infertility & output as input data for that model infertility_sev = self.create_draws(0.09, 0.077, 0.104) infert_inc = self.mul_draws(new_inc, infertility_sev) # get mean/upper/lower infert_inc.set_index(i_cols, inplace=True) infert_inc = infert_inc.transpose().describe( percentiles=[.025, .975]).transpose()[['mean', '2.5%', '97.5%']] infert_inc.rename(columns={ '2.5%': 'lower', '97.5%': 'upper' }, inplace=True) infert_inc.index.rename(i_cols, inplace=True) infert_inc.reset_index(inplace=True) # get year_start, year_end infert_inc['year_start'] = infert_inc['year_id'] infert_inc['year_end'] = infert_inc['year_id'] # get age_start and age_end query = "SELECT age_group_id, age_group_name FROM shared.age_group" age_df = query_tools.query_2_df( query, engine=self.enginer.engines['cod_prod']) age_df = age_df.query("age_group_id < 22") age_df = age_df[age_df.age_group_name.str.contains('to')] age_df['age_start'], age_df['age_end'] = zip( *age_df['age_group_name'].apply(lambda x: x.split(' to ', 1))) age_df['age_start'] = age_df['age_start'].astype(int) age_df['age_end'] = age_df['age_end'].astype(int) infert_inc = infert_inc.merge(age_df, on='age_group_id', how='left') # get location_name query = ('SELECT location_id, location_ascii_name AS location_name ' 'FROM shared.location_hierarchy_history LEFT JOIN shared.' 'location USING(location_id) WHERE ' 'location_set_version_id=(SELECT location_set_version_id ' 'FROM shared.location_set_version WHERE location_set_id = 9 ' 'and end_date IS NULL) AND most_detailed=1') loc_df = query_tools.query_2_df( query, engine=self.enginer.engines['cod_prod']) infert_inc = infert_inc.merge(loc_df, on='location_id', how='inner') # get sex infert_inc['sex'] = infert_inc['sex_id'].map({2: 'Female'}) infert_inc.drop( ['year_id', 'sex_id', 'age_group_id', 'age_group_name'], axis=1, inplace=True) # add other necessary cols for the epi uploader infert_inc['modelable_entity_id'] = infertile_me query = ('SELECT modelable_entity_name FROM epi.modelable_entity ' 'WHERE modelable_entity_id = %s' % infertile_me) infert_inc['modelable_entity_name'] = (query_tools.query_2_df( query, engine=self.enginer.engines['epi_prod']).ix[ 0, 'modelable_entity_name']) infert_inc['nid'] = 254237 empty_cols = [ 'row_num', 'parent_id', 'input_type', 'underlying_nid', 'underlying_field_citation_value', 'field_citation_value', 'file_path', 'page_num', 'table_num', 'ihme_loc_id', 'smaller_site_unit', 'site_memo', 'age_demographer', 'standard_error', 'effective_sample_size', 'cases', 'sample_size', 'design_effect', 'measure_adjustment', 'recall_type_value', 'sampling_type', 'response_rate', 'case_name', 'case_definition', 'case_diagnostics', 'group', 'specificity', 'group_review', 'note_modeler', 'note_SR', 'extractor', 'data_sheet_filepath' ] for col in empty_cols: infert_inc['%s' % col] = np.nan infert_inc['sex_issue'] = 0 infert_inc['year_issue'] = 0 infert_inc['age_issue'] = 0 infert_inc['measure'] = "incidence" infert_inc['measure_issue'] = 0 infert_inc['representative_name'] = "Unknown" infert_inc['urbanicity_type'] = "Unknown" infert_inc['unit_type'] = "Person" infert_inc['unit_value_as_published'] = 1 infert_inc['is_outlier'] = 0 infert_inc['recall_type'] = "Point" infert_inc['uncertainty_type'] = "Confidence interval" infert_inc['uncertainty_type_value'] = 95 infert_inc['source_type'] = "Mixed or estimation" self.output_for_epiuploader(infert_inc, infertile_me)