def branch_or_leaf(dag: LocationDAG, location_id: int, sex: int, model_version_id: int,
                   parent_location: int, parent_sex: int, n_sim: int, n_pool: int,
                   upstream: List[str], tasks: List[_CascadeOperation]):
    """
    Recursive function that either creates a branch (by calling itself) or a leaf fit,
    depending on whether it is at a terminal node. Determines whether it is at a
    terminal node using ``dag.is_leaf()``. Appends tasks onto the ``tasks`` parameter.
    """
    if not dag.is_leaf(location_id=location_id):
        branch = branch_fit(
            model_version_id=model_version_id,
            location_id=location_id, sex_id=sex,
            prior_parent=parent_location, prior_sex=parent_sex,
            child_locations=dag.children(location_id), child_sexes=[sex],
            n_sim=n_sim, n_pool=n_pool,
            upstream_commands=upstream,
            ode_fit_strategy=True
        )
        tasks += branch
        for location in dag.children(location_id):
            branch_or_leaf(dag=dag, location_id=location, sex=sex,
                           model_version_id=model_version_id,
                           parent_location=location_id, parent_sex=sex,
                           n_sim=n_sim, n_pool=n_pool,
                           upstream=[branch[-1].command], tasks=tasks)
    else:
        leaf = leaf_fit(
            model_version_id=model_version_id,
            location_id=location_id, sex_id=sex,
            prior_parent=parent_location, prior_sex=parent_sex,
            n_sim=n_sim, n_pool=n_pool,
            upstream_commands=upstream,
            ode_fit_strategy=True
        )
        tasks += leaf
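For orientation, here is a minimal, self-contained sketch of the same branch-or-leaf recursion pattern. It uses a plain networkx DiGraph in place of LocationDAG, and string placeholders in place of the branch_fit/leaf_fit operation lists, so the names below are illustrative only.

# Minimal sketch of the branch-or-leaf traversal, assuming a plain networkx
# DiGraph stands in for LocationDAG; task objects are replaced by strings.
import networkx as nx

def walk(graph: nx.DiGraph, location_id: int, tasks: list) -> None:
    children = list(graph.successors(location_id))
    if children:                       # interior node -> "branch fit", then recurse
        tasks.append(f"branch_fit({location_id})")
        for child in children:
            walk(graph, child, tasks)
    else:                              # terminal node -> "leaf fit"
        tasks.append(f"leaf_fit({location_id})")

g = nx.DiGraph([(1, 2), (1, 3), (2, 4), (2, 5)])
tasks = []
for child in g.successors(1):
    walk(g, child, tasks)
print(tasks)  # ['branch_fit(2)', 'leaf_fit(4)', 'leaf_fit(5)', 'leaf_fit(3)']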
def test_location_drill_start_end(ihme):
    these_settings = deepcopy(BASE_CASE)
    model_settings = these_settings["model"]
    tree = LocationDAG(these_settings['location_set_version_id'],
                       these_settings['gbd_round_id'])
    region_ids = tree.parent_children(1)
    parent_test_loc = choice(region_ids)
    test_children = list(tree.parent_children(parent_test_loc))
    num_test_children = randint(2, len(test_children))
    children_test_locs = sample(test_children, num_test_children)
    num_descendants = 0
    for child in children_test_locs:
        num_descendants += len(tree.descendants(child))

    model_settings['drill_location_end'] = children_test_locs
    model_settings['drill_location_start'] = parent_test_loc
    these_settings['model'] = model_settings

    s = load_settings(these_settings)
    mi = MeasurementInputsFromSettings(settings=s)

    # demographics.location_id should be set to all descendants of each
    # location in drill_location_end, plus the drill_location_end locations
    # themselves, plus the drill_location_start location
    assert len(mi.demographics.location_id) == (
        num_descendants + len(children_test_locs) + 1)
    assert len(mi.demographics.drill_locations) == (len(children_test_locs) + 1)
def construct_node_table(location_dag: LocationDAG) -> pd.DataFrame:
    """
    Constructs the node table from a location DAG's to_dataframe() method.

    Parameters
    ----------
    location_dag
        location hierarchy object
    """
    LOG.info("Constructing node table.")
    node = location_dag.to_dataframe()
    node = node.reset_index(drop=True)
    node["node_id"] = node.index
    p_node = node[["node_id", "location_id"]].rename(columns={
        "location_id": "parent_id",
        "node_id": "parent"
    })
    node = node.merge(p_node, on="parent_id", how="left")
    node.rename(columns={
        "name": "node_name",
        "location_id": "c_location_id"
    }, inplace=True)
    node = node[['node_id', 'node_name', 'parent', 'c_location_id']]
    return node
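A small worked example of the self-merge above, assuming to_dataframe() yields location_id, parent_id, and name columns (as the renames in the function imply); the data here are hypothetical.

# Hypothetical illustration of the self-merge that maps each parent_id to a node index.
import pandas as pd

node = pd.DataFrame({
    "location_id": [1, 2, 3],
    "parent_id":   [0, 1, 1],          # 0: no parent in the table (root)
    "name":        ["Global", "Region A", "Region B"],
})
node["node_id"] = node.index
p_node = node[["node_id", "location_id"]].rename(
    columns={"location_id": "parent_id", "node_id": "parent"})
node = node.merge(p_node, on="parent_id", how="left")
print(node[["node_id", "name", "parent", "location_id"]])
# parent is NaN for the root and 0 (the root's node_id) for the two regions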
def __init__(self, gbd_round_id: int, location_set_version_id: Optional[int] = None):
    """
    Grabs and stores demographic information needed for shared functions.
    Will also make a location hierarchy dag.

    Parameters
    ----------
    gbd_round_id
        The GBD round
    location_set_version_id
        The location set version to use (right now EpiViz-AT is passing dismod
        location set versions, but this will eventually switch to the cause of
        death hierarchy that is more extensive).
    """
    demographics = db_queries.get_demographics(gbd_team='epi', gbd_round_id=gbd_round_id)
    self.age_group_id = demographics['age_group_id']
    self.sex_id = demographics['sex_id'] + [3]

    cod_demographics = db_queries.get_demographics(
        gbd_team='cod', gbd_round_id=gbd_round_id)
    self.year_id = cod_demographics['year_id']

    if location_set_version_id:
        location_dag = LocationDAG(
            location_set_version_id=location_set_version_id,
            gbd_round_id=gbd_round_id)
        self.location_id = list(location_dag.dag.nodes)
        self.drill_locations = list(location_dag.dag.nodes)
    else:
        self.location_id = []
        self.drill_locations = []
def test_no_drill(ihme):
    these_settings = deepcopy(BASE_CASE)
    model_settings = these_settings["model"]
    tree = LocationDAG(these_settings['location_set_version_id'],
                       these_settings['gbd_round_id'])
    num_descendants = len(tree.descendants(1))

    model_settings.pop('drill_location_end')
    model_settings.pop('drill_location_start')
    these_settings['model'] = model_settings

    s = load_settings(these_settings)
    mi = MeasurementInputsFromSettings(settings=s)

    # since we haven't set either drill_location_start or
    # drill_location_end, demographics.location_id should be set
    # to the entire hierarchy
    assert len(mi.demographics.location_id) == num_descendants + 1
    assert len(mi.demographics.drill_locations) == num_descendants + 1
def test_location_drill_start_only(ihme):
    these_settings = deepcopy(BASE_CASE)
    model_settings = these_settings["model"]
    tree = LocationDAG(these_settings['location_set_version_id'],
                       these_settings['gbd_round_id'])
    region_ids = tree.parent_children(1)
    test_loc = choice(region_ids)
    num_descendants = len(tree.descendants(test_loc))
    num_mr_locs = len(tree.parent_children(test_loc))

    model_settings.pop("drill_location_end")
    model_settings['drill_location_start'] = test_loc
    these_settings["model"] = model_settings

    s = load_settings(these_settings)
    mi = MeasurementInputsFromSettings(settings=s)

    # with drill_location_end unset, demographics.location_id should
    # be set to all descendants of the test loc, plus the test loc itself
    assert len(mi.demographics.location_id) == num_descendants + 1
    assert len(mi.demographics.drill_locations) == num_mr_locs
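The two counts asserted in these drill tests come from two different location sets: the locations to pull data for (the drill start plus all of its descendants) and the locations actually fit in the drill (the drill start plus its direct children). A standalone sketch of the distinction, using plain networkx in place of LocationDAG with a toy hierarchy:

# Toy hierarchy illustrating the two location sets the drill tests compare.
import networkx as nx

g = nx.DiGraph([(1, 2), (1, 5), (2, 3), (2, 6), (3, 4)])
start = 2
descendants = nx.descendants(g, start)                  # {3, 4, 6}
location_id = [start] + sorted(descendants)              # data locations: start + all descendants
drill_locations = [start] + list(g.successors(start))    # fit locations: start + direct children
print(location_id)       # [2, 3, 4, 6] -> len == num_descendants + 1
print(drill_locations)   # [2, 3, 6]    -> len == parent_children count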
def __init__(self, gbd_round_id: int, location_set_version_id: Optional[int] = None):
    """
    Demographic groups needed for shared functions.
    """
    demographics = db_queries.get_demographics(gbd_team='epi', gbd_round_id=gbd_round_id)
    self.age_group_id = demographics['age_group_id']
    self.sex_id = demographics['sex_id'] + [3]

    cod_demographics = db_queries.get_demographics(
        gbd_team='cod', gbd_round_id=gbd_round_id)
    self.year_id = cod_demographics['year_id']

    if location_set_version_id:
        location_dag = LocationDAG(
            location_set_version_id=location_set_version_id,
            gbd_round_id=gbd_round_id)
        self.location_id = list(location_dag.dag.nodes)
        self.mortality_rate_location_id = list(location_dag.dag.nodes)
    else:
        self.location_id = []
        self.mortality_rate_location_id = []
def make_cascade_dag(model_version_id: int, dag: LocationDAG,
                     location_start: int, sex_start: int, split_sex: bool,
                     n_sim: int = 100, n_pool: int = 100,
                     skip_configure: bool = False) -> List[_CascadeOperation]:
    """
    Make a traditional cascade dag for a model version. Relies on a location DAG
    and a starting point in the DAG for locations and sexes.

    Parameters
    ----------
    model_version_id
        Model version ID
    dag
        A location DAG that specifies the location hierarchy
    location_start
        Where to start in the location hierarchy
    sex_start
        Which sex to start with, can be most detailed or both.
    split_sex
        Whether or not to split sex into most detailed. If not, then will
        just stay at 'both' sex.
    n_sim
        Number of simulations to do in sample simulate
    n_pool
        Number of multiprocessing pools to create during sample simulate
    skip_configure
        Don't configure inputs. Only do this if it's already been done.

    Returns
    -------
    List of _CascadeOperation.
    """
    tasks = []

    sexes = [sex_start]
    if SEX_ID_TO_NAME[sex_start] == 'Both':
        if split_sex:
            sexes = [
                SEX_NAME_TO_ID['Female'],
                SEX_NAME_TO_ID['Male']
            ]

    top_level = root_fit(
        model_version_id=model_version_id,
        location_id=location_start, sex_id=sex_start,
        child_locations=dag.children(location_start), child_sexes=sexes,
        mulcov_stats=True,
        skip_configure=skip_configure,
        n_sim=n_sim, n_pool=n_pool,
        ode_fit_strategy=True,
    )
    tasks += top_level
    for sex in sexes:
        for location1 in dag.children(location_start):
            branch_or_leaf(
                dag=dag, location_id=location1, sex=sex,
                model_version_id=model_version_id,
                parent_location=location_start, parent_sex=sex_start,
                n_sim=n_sim, n_pool=n_pool,
                upstream=[top_level[-1].command], tasks=tasks
            )
    tasks.append(Upload(
        model_version_id=model_version_id,
        fit=True, prior=True,
        upstream_commands=[tasks[-1].command],
        executor_parameters={
            'm_mem_free': '50G'
        }
    ))
    return tasks
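The only branching logic at the top of the cascade is the sex split. Below is a standalone sketch of that decision with the ID/name maps written out explicitly; the ID values (1=Male, 2=Female, 3=Both) are assumed here for illustration.

# Standalone sketch of the sex-splitting decision at the root of the cascade.
SEX_ID_TO_NAME = {1: 'Male', 2: 'Female', 3: 'Both'}   # assumed GBD-style IDs
SEX_NAME_TO_ID = {v: k for k, v in SEX_ID_TO_NAME.items()}

def sexes_for_root(sex_start: int, split_sex: bool) -> list:
    sexes = [sex_start]
    if SEX_ID_TO_NAME[sex_start] == 'Both' and split_sex:
        sexes = [SEX_NAME_TO_ID['Female'], SEX_NAME_TO_ID['Male']]
    return sexes

print(sexes_for_root(3, split_sex=True))   # [2, 1] -> fit females and males below the root
print(sexes_for_root(3, split_sex=False))  # [3]    -> stay at 'Both' all the way down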
def test_dag_error_noargs():
    with pytest.raises(LocationDAGError):
        LocationDAG()
def __init__(self, model_version_id: int, gbd_round_id: int, decomp_step_id: int, conn_def: str, country_covariate_id: List[int], csmr_cause_id: int, crosswalk_version_id: int, csmr_process_version_id: Optional[int] = None, location_set_version_id: Optional[int] = None, drill_location_start: Optional[int] = None, drill_location_end: Optional[List[int]] = None): """ The class that constructs all of the measurement inputs. Pulls ASDR, CSMR, crosswalk versions, and country covariates, and puts them into one data frame that then formats itself for the dismod database. Performs covariate value interpolation if age and year ranges don't match up with GBD age and year ranges. Parameters ---------- model_version_id the model version ID gbd_round_id the GBD round ID decomp_step_id the decomp step ID csmr_process_version_id process version ID for CSMR csmr_cause_id: (int) cause to pull CSMR from crosswalk_version_id crosswalk version to use country_covariate_id list of covariate IDs conn_def connection definition from .odbc file (e.g. 'epi') to connect to the IHME databases location_set_version_id can be None, if it's none, get the best location_set_version_id for estimation hierarchy of this GBD round drill_location_start which location ID to drill from as the parent drill_location_end which immediate children of the drill_location_start parent to include in the drill Attributes ---------- self.decomp_step : str the decomp step in string form self.demographics : cascade_at.inputs.demographics.Demographics a demographics object that specifies the age group, sex, location, and year IDs to grab self.integrand_map : Dict[int, int] dictionary mapping from GBD measure IDs to DisMod IDs self.asdr : cascade_at.inputs.asdr.ASDR all-cause mortality input object self.csmr : cascade_at.inputs.csmr.CSMR cause-specific mortality input object from cause csmr_cause_id self.data : cascade_at.inputs.data.CrosswalkVersion crosswalk version data from IHME database self.covariate_data : List[cascade_at.inputs.covariate_data.CovariateData] list of covariate data objects that contains the raw covariate data mapped to IDs self.location_dag : cascade_at.inputs.locations.LocationDAG DAG of locations to be used self.population: (cascade_at.inputs.population.Population) population object that is used for covariate weighting self.data_eta: (Dict[str, float]): dictionary of eta value to be applied to each measure self.density: (Dict[str, str]): dictionary of density to be applied to each measure self.nu: (Dict[str, float]): dictionary of nu value to be applied to each measure self.dismod_data: (pd.DataFrame) resulting dismod data formatted to be used in the dismod database Examples -------- >>> from cascade_at.settings.base_case import BASE_CASE >>> from cascade_at.settings.settings import load_settings >>> >>> settings = load_settings(BASE_CASE) >>> covariate_id = [i.country_covariate_id for i in settings.country_covariate] >>> >>> i = MeasurementInputs( >>> model_version_id=settings.model.model_version_id, >>> gbd_round_id=settings.gbd_round_id, >>> decomp_step_id=settings.model.decomp_step_id, >>> csmr_process_version_id=None, >>> csmr_cause_id = settings.model.add_csmr_cause, >>> crosswalk_version_id=settings.model.crosswalk_version_id, >>> country_covariate_id=covariate_id, >>> conn_def='epi', >>> location_set_version_id=settings.location_set_version_id >>> ) >>> i.get_raw_inputs() >>> i.configure_inputs_for_dismod(settings) """ LOG.info(f"Initializing input object for model version ID {model_version_id}.") LOG.info(f"GBD 
Round ID {gbd_round_id}.") LOG.info(f"Pulling from connection {conn_def}.") self.model_version_id = model_version_id self.gbd_round_id = gbd_round_id self.decomp_step_id = decomp_step_id self.csmr_process_version_id = csmr_process_version_id self.csmr_cause_id = csmr_cause_id self.crosswalk_version_id = crosswalk_version_id self.country_covariate_id = country_covariate_id self.conn_def = conn_def self.drill_location_start = drill_location_start self.drill_location_end = drill_location_end self.decomp_step = ds.decomp_step_from_decomp_step_id(self.decomp_step_id) if location_set_version_id is None: self.location_set_version_id = get_location_set_version_id(gbd_round_id=self.gbd_round_id) else: self.location_set_version_id = location_set_version_id self.demographics = Demographics( gbd_round_id=self.gbd_round_id, location_set_version_id=self.location_set_version_id) self.location_dag = LocationDAG( location_set_version_id=self.location_set_version_id, gbd_round_id=self.gbd_round_id ) # Need to subset the locations to only those needed for # the drill. drill_locations_all is the set of locations # to pull data for, including all descendents. drill_locations # is the set of locations just parent-children in the drill. drill_locations_all, drill_locations = locations_by_drill( drill_location_start=self.drill_location_start, drill_location_end=self.drill_location_end, dag=self.location_dag ) if drill_locations_all: self.demographics.location_id = drill_locations_all self.demographics.drill_locations = drill_locations self.exclude_outliers = True self.asdr = None self.csmr = None self.population = None self.data = None self.covariates = None self.age_groups = None self.data_eta = None self.density = None self.nu = None self.measures_to_exclude = None self.dismod_data = None self.covariate_data = None self.country_covariate_data = None self.covariate_specs = None self.omega = None
class MeasurementInputs: def __init__(self, model_version_id: int, gbd_round_id: int, decomp_step_id: int, conn_def: str, country_covariate_id: List[int], csmr_cause_id: int, crosswalk_version_id: int, csmr_process_version_id: Optional[int] = None, location_set_version_id: Optional[int] = None, drill_location_start: Optional[int] = None, drill_location_end: Optional[List[int]] = None): """ The class that constructs all of the measurement inputs. Pulls ASDR, CSMR, crosswalk versions, and country covariates, and puts them into one data frame that then formats itself for the dismod database. Performs covariate value interpolation if age and year ranges don't match up with GBD age and year ranges. Parameters ---------- model_version_id the model version ID gbd_round_id the GBD round ID decomp_step_id the decomp step ID csmr_process_version_id process version ID for CSMR csmr_cause_id: (int) cause to pull CSMR from crosswalk_version_id crosswalk version to use country_covariate_id list of covariate IDs conn_def connection definition from .odbc file (e.g. 'epi') to connect to the IHME databases location_set_version_id can be None, if it's none, get the best location_set_version_id for estimation hierarchy of this GBD round drill_location_start which location ID to drill from as the parent drill_location_end which immediate children of the drill_location_start parent to include in the drill Attributes ---------- self.decomp_step : str the decomp step in string form self.demographics : cascade_at.inputs.demographics.Demographics a demographics object that specifies the age group, sex, location, and year IDs to grab self.integrand_map : Dict[int, int] dictionary mapping from GBD measure IDs to DisMod IDs self.asdr : cascade_at.inputs.asdr.ASDR all-cause mortality input object self.csmr : cascade_at.inputs.csmr.CSMR cause-specific mortality input object from cause csmr_cause_id self.data : cascade_at.inputs.data.CrosswalkVersion crosswalk version data from IHME database self.covariate_data : List[cascade_at.inputs.covariate_data.CovariateData] list of covariate data objects that contains the raw covariate data mapped to IDs self.location_dag : cascade_at.inputs.locations.LocationDAG DAG of locations to be used self.population: (cascade_at.inputs.population.Population) population object that is used for covariate weighting self.data_eta: (Dict[str, float]): dictionary of eta value to be applied to each measure self.density: (Dict[str, str]): dictionary of density to be applied to each measure self.nu: (Dict[str, float]): dictionary of nu value to be applied to each measure self.dismod_data: (pd.DataFrame) resulting dismod data formatted to be used in the dismod database Examples -------- >>> from cascade_at.settings.base_case import BASE_CASE >>> from cascade_at.settings.settings import load_settings >>> >>> settings = load_settings(BASE_CASE) >>> covariate_id = [i.country_covariate_id for i in settings.country_covariate] >>> >>> i = MeasurementInputs( >>> model_version_id=settings.model.model_version_id, >>> gbd_round_id=settings.gbd_round_id, >>> decomp_step_id=settings.model.decomp_step_id, >>> csmr_process_version_id=None, >>> csmr_cause_id = settings.model.add_csmr_cause, >>> crosswalk_version_id=settings.model.crosswalk_version_id, >>> country_covariate_id=covariate_id, >>> conn_def='epi', >>> location_set_version_id=settings.location_set_version_id >>> ) >>> i.get_raw_inputs() >>> i.configure_inputs_for_dismod(settings) """ LOG.info(f"Initializing input object for model version ID 
{model_version_id}.") LOG.info(f"GBD Round ID {gbd_round_id}.") LOG.info(f"Pulling from connection {conn_def}.") self.model_version_id = model_version_id self.gbd_round_id = gbd_round_id self.decomp_step_id = decomp_step_id self.csmr_process_version_id = csmr_process_version_id self.csmr_cause_id = csmr_cause_id self.crosswalk_version_id = crosswalk_version_id self.country_covariate_id = country_covariate_id self.conn_def = conn_def self.drill_location_start = drill_location_start self.drill_location_end = drill_location_end self.decomp_step = ds.decomp_step_from_decomp_step_id(self.decomp_step_id) if location_set_version_id is None: self.location_set_version_id = get_location_set_version_id(gbd_round_id=self.gbd_round_id) else: self.location_set_version_id = location_set_version_id self.demographics = Demographics( gbd_round_id=self.gbd_round_id, location_set_version_id=self.location_set_version_id) self.location_dag = LocationDAG( location_set_version_id=self.location_set_version_id, gbd_round_id=self.gbd_round_id ) # Need to subset the locations to only those needed for # the drill. drill_locations_all is the set of locations # to pull data for, including all descendents. drill_locations # is the set of locations just parent-children in the drill. drill_locations_all, drill_locations = locations_by_drill( drill_location_start=self.drill_location_start, drill_location_end=self.drill_location_end, dag=self.location_dag ) if drill_locations_all: self.demographics.location_id = drill_locations_all self.demographics.drill_locations = drill_locations self.exclude_outliers = True self.asdr = None self.csmr = None self.population = None self.data = None self.covariates = None self.age_groups = None self.data_eta = None self.density = None self.nu = None self.measures_to_exclude = None self.dismod_data = None self.covariate_data = None self.country_covariate_data = None self.covariate_specs = None self.omega = None def get_raw_inputs(self): """ Get the raw inputs that need to be used in the modeling. """ LOG.info("Getting all raw inputs.") self.asdr = ASDR( demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id ).get_raw() self.csmr = CSMR( cause_id=self.csmr_cause_id, demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id, process_version_id=self.csmr_process_version_id ).get_raw() self.data = CrosswalkVersion( crosswalk_version_id=self.crosswalk_version_id, exclude_outliers=self.exclude_outliers, demographics=self.demographics, conn_def=self.conn_def, gbd_round_id=self.gbd_round_id ).get_raw() self.covariate_data = [CovariateData( covariate_id=c, demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id ).get_raw() for c in self.country_covariate_id] self.population = Population( demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id ).get_population() def configure_inputs_for_dismod(self, settings: SettingsConfig, midpoint: bool = False, mortality_year_reduction: int = 5): """ Modifies the inputs for DisMod based on model-specific settings. 
Arguments --------- settings Settings for the model mortality_year_reduction number of years to decimate csmr and asdr """ self.data_eta = data_eta_from_settings(settings) self.density = density_from_settings(settings) self.nu = nu_from_settings(settings) self.measures_to_exclude = measures_to_exclude_from_settings(settings) # If we are constraining omega, then we want to hold out the data # from the DisMod fit for ASDR (but never CSMR -- always want to fit # CSMR). data = self.data.configure_for_dismod( measures_to_exclude=self.measures_to_exclude, relabel_incidence=settings.model.relabel_incidence, midpoint=midpoint, ) asdr = self.asdr.configure_for_dismod( hold_out=settings.model.constrain_omega) csmr = self.csmr.configure_for_dismod(hold_out=0) if settings.model.constrain_omega: self.omega = calculate_omega(asdr=asdr, csmr=csmr) else: self.omega = None if not csmr.empty: csmr = decimate_years( data=csmr, num_years=mortality_year_reduction) if not asdr.empty: asdr = decimate_years( data=asdr, num_years=mortality_year_reduction) self.dismod_data = pd.concat([data, asdr, csmr], axis=0, sort=True) self.dismod_data.reset_index(drop=True, inplace=True) self.dismod_data["density"] = self.dismod_data.measure.apply( self.density.__getitem__) self.dismod_data["eta"] = self.dismod_data.measure.apply( self.data_eta.__getitem__) self.dismod_data["nu"] = self.dismod_data.measure.apply( self.nu.__getitem__) # This makes the specs not just for the country covariate but adds on # the sex and one covariates. self.covariate_specs = CovariateSpecs( country_covariates=settings.country_covariate, study_covariates=settings.study_covariate ) self.country_covariate_data = {c.covariate_id: c.configure_for_dismod( pop_df=self.population.configure_for_dismod(), loc_df=self.location_dag.df ) for c in self.covariate_data} self.dismod_data = self.add_covariates_to_data(df=self.dismod_data) self.dismod_data.loc[ self.dismod_data.hold_out.isnull(), 'hold_out'] = 0. self.dismod_data.drop(['age_group_id'], inplace=True, axis=1) return self def prune_mortality_data(self, parent_location_id: int) -> pd.DataFrame: """ Remove mortality data for descendents that are not children of parent_location_id from the configured dismod data before it gets filled into the dismod database. """ df = self.dismod_data.copy() direct_children = self.location_dag.parent_children(parent_location_id) direct_children = df.location_id.isin(direct_children) mortality_measures = df.measure.isin([ IntegrandEnum.mtall.name, IntegrandEnum.mtspecific.name ]) remove_rows = ~direct_children & mortality_measures df = df.loc[~remove_rows].copy() return df def add_covariates_to_data(self, df: pd.DataFrame) -> pd.DataFrame: """ Add on covariates to a data frame that has age_group_id, year_id or time-age upper / lower, and location_id and sex_id. Adds both country-level and study-level covariates. """ cov_dict_for_interpolation = { c.name: self.country_covariate_data[c.covariate_id] for c in self.covariate_specs.covariate_specs if c.study_country == 'country' } df = self.interpolate_country_covariate_values( df=df, cov_dict=cov_dict_for_interpolation) df = self.transform_country_covariates(df=df) df['s_sex'] = df.sex_id.map( SEX_ID_TO_NAME).map(StudyCovConstants.SEX_COV_VALUE_MAP) df['s_one'] = StudyCovConstants.ONE_COV_VALUE return df def to_gbd_avgint(self, parent_location_id: int, sex_id: int) -> pd.DataFrame: """ Converts the demographics of the model to the avgint table. 
""" LOG.info(f"Getting grid for the avgint table " f"for parent location ID {parent_location_id} " f"and sex_id {sex_id}.") if self.drill_location_start is not None: locations = self.demographics.drill_locations else: locations = self.location_dag.parent_children(parent_location_id) grid = expand_grid({ 'sex_id': [sex_id], 'location_id': locations, 'year_id': self.demographics.year_id, 'age_group_id': self.demographics.age_group_id }) grid['time_lower'] = grid['year_id'].astype(int) grid['time_upper'] = grid['year_id'] + 1. grid = BaseInput( gbd_round_id=self.gbd_round_id).convert_to_age_lower_upper(df=grid) LOG.info("Adding covariates to avgint grid.") grid = self.add_covariates_to_data(df=grid) return grid def interpolate_country_covariate_values(self, df: pd.DataFrame, cov_dict: Dict[Union[float, str], pd.DataFrame]): """ Interpolates the covariate values onto the data so that the non-standard ages and years match up to meaningful covariate values. """ LOG.info(f"Interpolating and merging the country covariates.") interp_df = get_interpolated_covariate_values( data_df=df, covariate_dict=cov_dict, population_df=self.population.configure_for_dismod() ) return interp_df def transform_country_covariates(self, df): """ Transforms the covariate data with the transformation ID. :param df: (pd.DataFrame) :return: self """ for c in self.covariate_specs.covariate_specs: if c.study_country == 'country': LOG.info(f"Transforming the data for country covariate " f"{c.covariate_id}.") df[c.name] = df[c.name].apply( lambda x: COVARIATE_TRANSFORMS[c.transformation_id](x) ) return df def calculate_country_covariate_reference_values( self, parent_location_id: int, sex_id: int) -> CovariateSpecs: """ Gets the country covariate reference value for a covariate ID and a parent location ID. Also gets the maximum difference between the reference value and covariate values observed. Run this when you're going to make a DisMod AT database for a specific parent location and sex ID. :param: (int) :param parent_location_id: (int) :param sex_id: (int) :return: List[CovariateSpec] list of the covariate specs with the correct reference values and max diff. """ covariate_specs = copy(self.covariate_specs) age_min = self.dismod_data.age_lower.min() age_max = self.dismod_data.age_upper.max() time_min = self.dismod_data.time_lower.min() time_max = self.dismod_data.time_upper.max() children = self.location_dag.children(parent_location_id) for c in covariate_specs.covariate_specs: if c.study_country == 'study': if c.name == 's_sex': c.reference = StudyCovConstants.SEX_COV_VALUE_MAP[ SEX_ID_TO_NAME[sex_id]] c.max_difference = StudyCovConstants.MAX_DIFFERENCE_SEX_COV elif c.name == 's_one': c.reference = StudyCovConstants.ONE_COV_VALUE c.max_difference = StudyCovConstants.MAX_DIFFERENCE_ONE_COV else: raise ValueError(f"The only two study covariates allowed are sex and one, you tried {c.name}.") elif c.study_country == 'country': LOG.info(f"Calculating the reference and max difference for country covariate {c.covariate_id}.") cov_df = self.country_covariate_data[c.covariate_id] parent_df = ( cov_df.loc[cov_df.location_id == parent_location_id].copy() ) child_df = cov_df.loc[cov_df.location_id.isin(children)].copy() all_loc_df = pd.concat([child_df, parent_df], axis=0) # if there is no data for the parent location at all (which # there should be provided by Central Comp) # then we are going to set the reference value to 0. 
if cov_df.empty: reference_value = 0 max_difference = np.nan else: pop_df = self.population.configure_for_dismod() pop_df = ( pop_df.loc[pop_df.location_id == parent_location_id].copy() ) df_to_interp = pd.DataFrame({ 'location_id': parent_location_id, 'sex_id': [sex_id], 'age_lower': [age_min], 'age_upper': [age_max], 'time_lower': [time_min], 'time_upper': [time_max] }) reference_value = get_interpolated_covariate_values( data_df=df_to_interp, covariate_dict={c.name: parent_df}, population_df=pop_df )[c.name].iloc[0] max_difference = np.max( np.abs(all_loc_df.mean_value - reference_value) ) + CascadeConstants.PRECISION_FOR_REFERENCE_VALUES c.reference = reference_value c.max_difference = max_difference covariate_specs.create_covariate_list() return covariate_specs def reset_index(self, drop, inplace): pass
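The prune_mortality_data method in the class above keeps mortality rows only for the parent location and its direct children. A standalone sketch of that boolean masking on toy data; the measure names correspond to IntegrandEnum.mtall and IntegrandEnum.mtspecific, and the location IDs are illustrative.

# Sketch of the prune_mortality_data mask: drop mortality rows outside parent + children.
import pandas as pd

df = pd.DataFrame({
    "location_id": [2, 3, 4, 4],
    "measure": ["mtall", "mtspecific", "mtall", "prevalence"],
})
direct_children = df.location_id.isin([1, 2, 3])       # parent 1 plus children 2 and 3
mortality = df.measure.isin(["mtall", "mtspecific"])
pruned = df.loc[~(~direct_children & mortality)]
print(pruned)
# keeps rows for locations 2 and 3, drops the grandchild's mtall row, and keeps
# the grandchild's prevalence row (non-mortality data is never pruned)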
def dag(ihme): d = LocationDAG(location_set_version_id=544, gbd_round_id=6) return d
def construct_two_level_model(self, location_dag: LocationDAG, parent_location_id: int,
                              covariate_specs: CovariateSpecs,
                              weights: Optional[Dict[str, Var]] = None,
                              omega_df: Optional[pd.DataFrame] = None,
                              update_prior: Optional[Dict[str, Dict[str, np.ndarray]]] = None,
                              min_cv: Optional[Dict[str, Dict[str, float]]] = None,
                              update_mulcov_prior: Optional[Dict[Tuple[str, str, str], _Prior]] = None):
    """
    Construct a Model object for a parent location and its children.

    Parameters
    ----------
    location_dag
        Location DAG specifying the location hierarchy
    parent_location_id
        Parent location to build the model for
    covariate_specs
        covariate specifications, specifically will use covariate_specs.covariate_multipliers
    weights
    omega_df
        data frame with omega values in it (other cause mortality)
    update_prior
        dictionary of dictionary for prior updates to rates
    update_mulcov_prior
        dictionary of mulcov prior updates
    min_cv
        dictionary (can be defaultdict) for minimum coefficient of variation
        keyed by cascade level, then by rate
    """
    children = location_dag.children(parent_location_id)
    cascade_level = str(location_dag.depth(parent_location_id))  # min_cv lookup expects a string key
    is_leaf = location_dag.is_leaf(parent_location_id)
    if is_leaf:
        cascade_level = MOST_DETAILED_CASCADE_LEVEL

    model = Model(
        nonzero_rates=self.settings.rate,
        parent_location=parent_location_id,
        child_location=children,
        covariates=covariate_specs.covariate_list,
        weights=weights
    )

    # First construct the rate grid, and update with prior
    # information from a parent for value, dage, and dtime.
    for smooth in self.settings.rate:
        rate_grid = self.get_smoothing_grid(rate=smooth)
        if update_prior is not None:
            if smooth.rate in update_prior:
                self.override_priors(rate_grid=rate_grid, update_dict=update_prior[smooth.rate])
                if min_cv is not None:
                    self.apply_min_cv_to_prior_grid(
                        prior_grid=rate_grid.value,
                        min_cv=min_cv[cascade_level][smooth.rate]
                    )
        model.rate[smooth.rate] = rate_grid

    # Second construct the covariate grids
    for mulcov in covariate_specs.covariate_multipliers:
        grid = smooth_grid_from_smoothing_form(
            default_age_time=self.age_time_grid,
            single_age_time=self.single_age_time_grid,
            smooth=mulcov.grid_spec
        )
        if update_mulcov_prior is not None and (mulcov.group, *mulcov.key) in update_mulcov_prior:
            ages = grid.ages
            times = grid.times
            for age, time in itertools.product(ages, times):
                lb = grid.value[age, time].lower
                ub = grid.value[age, time].upper
                update_mulcov_prior[(mulcov.group, *mulcov.key)].lower = lb
                update_mulcov_prior[(mulcov.group, *mulcov.key)].upper = ub
                grid.value[age, time] = update_mulcov_prior[(mulcov.group, *mulcov.key)]
        model[mulcov.group][mulcov.key] = grid

    # Construct the random effect grids, based on the parent location specified.
    if self.settings.random_effect:
        random_effect_by_rate = defaultdict(list)
        for smooth in self.settings.random_effect:
            re_grid = smooth_grid_from_smoothing_form(
                default_age_time=self.age_time_grid,
                single_age_time=self.single_age_time_grid,
                smooth=smooth
            )
            if not smooth.is_field_unset("location") and smooth.location in model.child_location:
                location = smooth.location
            else:
                location = None
            model.random_effect[(smooth.rate, location)] = re_grid
            random_effect_by_rate[smooth.rate].append(location)

        for rate_to_check, locations in random_effect_by_rate.items():
            if locations != [None] and set(locations) != set(model.child_location):
                raise RuntimeError(f"Random effect for {rate_to_check} does not have "
                                   f"entries for all child locations, only {locations} "
                                   f"instead of {model.child_location}.")

    # Lastly, constrain omega for the parent and the random effects for the children.
    if self.settings.model.constrain_omega:
        LOG.info("Adding the omega constraint.")

        if omega_df is None:
            raise RuntimeError("Need an omega data frame in order to constrain omega.")

        parent_omega = omega_df.loc[omega_df.location_id == parent_location_id].copy()
        if parent_omega.empty:
            raise RuntimeError(f"No omega values for location {parent_location_id}.")

        omega = rectangular_data_to_var(gridded_data=parent_omega)
        model.rate["omega"] = constraint_from_rectangular_data(
            rate_var=omega,
            default_age_time=self.age_time_grid
        )

        locations = set(omega_df.location_id.unique().tolist())
        children_without_omega = set(children) - set(locations)
        if children_without_omega:
            LOG.warning(f"Children of {parent_location_id} missing omega {children_without_omega} "
                        f"so not including child omega constraints")
        else:
            for child in children:
                child_omega = omega_df.loc[omega_df.location_id == child].copy()
                assert not child_omega.empty
                child_rate = rectangular_data_to_var(gridded_data=child_omega)

                def child_effect(age, time):
                    return np.log(child_rate(age, time) / omega(age, time))

                model.random_effect[("omega", child)] = constraint_from_rectangular_data(
                    rate_var=child_effect,
                    default_age_time=self.age_time_grid
                )
    return model
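The child omega random effect above is the log of the ratio of the child's other-cause mortality to the parent's, evaluated over the age-time grid. A small numeric sketch, with toy linear functions standing in for the rectangular-data Vars:

# Sketch of the child omega random effect: log(child omega / parent omega).
import numpy as np

def parent_omega(age, time):   # toy stand-in for the parent's rectangular-data Var
    return 0.01 + 0.0005 * age

def child_omega(age, time):    # toy stand-in for the child's rectangular-data Var
    return 0.012 + 0.0005 * age

def child_effect(age, time):
    return np.log(child_omega(age, time) / parent_omega(age, time))

print(round(child_effect(50.0, 2000.0), 4))  # ~0.0556: child mortality sits ~5.6% above the parent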
def test_dag_from_df(df):
    dag = LocationDAG(df=df, root=1)
    assert set(dag.dag.successors(1)) == {2, 3}
    assert set(dag.dag.successors(2)) == {4, 5}
    assert set(dag.descendants(1)) == {2, 3, 4, 5}
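The df fixture used by this test is not shown here. A minimal version consistent with the assertions above (and with the location_id/parent_id/name column names that construct_node_table expects) might look like the following; treat the column names and root parent_id as assumptions.

# Hypothetical df fixture consistent with the assertions in test_dag_from_df.
import pandas as pd
import pytest

@pytest.fixture
def df():
    return pd.DataFrame({
        "location_id": [1, 2, 3, 4, 5],
        "parent_id":   [0, 1, 1, 2, 2],
        "name":        ["global", "a", "b", "c", "d"],
    })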
def test_dag_error_missing_args():
    with pytest.raises(LocationDAGError):
        LocationDAG(location_set_version_id=0)
def run(model_version_id: int, jobmon: bool = True, make: bool = True, n_sim: int = 10,
        addl_workflow_args: Optional[str] = None, skip_configure: bool = False) -> None:
    """
    Runs the whole cascade or drill for a model version (which one is specified
    in the model version settings).

    Parameters
    ----------
    model_version_id
        The model version to run
    jobmon
        Whether or not to use Jobmon. If not using Jobmon, executes
        the commands in sequence in this session.
    make
        Whether or not to make the directory structure for the databases,
        inputs, and outputs.
    n_sim
        Number of simulations to do going down the cascade
    addl_workflow_args
    skip_configure
    """
    LOG.info(f"Starting model for {model_version_id}.")

    context = Context(model_version_id=model_version_id, make=make, configure_application=True)
    context.update_status(status='Submitted')

    settings = settings_from_model_version_id(
        model_version_id=model_version_id,
        conn_def=context.model_connection)
    dag = LocationDAG(location_set_version_id=settings.location_set_version_id,
                      gbd_round_id=settings.gbd_round_id)

    if settings.model.drill == 'drill':
        cascade_command = Drill(
            model_version_id=model_version_id,
            drill_parent_location_id=settings.model.drill_location_start,
            drill_sex=settings.model.drill_sex)
    elif settings.model.drill == 'cascade':
        location_start = None
        sex = None
        if isinstance(settings.model.drill_location_start, int):
            location_start = settings.model.drill_location_start
        if isinstance(settings.model.drill_sex, int):
            sex = settings.model.drill_sex
        cascade_command = TraditionalCascade(
            model_version_id=model_version_id,
            split_sex=settings.model.split_sex == 'most_detailed',
            dag=dag,
            n_sim=n_sim,
            location_start=settings.model.drill_location_start,
            sex=sex,
            skip_configure=skip_configure)
    else:
        raise NotImplementedError(
            f"The drill/cascade setting {settings.model.drill} is not implemented."
        )

    if jobmon:
        LOG.info("Configuring jobmon.")
        wf = jobmon_workflow_from_cascade_command(
            cc=cascade_command, context=context,
            addl_workflow_args=addl_workflow_args)
        error = wf.run()
        if error:
            context.update_status(status='Failed')
            raise RuntimeError("Jobmon workflow failed.")
    else:
        LOG.info("Running without jobmon.")
        for c in cascade_command.get_commands():
            LOG.info(f"Running {c}.")
            process = subprocess.run(
                c, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if process.returncode:
                context.update_status(status='Failed')
                raise RuntimeError(f"Command {c} failed with error "
                                   f"{process.stderr.decode()}")

    context.update_status(status='Complete')
def __init__(self, model_version_id, gbd_round_id, decomp_step_id, csmr_process_version_id, csmr_cause_id, crosswalk_version_id, country_covariate_id, conn_def, location_set_version_id=None, drill=None): """ The class that constructs all of the measurement inputs. Pulls ASDR, CSMR, crosswalk versions, and country covariates, and puts them into one data frame that then formats itself for the dismod database. Performs covariate value interpolation if age and year ranges don't match up with GBD age and year ranges. Parameters: model_version_id: (int) the model version ID gbd_round_id: (int) the GBD round ID decomp_step_id: (int) the decomp step ID csmr_process_version_id: (int) process version ID for CSMR csmr_cause_id: (int) cause to pull CSMR from crosswalk_version_id: (int) crosswalk version to use country_covariate_id: (list of int) list of covariate IDs conn_def: (str) connection definition from .odbc file (e.g. 'epi') location_set_version_id: (int) can be None, if it's none, get the best location_set_version_id for estimation hierarchy of this GBD round. drill: (int) optional, which location ID to drill from as the parent Attributes: self.decomp_step: (str) the decomp step in string form self.demographics: (cascade_at.inputs.demographics.Demographics) a demographics object that specifies the age group, sex, location, and year IDs to grab self.integrand_map: (dict) dictionary mapping from GBD measure IDs to DisMod IDs self.asdr: (cascade_at.inputs.asdr.ASDR) all-cause mortality input object self.csmr: (cascade_at.inputs.csmr.CSMR) cause-specific mortality input object from cause csmr_cause_id self.data: (cascade_at.inputs.data.CrosswalkVersion) crosswalk version data from IHME database self.covariate_data: (List[cascade_at.inputs.covariate_data.CovariateData]) list of covariate data objects that contains the raw covariate data mapped to IDs self.location_dag: (cascade_at.inputs.locations.LocationDAG) DAG of locations to be used self.population: (cascade_at.inputs.population.Population) population object that is used for covariate weighting self.data_eta: (Dict[str, float]): dictionary of eta value to be applied to each measure self.density: (Dict[str, str]): dictionary of density to be applied to each measure self.nu: (Dict[str, float]): dictionary of nu value to be applied to each measure self.dismod_data: (pd.DataFrame) resulting dismod data formatted to be used in the dismod database Usage: >>> from cascade_at.settings.base_case import BASE_CASE >>> from cascade_at.settings.settings import load_settings >>> settings = load_settings(BASE_CASE) >>> covariate_ids = [i.country_covariate_id for i in settings.country_covariate] >>> i = MeasurementInputs(model_version_id=settings.model.model_version_id, >>> gbd_round_id=settings.gbd_round_id, >>> decomp_step_id=settings.model.decomp_step_id, >>> csmr_process_version_id=None, >>> csmr_cause_id = settings.model.add_csmr_cause, >>> crosswalk_version_id=settings.model.crosswalk_version_id, >>> country_covariate_id=covariate_ids, >>> conn_def='epi', >>> location_set_version_id=settings.location_set_version_id) >>> i.get_raw_inputs() >>> i.configure_inputs_for_dismod() """ LOG.info( f"Initializing input object for model version ID {model_version_id}." 
) LOG.info(f"GBD Round ID {gbd_round_id}.") LOG.info(f"Pulling from connection {conn_def}.") self.model_version_id = model_version_id self.gbd_round_id = gbd_round_id self.decomp_step_id = decomp_step_id self.csmr_process_version_id = csmr_process_version_id self.csmr_cause_id = csmr_cause_id self.crosswalk_version_id = crosswalk_version_id self.country_covariate_id = country_covariate_id self.conn_def = conn_def self.decomp_step = ds.decomp_step_from_decomp_step_id( self.decomp_step_id) self.demographics = Demographics(gbd_round_id=self.gbd_round_id) if location_set_version_id is None: self.location_set_version_id = get_location_set_version_id( gbd_round_id=self.gbd_round_id) else: self.location_set_version_id = location_set_version_id self.location_dag = LocationDAG( location_set_version_id=self.location_set_version_id, gbd_round_id=self.gbd_round_id) if drill: LOG.info( f"This is a DRILL model, so only going to pull data associated with " f"drill location start {drill} and its descendants.") drill_descendants = list( self.location_dag.descendants(location_id=drill)) self.demographics.location_id = [drill] + drill_descendants self.exclude_outliers = True self.asdr = None self.csmr = None self.population = None self.data = None self.covariates = None self.age_groups = None self.data_eta = None self.density = None self.nu = None self.measures_to_exclude = None self.dismod_data = None self.covariate_data = None self.country_covariate_data = None self.covariate_specs = None self.omega = None
class MeasurementInputs: def __init__(self, model_version_id, gbd_round_id, decomp_step_id, csmr_process_version_id, csmr_cause_id, crosswalk_version_id, country_covariate_id, conn_def, location_set_version_id=None, drill=None): """ The class that constructs all of the measurement inputs. Pulls ASDR, CSMR, crosswalk versions, and country covariates, and puts them into one data frame that then formats itself for the dismod database. Performs covariate value interpolation if age and year ranges don't match up with GBD age and year ranges. Parameters: model_version_id: (int) the model version ID gbd_round_id: (int) the GBD round ID decomp_step_id: (int) the decomp step ID csmr_process_version_id: (int) process version ID for CSMR csmr_cause_id: (int) cause to pull CSMR from crosswalk_version_id: (int) crosswalk version to use country_covariate_id: (list of int) list of covariate IDs conn_def: (str) connection definition from .odbc file (e.g. 'epi') location_set_version_id: (int) can be None, if it's none, get the best location_set_version_id for estimation hierarchy of this GBD round. drill: (int) optional, which location ID to drill from as the parent Attributes: self.decomp_step: (str) the decomp step in string form self.demographics: (cascade_at.inputs.demographics.Demographics) a demographics object that specifies the age group, sex, location, and year IDs to grab self.integrand_map: (dict) dictionary mapping from GBD measure IDs to DisMod IDs self.asdr: (cascade_at.inputs.asdr.ASDR) all-cause mortality input object self.csmr: (cascade_at.inputs.csmr.CSMR) cause-specific mortality input object from cause csmr_cause_id self.data: (cascade_at.inputs.data.CrosswalkVersion) crosswalk version data from IHME database self.covariate_data: (List[cascade_at.inputs.covariate_data.CovariateData]) list of covariate data objects that contains the raw covariate data mapped to IDs self.location_dag: (cascade_at.inputs.locations.LocationDAG) DAG of locations to be used self.population: (cascade_at.inputs.population.Population) population object that is used for covariate weighting self.data_eta: (Dict[str, float]): dictionary of eta value to be applied to each measure self.density: (Dict[str, str]): dictionary of density to be applied to each measure self.nu: (Dict[str, float]): dictionary of nu value to be applied to each measure self.dismod_data: (pd.DataFrame) resulting dismod data formatted to be used in the dismod database Usage: >>> from cascade_at.settings.base_case import BASE_CASE >>> from cascade_at.settings.settings import load_settings >>> settings = load_settings(BASE_CASE) >>> covariate_ids = [i.country_covariate_id for i in settings.country_covariate] >>> i = MeasurementInputs(model_version_id=settings.model.model_version_id, >>> gbd_round_id=settings.gbd_round_id, >>> decomp_step_id=settings.model.decomp_step_id, >>> csmr_process_version_id=None, >>> csmr_cause_id = settings.model.add_csmr_cause, >>> crosswalk_version_id=settings.model.crosswalk_version_id, >>> country_covariate_id=covariate_ids, >>> conn_def='epi', >>> location_set_version_id=settings.location_set_version_id) >>> i.get_raw_inputs() >>> i.configure_inputs_for_dismod() """ LOG.info( f"Initializing input object for model version ID {model_version_id}." 
) LOG.info(f"GBD Round ID {gbd_round_id}.") LOG.info(f"Pulling from connection {conn_def}.") self.model_version_id = model_version_id self.gbd_round_id = gbd_round_id self.decomp_step_id = decomp_step_id self.csmr_process_version_id = csmr_process_version_id self.csmr_cause_id = csmr_cause_id self.crosswalk_version_id = crosswalk_version_id self.country_covariate_id = country_covariate_id self.conn_def = conn_def self.decomp_step = ds.decomp_step_from_decomp_step_id( self.decomp_step_id) self.demographics = Demographics(gbd_round_id=self.gbd_round_id) if location_set_version_id is None: self.location_set_version_id = get_location_set_version_id( gbd_round_id=self.gbd_round_id) else: self.location_set_version_id = location_set_version_id self.location_dag = LocationDAG( location_set_version_id=self.location_set_version_id, gbd_round_id=self.gbd_round_id) if drill: LOG.info( f"This is a DRILL model, so only going to pull data associated with " f"drill location start {drill} and its descendants.") drill_descendants = list( self.location_dag.descendants(location_id=drill)) self.demographics.location_id = [drill] + drill_descendants self.exclude_outliers = True self.asdr = None self.csmr = None self.population = None self.data = None self.covariates = None self.age_groups = None self.data_eta = None self.density = None self.nu = None self.measures_to_exclude = None self.dismod_data = None self.covariate_data = None self.country_covariate_data = None self.covariate_specs = None self.omega = None def get_raw_inputs(self): """ Get the raw inputs that need to be used in the modeling. :return: """ LOG.info("Getting all raw inputs.") self.asdr = ASDR(demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id).get_raw() self.csmr = CSMR( cause_id=self.csmr_cause_id, demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id, process_version_id=self.csmr_process_version_id).get_raw() self.data = CrosswalkVersion( crosswalk_version_id=self.crosswalk_version_id, exclude_outliers=self.exclude_outliers, demographics=self.demographics, conn_def=self.conn_def, gbd_round_id=self.gbd_round_id).get_raw() self.covariate_data = [ CovariateData(covariate_id=c, demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id).get_raw() for c in self.country_covariate_id ] self.population = Population( demographics=self.demographics, decomp_step=self.decomp_step, gbd_round_id=self.gbd_round_id).get_population() def configure_inputs_for_dismod(self, settings, mortality_year_reduction=5): """ Modifies the inputs for DisMod based on model-specific settings. :param settings: (cascade.settings.configuration.Configuration) :param mortality_year_reduction: (int) number of years to decimate csmr and asdr :return: self """ self.data_eta = self.data_eta_from_settings(settings) self.density = self.density_from_settings(settings) self.nu = self.nu_from_settings(settings) self.measures_to_exclude = self.measures_to_exclude_from_settings( settings) # If we are constraining omega, then we want to hold out the data # from the DisMod fit for ASDR (but never CSMR -- always want to fit CSMR). 
data = self.data.configure_for_dismod( measures_to_exclude=self.measures_to_exclude, relabel_incidence=settings.model.relabel_incidence) asdr = self.asdr.configure_for_dismod( hold_out=settings.model.constrain_omega) csmr = self.csmr.configure_for_dismod(hold_out=0) if settings.model.constrain_omega: self.omega = self.calculate_omega(asdr=asdr, csmr=csmr) else: self.omega = None if not csmr.empty: csmr = decimate_years(data=csmr, num_years=mortality_year_reduction) if not asdr.empty: asdr = decimate_years(data=asdr, num_years=mortality_year_reduction) self.dismod_data = pd.concat([data, asdr, csmr], axis=0, sort=True) self.dismod_data.reset_index(drop=True, inplace=True) self.dismod_data["density"] = self.dismod_data.measure.apply( self.density.__getitem__) self.dismod_data["eta"] = self.dismod_data.measure.apply( self.data_eta.__getitem__) self.dismod_data["nu"] = self.dismod_data.measure.apply( self.nu.__getitem__) # This makes the specs not just for the country covariate but adds on the # sex and one covariates. self.covariate_specs = CovariateSpecs( country_covariates=settings.country_covariate, study_covariates=settings.study_covariate) self.country_covariate_data = { c.covariate_id: c.configure_for_dismod( pop_df=self.population.configure_for_dismod(), loc_df=self.location_dag.df) for c in self.covariate_data } self.dismod_data = self.add_covariates_to_data(df=self.dismod_data) self.dismod_data.loc[self.dismod_data.hold_out.isnull(), 'hold_out'] = 0. self.dismod_data.drop(['age_group_id'], inplace=True, axis=1) return self def add_covariates_to_data(self, df): """ Add on covariates to a data frame that has age_group_id, year_id or time-age upper / lower, and location_id and sex_id. Adds both country-level and study-level covariates. :return: """ cov_dict_for_interpolation = { c.name: self.country_covariate_data[c.covariate_id] for c in self.covariate_specs.covariate_specs if c.study_country == 'country' } df = self.interpolate_country_covariate_values( df=df, cov_dict=cov_dict_for_interpolation) df = self.transform_country_covariates(df=df) df['s_sex'] = df.sex_id.map(SEX_ID_TO_NAME).map( StudyCovConstants.SEX_COV_VALUE_MAP) df['s_one'] = StudyCovConstants.ONE_COV_VALUE return df def to_gbd_avgint(self, parent_location_id, sex_id): """ Converts the demographics of the model to the avgint table. :return: """ LOG.info(f"Getting grid for the avgint table " f"for parent location ID {parent_location_id} " f"and sex_id {sex_id}.") grid = expand_grid({ 'sex_id': [sex_id], 'location_id': self.location_dag.parent_children(parent_location_id), 'year_id': self.demographics.year_id, 'age_group_id': self.demographics.age_group_id }) grid['time_lower'] = grid['year_id'].astype(int) grid['time_upper'] = grid['year_id'] + 1. grid = BaseInput( gbd_round_id=self.gbd_round_id).convert_to_age_lower_upper(df=grid) LOG.info("Adding covariates to avgint grid.") grid = self.add_covariates_to_data(df=grid) return grid @staticmethod def calculate_omega(asdr, csmr): """ Calculates other cause mortality (omega) from ASDR (mtall -- all-cause mortality) and CSMR (mtspecific -- cause-specific mortality). For most diseases, mtall is a good approximation to omega, but we calculate omega = mtall - mtspecific in case it isn't. For diseases without CSMR (self.csmr_cause_id = None), then omega = mtall. 
""" join_columns = [ 'location_id', 'time_lower', 'time_upper', 'age_lower', 'age_upper', 'sex_id' ] mtall = asdr[join_columns + ['meas_value']].copy() mtall.rename(columns={'meas_value': 'mtall'}, inplace=True) if csmr.empty: omega = mtall.copy() omega.rename(columns={'mtall': 'mean'}, inplace=True) else: mtspecific = csmr[join_columns + ['meas_value']].copy() mtspecific.rename(columns={'meas_value': 'mtspecific'}, inplace=True) omega = mtall.merge(mtspecific, on=join_columns) omega['mean'] = omega['mtall'] - omega['mtspecific'] omega.drop(columns=['mtall', 'mtspecific'], inplace=True) negative_omega = omega['mean'] < 0 if any(negative_omega): raise ValueError("There are negative values for omega. Must fix.") return omega def interpolate_country_covariate_values(self, df, cov_dict): """ Interpolates the covariate values onto the data so that the non-standard ages and years match up to meaningful covariate values. :param df: (pd.DataFrame) :param cov_dict: (Dict) """ LOG.info(f"Interpolating and merging the country covariates.") interp_df = get_interpolated_covariate_values( data_df=df, covariate_dict=cov_dict, population_df=self.population.configure_for_dismod()) return interp_df def transform_country_covariates(self, df): """ Transforms the covariate data with the transformation ID. :param df: (pd.DataFrame) :return: self """ for c in self.covariate_specs.covariate_specs: if c.study_country == 'country': LOG.info( f"Transforming the data for country covariate {c.covariate_id}." ) df[c.name] = df[c.name].apply( lambda x: COVARIATE_TRANSFORMS[c.transformation_id](x)) return df def calculate_country_covariate_reference_values(self, parent_location_id, sex_id): """ Gets the country covariate reference value for a covariate ID and a parent location ID. Also gets the maximum difference between the reference value and covariate values observed. Run this when you're going to make a DisMod AT database for a specific parent location and sex ID. :param: (int) :param parent_location_id: (int) :param sex_id: (int) :return: List[CovariateSpec] list of the covariate specs with the correct reference values and max diff. """ covariate_specs = copy(self.covariate_specs) age_min = self.dismod_data.age_lower.min() age_max = self.dismod_data.age_upper.max() time_min = self.dismod_data.time_lower.min() time_max = self.dismod_data.time_upper.max() children = list(self.location_dag.dag.successors(parent_location_id)) for c in covariate_specs.covariate_specs: if c.study_country == 'study': if c.name == 's_sex': c.reference = StudyCovConstants.SEX_COV_VALUE_MAP[ SEX_ID_TO_NAME[sex_id]] c.max_difference = StudyCovConstants.MAX_DIFFERENCE_SEX_COV elif c.name == 's_one': c.reference = StudyCovConstants.ONE_COV_VALUE c.max_difference = StudyCovConstants.MAX_DIFFERENCE_ONE_COV else: raise ValueError( f"The only two study covariates allowed are sex and one, you tried {c.name}." ) elif c.study_country == 'country': LOG.info( f"Calculating the reference and max difference for country covariate {c.covariate_id}." ) cov_df = self.country_covariate_data[c.covariate_id] parent_df = cov_df.loc[cov_df.location_id == parent_location_id].copy() child_df = cov_df.loc[cov_df.location_id.isin(children)].copy() all_loc_df = pd.concat([child_df, parent_df], axis=0) # if there is no data for the parent location at all (which there should be provided by Central Comp) # then we are going to set the reference value to 0. 
if cov_df.empty: reference_value = 0 max_difference = np.nan else: pop_df = self.population.configure_for_dismod() pop_df = pop_df.loc[pop_df.location_id == parent_location_id].copy() df_to_interp = pd.DataFrame({ 'location_id': parent_location_id, 'sex_id': [sex_id], 'age_lower': [age_min], 'age_upper': [age_max], 'time_lower': [time_min], 'time_upper': [time_max] }) reference_value = get_interpolated_covariate_values( data_df=df_to_interp, covariate_dict={c.name: parent_df}, population_df=pop_df)[c.name].iloc[0] max_difference = np.max( np.abs(all_loc_df.mean_value - reference_value) ) + CascadeConstants.PRECISION_FOR_REFERENCE_VALUES c.reference = reference_value c.max_difference = max_difference covariate_specs.create_covariate_list() return covariate_specs @staticmethod def measures_to_exclude_from_settings(settings): """ Gets the measures to exclude from the data from the model settings configuration. :param settings: (cascade.settings.configuration.Configuration) :return: """ if not settings.model.is_field_unset("exclude_data_for_param"): measures_to_exclude = [ INTEGRAND_MAP[m].name for m in settings.model.exclude_data_for_param if m in INTEGRAND_MAP ] else: measures_to_exclude = list() if settings.policies.exclude_relative_risk: measures_to_exclude.append("relrisk") return measures_to_exclude @staticmethod def data_eta_from_settings(settings): """ Gets the data eta from the settings Configuration. The default data eta is np.nan. settings.eta.data: (Dict[str, float]): Default value for eta parameter on distributions as a dictionary from measure name to float :param settings: (cascade.settings.configuration.Configuration) :return: """ data_eta = defaultdict(lambda: np.nan) if not settings.eta.is_field_unset("data") and settings.eta.data: data_eta = defaultdict(lambda: float(settings.eta.data)) for set_eta in settings.data_eta_by_integrand: data_eta[INTEGRAND_MAP[set_eta.integrand_measure_id].name] = float( set_eta.value) return data_eta @staticmethod def density_from_settings(settings): """ Gets the density from the settings Configuration. The default density is "gaussian". settings.model.data_density: (Dict[str, float]): Default values for density parameter on distributions as a dictionary from measure name to string :param settings: (cascade.settings.configuration.Configuration) :return: """ density = defaultdict(lambda: "gaussian") if not settings.model.is_field_unset( "data_density") and settings.model.data_density: density = defaultdict(lambda: settings.model.data_density) for set_density in settings.data_density_by_integrand: density[INTEGRAND_MAP[ set_density.integrand_measure_id].name] = set_density.value return density @staticmethod def data_cv_from_settings(settings, default=0.0): """ Gets the data min coefficient of variation from the settings Configuration Args: settings: (cascade.settings.configuration.Configuration) default: (float) default data cv Returns: dictionary of data cv's from settings """ data_cv = defaultdict(lambda: default) if not settings.model.is_field_unset( "minimum_meas_cv") and settings.model.minimum_meas_cv: data_cv = defaultdict( lambda: float(settings.model.minimum_meas_cv)) for set_data_cv in settings.data_cv_by_integrand: data_cv[INTEGRAND_MAP[ set_data_cv.integrand_measure_id].name] = float( set_data_cv.value) return data_cv @staticmethod def nu_from_settings(settings): """ Gets nu from the settings Configuration. The default nu is np.nan. 
settings.students_dof.data: (Dict[str, float]): The parameter for students-t distributions settings.log_students_dof.data: (Dict[str, float]): The parameter for students-t distributions in log-space :param settings: (cascade.settings.configuration.Configuration) :return: """ nu = defaultdict(lambda: np.nan) nu["students"] = settings.students_dof.data nu["log_students"] = settings.log_students_dof.data return nu def reset_index(self, drop, inplace): pass
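The calculate_omega static method above computes other-cause mortality as all-cause mortality minus cause-specific mortality, joined on the demographic columns. A worked sketch of that arithmetic on toy one-row frames:

# Worked sketch of calculate_omega's arithmetic: omega = mtall - mtspecific.
import pandas as pd

join_columns = ['location_id', 'time_lower', 'time_upper', 'age_lower', 'age_upper', 'sex_id']
asdr = pd.DataFrame([[1, 1990, 1991, 0, 5, 2, 0.010]], columns=join_columns + ['meas_value'])
csmr = pd.DataFrame([[1, 1990, 1991, 0, 5, 2, 0.002]], columns=join_columns + ['meas_value'])

mtall = asdr.rename(columns={'meas_value': 'mtall'})
mtspecific = csmr.rename(columns={'meas_value': 'mtspecific'})
omega = mtall.merge(mtspecific, on=join_columns)
omega['mean'] = omega['mtall'] - omega['mtspecific']
print(omega['mean'].iloc[0])  # 0.008 (up to float precision)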
def l_dag(df):
    return LocationDAG(df=df, root=1)
def test_dag_no_root(df):
    with pytest.raises(LocationDAGError):
        LocationDAG(df=df)
def dag(df):
    return LocationDAG(df=df)
def run(model_version_id: int, jobmon: bool = True, make: bool = True,
        n_sim: int = 10, n_pool: int = 10,
        addl_workflow_args: Optional[str] = None, skip_configure: bool = False,
        json_file: Optional[str] = None, test_dir: Optional[str] = None,
        execute_dag: bool = True) -> None:
    """
    Runs the whole cascade or drill for a model version (whichever one
    is specified in the model version settings).

    Creates a cascade command and a bunch of cascade operations based
    on the model version settings. More information on this structure
    is in :ref:`executor`.

    Parameters
    ----------
    model_version_id
        The model version to run
    jobmon
        Whether or not to use Jobmon. If not using Jobmon, executes
        the commands in sequence in this session.
    make
        Whether or not to make the directory structure for the databases,
        inputs, and outputs.
    n_sim
        Number of simulations to do going down the cascade
    addl_workflow_args
        Additional workflow args to add to the jobmon workflow name
        so that it is unique if you're testing
    skip_configure
        Skip configuring the inputs (only do this if they have already been configured)
    """
    LOG.info(f"Starting model for {model_version_id}.")

    context = Context(model_version_id=model_version_id, make=make,
                      configure_application=not skip_configure,
                      root_directory=test_dir)
    context.update_status(status='Submitted')

    if json_file:
        with open(json_file) as fn:
            LOG.info(f"Reading settings from {json_file}")
            parameter_json = json.loads(fn.read())
            settings = load_settings(parameter_json)
        # Save the json file as it is used throughout the cascade
        LOG.info(f"Replacing {context.settings_file}")
        context.write_inputs(settings=parameter_json)
    else:
        settings = settings_from_model_version_id(
            model_version_id=model_version_id,
            conn_def=context.model_connection)

    dag = LocationDAG(location_set_version_id=settings.location_set_version_id,
                      gbd_round_id=settings.gbd_round_id)

    if settings.model.drill == 'drill':
        cascade_command = Drill(
            model_version_id=model_version_id,
            drill_parent_location_id=settings.model.drill_location_start,
            drill_sex=settings.model.drill_sex,
            n_sim=n_sim, n_pool=n_pool,
            skip_configure=skip_configure,
        )
    elif settings.model.drill == 'cascade':
        location_start = None
        sex = None
        if isinstance(settings.model.drill_location_start, int):
            location_start = settings.model.drill_location_start
        if isinstance(settings.model.drill_sex, int):
            sex = settings.model.drill_sex
        cascade_command = TraditionalCascade(
            model_version_id=model_version_id,
            split_sex=settings.model.split_sex == 'most_detailed',
            dag=dag,
            n_sim=n_sim, n_pool=n_pool,
            location_start=settings.model.drill_location_start,
            sex=sex,
            skip_configure=skip_configure,
        )
    else:
        raise NotImplementedError(
            f"The drill/cascade setting {settings.model.drill} is not implemented."
        )

    dag_cmds_path = (context.inputs_dir / 'dag_commands.txt')
    LOG.info(f"Writing cascade dag commands to {dag_cmds_path}.")
    dag_cmds_path.write_text('\n'.join(cascade_command.get_commands()))

    if not execute_dag:
        return

    if jobmon:
        LOG.info("Configuring jobmon.")
        wf = jobmon_workflow_from_cascade_command(
            cc=cascade_command, context=context,
            addl_workflow_args=addl_workflow_args)
        wf_run = wf.run(seconds_until_timeout=60 * 60 * 24 * 3, resume=True)
        if wf_run.status != 'D':
            context.update_status(status='Failed')
            raise RuntimeError("Jobmon workflow failed.")
    else:
        LOG.info("Running without jobmon.")
        for c in cascade_command.get_commands():
            LOG.info(f"Running {c}")
            process = subprocess.run(
                c, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if process.returncode:
                context.update_status(status='Failed')
                raise RuntimeError(f"Command {c} failed with error "
                                   f"{process.stderr.decode()}")
            if process.stderr:
                print(process.stderr.decode())
            if process.stdout:
                print(process.stdout.decode())

    context.update_status(status='Complete')