def run_phase(df, nid, extract_type_id, data_type_id, source,
              representative_id, code_system_id):
    """Collapse source data to location/year/age/sex/detail-level deaths.

    The incoming data gets a cause detail level merged on, is flagged for
    national representativeness, and is then summed over ``deaths``.

    Args:
        df (DataFrame): incoming data; must carry the grouping columns used
            below and a ``deaths`` column by the time it is collapsed.
        nid: accepted for interface compatibility; not read in this body.
        extract_type_id: accepted for interface compatibility; not read in
            this body.
        data_type_id: forwarded to ``assign_nationally_representative``.
        source: forwarded to ``assign_nationally_representative``.
        representative_id: forwarded to ``assign_nationally_representative``.
        code_system_id: used to pull the cause-to-detail-level map.

    Returns:
        DataFrame: one row per combination of the grouping columns with the
        summed ``deaths``.
    """
    # caching settings handed to the representativeness helper
    db_cache_dir = Configurator('standard').get_directory('db_cache')
    cache_options = dict(
        block_rerun=True,
        cache_dir=db_cache_dir,
        force_rerun=False,
        cache_results=False,
    )

    print_log_message("Pulling map from cause to detail level")
    cause_to_detail = get_map_to_package_metadata(code_system_id)

    # attach the detail level of each cause to the incoming data
    print("Merging detail level onto data")
    with_detail = merge_with_detail_map(df, cause_to_detail)

    # The legacy Stata process looked at national/subnational in the source
    # type; that is no longer available, so "representative_id" is the best
    # signal we have for national representativeness.
    print_log_message("Determining national coverage")
    with_coverage = assign_nationally_representative(
        with_detail, source, representative_id, data_type_id, cache_options
    )

    print_log_message("Collapsing data.")
    collapse_cols = [
        "location_id", "year_id", "nid", "extract_type_id", "source",
        "data_type_id", "detail_level_id", "nationally_representative",
        "age_group_id", "sex_id",
    ]
    grouped = with_coverage.groupby(collapse_cols, as_index=False)
    return grouped["deaths"].sum()
def run_phase(df, cause_set_version_id, location_set_version_id, data_type_id,
              env_run_id, source, nid, extract_type_id, remove_decimal,
              code_map_version_id):
    """Run the full pipeline, chaining together CodProcesses.

    The steps run strictly in the order written: every stage consumes the
    dataframe produced by the previous one, so statements must not be
    reordered.

    Args:
        df (DataFrame): incoming data for one nid / extract_type_id.
        cause_set_version_id: version used to pull the cause hierarchy.
        location_set_version_id: version used to pull the location hierarchy.
        data_type_id: compared against the module-level data type constants
            to switch the optional steps on or off.
        env_run_id: envelope run pulled both without and with HIV.
        source (str): source name; selects source-specific steps.
        nid: used to look up the code system and the earlier phase outputs.
        extract_type_id: used together with ``nid`` for the same lookups.
        remove_decimal: forwarded to the garbage adder and the
            redistribution variance estimator.
        code_map_version_id: version used to pull the cause map.

    Returns:
        DataFrame: the data after all steps below have been applied.
    """
    # shared caching keyword arguments for all metadata pulls below
    configurator = Configurator('standard')
    cache_dir = configurator.get_directory('db_cache')
    cache_options = {
        'block_rerun': True,
        'cache_dir': cache_dir,
        'force_rerun': False,
        'cache_results': False
    }

    # get cause hierarchy
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id,
        **cache_options)

    # get location hierarchy
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id,
        **cache_options)

    # get envelope
    env_meta_df = get_env(env_run_id=env_run_id, **cache_options)

    # get env with HIV
    env_hiv_meta_df = get_env(env_run_id=env_run_id, with_hiv=True,
                              **cache_options)

    # get age groups
    age_meta_df = get_ages(**cache_options)

    # the code system is looked up from the nid rather than passed in
    code_system_id = int(
        get_value_from_nid(nid, 'code_system_id',
                           extract_type_id=extract_type_id))

    cause_map = get_cause_map(code_map_version_id=code_map_version_id,
                              **cache_options)

    package_map = get_package_map(code_system_id=code_system_id,
                                  **cache_options)

    # outputs of earlier phases, needed by the parent-mapped garbage adder
    disagg_df = get_phase_output("disaggregation", nid, extract_type_id)
    misdc_df = get_phase_output("misdiagnosiscorrection", nid,
                                extract_type_id)

    cause_package_hierarchy = get_cause_package_hierarchy(code_system_id)

    if source == "Cancer_Registry":
        df = prune_cancer_registry_data(df, location_meta_df)

    # aggregate location
    print_log_message("Aggregating location to country level")
    location_aggregator = LocationAggregator(df, location_meta_df)
    df = location_aggregator.get_computed_dataframe()

    # NOTE(review): the constant is named POLICE_SURVEY_DATA_TYPE but it
    # gates maternal-specific steps here and below -- confirm it holds the
    # intended data types.
    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        # special step to remove HIV from maternal data
        print_log_message("Removing HIV from cc_code for maternal data.")
        maternal_hiv_remover = MaternalHIVRemover(df, env_meta_df,
                                                  env_hiv_meta_df, source,
                                                  nid)
        df = maternal_hiv_remover.get_computed_dataframe()

    print_log_message("Calculating sample size")
    df = calc_sample_size(df)
    print_log_message(log_statistic(df))

    print_log_message("Converting to cause fractions")
    # rows without a positive sample size are dropped before the conversion
    df = df.loc[df['sample_size'] > 0]
    df = convert_to_cause_fractions(
        df, ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw'])
    print_log_message(log_statistic(df))

    if data_type_id == VA_DATA_TYPE:
        # run VA anemia adjustment
        print_log_message("Running VA Anemia adjustment")
        va_anemia_adjuster = AnemiaAdjuster()
        df = va_anemia_adjuster.get_computed_dataframe(df)

    if data_type_id in POLICE_SURVEY_DATA_TYPE:
        cause_list = df.cause_id.unique()
        # square when the data holds exactly two causes, one being CC_CODE
        square_me = (len(cause_list) == 2) & (CC_CODE in cause_list)
        if (source in MATERNAL_SQUARED) or square_me:
            print_log_message("Squaring maternal data")
            df = square_maternal_sources(df, cause_meta_df, age_meta_df)

    print_log_message("Dropping cc code")
    df = drop_cc_code(df)
    print_log_message(log_statistic(df))

    print_log_message("Splitting locations.")
    env_loc_splitter = EnvelopeLocationSplitter(df, env_meta_df, source)
    df = env_loc_splitter.get_computed_dataframe()
    print_log_message(log_statistic(df))

    # aggregate causes
    print_log_message("Aggregating causes")
    cause_aggregator = CauseAggregator(df, cause_meta_df, source)
    df = cause_aggregator.get_computed_dataframe()
    print_log_message(log_statistic(df))

    print_log_message("Adding parnt-mapped garbage to aggregated causes")
    parent_gbg_adder = ParentMappedAggregatedGarbageAdder(
        nid, extract_type_id, source, cause_package_hierarchy,
        cause_meta_df, package_map, cause_map, remove_decimal, disagg_df,
        misdc_df)
    df = parent_gbg_adder.get_computed_dataframe(df)

    print_log_message("Applying hiv-prevalance in pregnancy adjustment to "
                      "maternal deaths")
    hmp = HIVMatPAFs()
    df = hmp.get_computed_dataframe(df, cause_meta_df, location_meta_df)
    print_log_message(log_statistic(df))

    print_log_message(
        "Removing HIV and shocks from cause fraction denominator")
    hiv_shock_remover = SampleSizeCauseRemover(cause_meta_df)
    df = hiv_shock_remover.get_computed_dataframe(df)
    print_log_message(log_statistic(df))

    df = conform_one_like_cf_to_one(df)

    print_log_message("Verifying cause fractions not null between 0 and 1")
    assert_valid_cause_fractions(df)

    if dataset_has_redistribution_variance(data_type_id, source):
        # Determine the redistribution variance
        rdvar = RedistributionVarianceEstimator(nid, extract_type_id,
                                                cause_meta_df,
                                                remove_decimal,
                                                code_system_id, cause_map,
                                                package_map)
        df = rdvar.get_computed_dataframe(df)

    return df
class HIVMatPAFs(CodProcess):
    """Split maternal cause fractions into maternal and maternal-HIV parts.

    Rows for cause_id 366 (maternal) that fall inside the maternal age
    range, are female (sex_id 2) and are from 1980 onwards are merged with
    per-year proportion files, split into cause_id 366 and cause_id 741
    (maternal HIV) cause fractions, and stitched back onto the rest of the
    data.
    """

    calc_cf_col = 'cf'
    all_cf_cols = ['cf', 'cf_raw', 'cf_corr', 'cf_rd']
    # Parent location_ids whose proportions duplicate_national_props copies
    # down to their subnationals. That step is no longer part of
    # get_computed_dataframe, but the method reads this attribute, so it is
    # kept defined to stop the method failing with AttributeError.
    need_subnational_props = [51, 16, 86, 214, 165]

    def __init__(self):
        self.configurator = Configurator('standard')
        self.cache_dir = self.configurator.get_directory('db_cache')
        self.maternal_hiv_props_path = \
            self.configurator.get_directory('maternal_hiv_props')

    def get_computed_dataframe(self, df, cause_meta_df, location_meta_df):
        """Apply the maternal HIV proportions to the maternal rows of ``df``.

        Args:
            df (DataFrame): cause fraction data.
            cause_meta_df (DataFrame): cause hierarchy; supplies the maternal
                age range.
            location_meta_df (DataFrame): location hierarchy. Not read any
                more now that subnational duplication is switched off; kept
                so the call signature stays stable.

        Returns:
            DataFrame: ``df`` unchanged when there is no maternal data to
            adjust, otherwise the data with the adjusted maternal rows,
            collapsed over every non-value column.
        """
        restricted_maternal_df = \
            self.restrict_to_maternal_data(df, cause_meta_df)
        if restricted_maternal_df is None:
            # nothing to do if there is no maternal data to adjust
            return df

        appended_pafs = self.append_maternal_pafs(
            restricted_maternal_df.year_id.unique())
        # duplicate_national_props is intentionally not called here: new
        # PAFs have been created, so the subnational fill is not needed.
        merged_data = self.merge_data_and_proportions(
            restricted_maternal_df, appended_pafs)
        percent_maternal = self.generate_percentages(merged_data)
        split_maternal = self.generate_splits(percent_maternal)
        hiv_cfs = self.create_maternal_hiv_cfs(split_maternal)
        cleaned = self.clean_adjusted_data(hiv_cfs)
        final = self.append_adjusted_orig(df, restricted_maternal_df, cleaned)

        group_cols = [
            col for col in final.columns
            if col not in self.all_cf_cols and col not in ['sample_size']
        ]
        final = final.groupby(group_cols, as_index=False).agg({
            'sample_size': 'mean',
            'cf': 'sum',
            'cf_raw': 'sum',
            'cf_corr': 'sum',
            'cf_rd': 'sum'
        })
        return final

    def restrict_to_maternal_data(self, df, cause_meta_df):
        """Restrict incoming dataframe to only maternal data.

        Returns:
            DataFrame or None: the maternal rows, or None when there are none.
        """
        df = df.copy()

        # get age start and age end for maternal ages
        maternal_metadata = cause_meta_df.loc[cause_meta_df['cause_id'] == 366]
        age_start = maternal_metadata['yll_age_start']
        assert len(age_start) == 1
        age_start = age_start.iloc[0]
        age_end = maternal_metadata.yll_age_end
        assert len(age_end) == 1
        age_end = age_end.iloc[0]

        data = add_age_metadata(df, add_cols=['simple_age'],
                                merge_col='age_group_id',
                                force_rerun=False, block_rerun=True,
                                cache_results=False,
                                cache_dir=self.cache_dir)
        data = data.rename(columns={'simple_age': 'age'})

        # Build the whole mask from `data`: mixing in a column of the
        # pre-merge `df` relies on both frames sharing an index, which the
        # metadata merge does not guarantee.
        is_maternal = (
            (data['cause_id'] == 366) &
            (data['age'] >= age_start) &
            (data['age'] <= age_end) &
            (data['sex_id'] == 2) &
            (data['year_id'] >= 1980)
        )
        # drop() returns a new frame, so no slice is mutated in place
        maternal_data = data.loc[is_maternal].drop('age', axis=1)

        if len(maternal_data) == 0:
            return None
        return maternal_data

    def append_maternal_pafs(self, years):
        """Read in proportions for every year in ``years``.

        Returns:
            DataFrame: all yearly files stacked, with ``year`` renamed to
            ``year_id``; an empty frame when ``years`` is empty.
        """
        yearly_props = []
        for year in years:
            props_path = "{}/maternal_hiv_props_{}.csv".format(
                self.maternal_hiv_props_path, int(year))
            yearly_props.append(pd.read_csv(props_path))

        if not yearly_props:
            # pd.concat refuses an empty list
            return pd.DataFrame()

        # DataFrame.append was removed in pandas 2.0; concat once instead
        props = pd.concat(yearly_props)
        props = props.rename(columns={'year': 'year_id'})
        return props

    def duplicate_national_props(self, props_df, loc_df):
        """Duplicate national proportions and fill sub national proportions.

        Note: necessary in countries that we are now modeling sub nationally,
        but since we weren't before there aren't any sub national proportions
        for maternal hiv (yet).
        """
        # copy so the parent_id overwrite below never touches loc_df
        subnational = loc_df.loc[
            loc_df['level'] > 3,
            ['location_id', 'parent_id', 'level', 'path_to_top_parent']
        ].copy()

        # Russia sub nationals are level 5 while other countries are level 4
        subnational.loc[
            subnational['level'] == 5, 'parent_id'] = \
            subnational['path_to_top_parent'].str.split(',').str[3].astype(int)

        # only keep rows with the needed sub national locations
        subnational = subnational.loc[subnational['parent_id'].isin(
            self.need_subnational_props)]

        # drop level 4 sub national location_ids for Russia
        subnational = subnational.loc[~((subnational['parent_id'] == 62) &
                                        (subnational['level'] == 4))]
        subnational = subnational[['location_id', 'parent_id']]
        subnational = subnational.rename(columns={
            'location_id': 'child_location_id',
            'parent_id': 'location_id'
        })

        # create sub national maternal_hiv proportions from national
        subnational = props_df.merge(subnational, on='location_id')
        subnational = subnational.drop('location_id', axis=1)
        subnational = subnational.rename(
            columns={'child_location_id': 'location_id'})
        props_df = pd.concat([props_df, subnational])
        assert not props_df.duplicated().any(), 'please check maternal'\
            ' proportions, there are duplicates'
        return props_df

    def merge_data_and_proportions(self, data, props):
        """Merge restricted maternal data and proportions."""
        merged_data = data.merge(props,
                                 on=['location_id', 'age_group_id', 'year_id'],
                                 how='left')
        assert merged_data.notnull().values.all(), 'maternal proportions '\
            'were not successfully merged with incoming data'
        return merged_data

    def generate_percentages(self, df):
        """Create new 'pct_maternal' column.

        This is to prepare for calculating maternal hiv cause fractions.
        ``df`` is modified in place and returned.

        Raises:
            AssertionError: if any sanity check on the percentages fails.
        """
        pct_cols = ['pct_maternal', 'pct_hiv', 'pct_maternal_hiv']

        df['pct_maternal'] = 1 - df['pct_hiv'] - df['pct_maternal_hiv']
        df.loc[df['pct_maternal'].isnull(), 'pct_maternal'] = 1
        df.loc[df['pct_hiv'].isnull(), 'pct_hiv'] = 0
        df.loc[df['pct_maternal_hiv'].isnull(), 'pct_maternal_hiv'] = 0

        assert (df['pct_maternal'] > 0).all()
        # every percentage must be present (was `.any()`, which passed as
        # soon as a single cell was non-null)
        assert df[pct_cols].notnull().values.all(), \
            'there are missing percentages'
        # the three percentages must sum to 1 on every row (the original
        # parenthesisation compared the result of all() with the tolerance)
        pct_total = df['pct_maternal'] + df['pct_hiv'] + df['pct_maternal_hiv']
        assert ((pct_total - 1).abs() < .0001).all()

        # proportion of maternal that is aggravated by hiv
        # cannot be above 13% based on USERNAME's meta-analysis; otherwise
        # this would suggest the percentage of maternal deaths that were
        # hiv positive is >1
        assert (df['pct_maternal_hiv_vr'] <= .13).all()

        # maternal_hiv should not yet exist
        assert not (df['cause_id'] == 741).any()
        return df

    def generate_splits(self, df):
        """Create a column to indicate how the data should be split.

        (depends on source type)

        Rows whose data_type_id is 7 or 5 get ``split_maternal`` = 1; all
        other rows get 0, keep their full maternal fraction and take the
        ``pct_maternal_hiv_vr`` proportion as their maternal HIV share.
        """
        df = add_nid_metadata(
            df, add_cols='data_type_id', block_rerun=True,
            cache_dir=self.cache_dir, force_rerun=False,
        )
        df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
        df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0

        not_split = df['split_maternal'] == 0
        df.loc[not_split, 'pct_maternal'] = 1
        df.loc[not_split, 'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
        df.loc[not_split, 'pct_hiv'] = 0
        df = df.drop('pct_maternal_hiv_vr', axis=1)
        return df

    def create_maternal_hiv_cfs(self, df):
        """Create cause fractions for maternal hiv.

        Returns:
            DataFrame: the maternal HIV rows (cause_id 741) followed by the
            scaled maternal rows (cause_id 366).
        """
        maternal_hiv_df = df.copy()
        maternal_hiv_df['cf'] = maternal_hiv_df['cf'] * \
            maternal_hiv_df['pct_maternal_hiv']
        maternal_hiv_df['cause_id'] = 741
        # only the final cause fraction is carried over to maternal HIV
        maternal_hiv_df['cf_raw'] = 0
        maternal_hiv_df['cf_corr'] = 0
        maternal_hiv_df['cf_rd'] = 0

        maternal_df = df.copy()
        maternal_df['cf'] = maternal_df['cf'] * maternal_df['pct_maternal']
        maternal_df['cause_id'] = 366

        return pd.concat([maternal_hiv_df, maternal_df], ignore_index=True)

    def clean_adjusted_data(self, df):
        """Clean up adjusted data to add on to the original dataset.

        Add maternal_hiv to maternal, keep the maternal_hiv,
        split_maternal 0 observations and call them maternal
        """
        va_vr = df.loc[df['split_maternal'] == 0]
        if len(va_vr) > 0:
            assert {741, 366} == set(va_vr.cause_id.unique())
        # copy before relabelling so the slice of df is never mutated
        va_vr = va_vr.loc[va_vr['cause_id'] != 366].copy()
        va_vr['cause_id'] = 366
        df = pd.concat([df, va_vr], ignore_index=True)

        df = df.groupby([
            'nid', 'extract_type_id', 'location_id', 'year_id', 'site_id',
            'age_group_id', 'sex_id', 'sample_size', 'cause_id'
        ], as_index=False)[self.all_cf_cols].sum()

        # it is possible that, using this method, cause fractions exceed 1.
        # this is meaningless and breaks noise reduction, so cap it
        # make sure that cf isn't something absurd, though
        assert (df['cf'] < 1.1).all()
        df.loc[df['cf'] > 1, 'cf'] = 1
        return df

    def append_adjusted_orig(self, orig, maternal_data, adjusted):
        """Remove original maternal data and append on adjusted."""
        # rows present in both frames are the original maternal rows
        data = orig.merge(maternal_data, how='left', indicator=True)
        data = data.loc[data['_merge'] != 'both']
        data = data.drop('_merge', axis=1)
        # DataFrame.append was removed in pandas 2.0
        data = pd.concat([data, adjusted], ignore_index=True)
        return data
class GBDCauseMapper(CodProcess):
    """Convert cause codes into cause_ids.

    Class attributes:
        id_cols (list): columns the data is collapsed over.
        data_col (list): value column(s) summed in the collapse.
        unique_cols (list): columns that must uniquely identify a row after
            the collapse.
        unnecessary_causes (list): cause_ids dropped after mapping.

    ``get_computed_dataframe`` returns a pandas DataFrame with the addition
    of cause_id.
    """

    id_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
               'age_group_id', 'sex_id', 'cause_id', 'code_id', 'site_id']
    data_col = ['deaths']
    unique_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
                   'age_group_id', 'sex_id', 'cause_id', 'code_id', 'site_id']
    # These are acauses 'sub_total', and '_sb'
    unnecessary_causes = [920, 744]
    cache_dir = str()

    def __init__(self, cause_set_version_id, code_map):
        self.cg = Configurator("standard")
        self.cache_dir = self.cg.get_directory('db_cache')
        self.cause_set_version_id = cause_set_version_id
        self.code_map = code_map

    def get_computed_dataframe(self, df, code_system_id):
        """Map code id to cause id.

        Args:
            df (DataFrame): data with a code_id column.
            code_system_id: code system the code_ids belong to.

        Returns:
            DataFrame: data with cause_id added, unnecessary causes dropped
            and deaths summed over ``id_cols``.
        """
        # make special cause adjustments
        df = self.special_cause_reassignment(df, code_system_id)

        print_log_message("Merging with cause map")
        # get code metadata from a file already cached
        df = add_code_metadata(
            df, ['cause_id'], code_system_id,
            code_map=self.code_map
        )
        report_if_merge_fail(df, 'cause_id', 'code_id')

        # Make sure the mappings are good!
        print("Asserting it's all good")
        self.assert_valid_mappings(df, code_system_id)
        df = self.drop_unnecessary_causes(df, self.unnecessary_causes)
        print("Collapsing")
        df = self.collapse_and_sum_by_deaths(df)
        return df

    def drop_unnecessary_causes(self, df, unnecessary_causes):
        """Drop causes set as unnecessary (subtotal and stillbirth).

        Returns a new dataframe; the input is left untouched.
        """
        keep = ~df['cause_id'].isin(unnecessary_causes)
        return df[keep].copy()

    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        There are instances where a PI has good reason to believe that a
        certain group of deaths were assigned to the wrong cause, and it is
        known what cause to re-assign those deaths to. Implement here.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        It can also be used (sparingly) for hotfixes like
        changing all codes with values 'acause_digest_gastrititis'
        to be named 'acause_digest_gastritis'.

        Args:
            df (DataFrame): data with cause
            code_system_id: code system used for the code metadata lookups

        Returns:
            DataFrame: with any modifications
        """
        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }

        # Some SRS codes get redistributed differently than
        # other ICD10 datasets
        df = add_nid_metadata(
            df, 'source', **cache_args
        )
        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(
                df, 'value', code_system_id=code_system_id,
                **cache_args
            )

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups")
            )
            # copy so the column assignment below does not act on a slice
            custom_grbg = custom_grbg.query('active == 1').copy()
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(
                custom_grbg, 'code_id', code_system_id=code_system_id,
                merge_col='value', **cache_args
            )
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                # THIS QUERIES THE DATABASE - BUT THERE SHOULD NEVER BE A TON
                # OF SRS JOBS HAPPENING AT ONCE SO IT SHOULD BE OK
                gp_df = get_garbage_from_package(
                    code_system_id, package_id, package_arg_type="package_id"
                )
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            # swap in the custom garbage code wherever one was found
            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
        # J96.00 - move five to four digit J96.0 (this should be a rule in
        # formatting, only keep 4 digit detail)
        five_dig_code = df['code_id'] == 13243
        df.loc[
            china_cdc_2008 & five_dig_code,
            'code_id'
        ] = 13242

        return df

    def collapse_and_sum_by_deaths(self, df):
        """Group by final columns, summing across deaths.

        Returns a new dataframe that keeps only the columns needed to move
        on to the next processing step. Also includes an assertion that
        there are no duplicates.
        """
        df = df.groupby(self.id_cols, as_index=False)[self.data_col].sum()
        self.assert_unique_cols_unique(df)
        return df

    def assert_valid_mappings(self, df, code_system_id):
        """Test that the mapping worked.

        Runs a suite of assertions to make sure that mapping was successful.

        Args:
            df (DataFrame): with at least code_id and cause_id
            code_system_id: code system used to look up the code values

        Returns:
            None

        Raises:
            AssertionError: Any condition fails
        """
        # add code value from cached code map
        print("Adding value")
        df = add_code_metadata(
            df, ['value'], code_system_id,
            force_rerun=False,
            block_rerun=True,
            cache_dir=self.cache_dir
        )
        report_if_merge_fail(df, 'value', 'code_id')

        # get acause from cached cause hierarchy
        print("Adding acause")
        df = add_cause_metadata(
            df, ['acause'],
            cause_set_version_id=self.cause_set_version_id,
            force_rerun=False,
            block_rerun=True,
            cache_dir=self.cache_dir
        )
        report_if_merge_fail(df, 'acause', 'cause_id')

        # Test that all causes starting with 'acause_' are mapped correctly.
        # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
        # 'acause__gc_X59' should be mapped to '_gc', etc.
        print("Checking implied acauses")
        # copy so the helper column is not written onto a slice of df
        check_df = df.loc[df['value'].str.startswith('acause_')].copy()
        check_df['implied_acause'] = \
            check_df['value'].str.replace('acause_', '', n=1)
        check_df.loc[
            check_df['value'].str.contains("acause__gc"),
            'implied_acause'
        ] = "_gc"

        bad_df = check_df.loc[
            check_df['acause'] != check_df['implied_acause']
        ]
        if len(bad_df) > 0:
            bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
            raise AssertionError(
                "These code values do not match their acause: "
                "\n{}".format(bad_stuff)
            )

        print("Checking for bad values")
        # assert incorrect acauses are gone
        bad_acauses = ['acause_digest_gastrititis',
                       'acause_hiv_tb',
                       'acause_tb_drug']
        bad_values = df.loc[df['value'].isin(bad_acauses)].value.unique()
        if len(bad_values) > 0:
            # report the values found here; the previous code formatted a
            # name that is undefined on this path and raised NameError
            raise AssertionError(
                "Found these bad code values in the data: {}".format(
                    bad_values)
            )

    def assert_unique_cols_unique(self, df):
        """Test that columns that should uniquely identify the dataframe do."""
        assert not df.duplicated(self.unique_cols).any()
class BridgeMapper(CodProcess):
    """Replace acauses with those in the bridge map.

    Arguments:
        source (str)
        cause_meta_df (DataFrame): cause hierarchy
        code_system (str)

    ``get_computed_dataframe`` returns the data with some cause_ids replaced;
    ``get_diagnostic_dataframe`` returns the frame captured just before the
    final collapse of the last run.
    """

    id_cols = ['nid', 'extract_type_id', 'location_id', 'year_id',
               'age_group_id', 'sex_id', 'cause_id',
               'site_id']
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    # data type id for verbal autopsy
    VA = 8

    # Set by get_computed_dataframe. Defined at class level so that
    # get_diagnostic_dataframe can be called before the first run without
    # raising AttributeError.
    diag_df = None

    def __init__(self, source, cause_meta_df, code_system):
        self.source = source
        self.code_system = code_system
        self.conf = Configurator("standard")
        self.bridge_map_path = Path(self.conf.get_directory('bridge_maps'))
        self.cause_meta_df = cause_meta_df
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': 'standard'
        }

    def get_computed_dataframe(self, df):
        """Replace acauses with those in the bridge map.

        Returns:
            DataFrame: the data collapsed over ``id_cols``; bridge mapped
            first when ``needs_bridging`` says so.
        """
        # VA sources are the only ones where this may not work
        # might need to split dataframe by data_type_id for bridge map
        df = add_nid_metadata(df, ['data_type_id'], **self.cache_options)
        has_verbal_autopsy = self.VA in df['data_type_id'].unique()
        df.drop(columns='data_type_id', inplace=True)

        if not self.needs_bridging(has_verbal_autopsy):
            self.diag_df = df
            return self.clean_up(df)

        file_name = self.get_file_name(has_verbal_autopsy)
        map_df = pd.read_csv(self.bridge_map_path / file_name)
        map_df = map_df[['acause', 'bridge_code']]

        # add acause column to deaths data
        bridge_mapped = add_cause_metadata(
            df,
            ['acause'],
            merge_col='cause_id',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in somehow...
        bridge_mapped.loc[
            bridge_mapped['cause_id'] == 606, 'acause'
        ] = 'gyne_femaleinfert'
        report_if_merge_fail(bridge_mapped, 'acause', 'cause_id')
        bridge_mapped.drop(['cause_id'], axis=1, inplace=True)

        # perform zz bridge code redistribution before other bridge mapping
        bridge_mapped = self.redistribute_zz_bridge_codes(bridge_mapped,
                                                          map_df)

        bridge_mapped = bridge_mapped.merge(
            map_df, how='left', on='acause'
        )
        bridge_mapped = self.acause_to_bridge_code(bridge_mapped)

        # bring cause_id back
        bridge_mapped = add_cause_metadata(
            bridge_mapped,
            ['cause_id'],
            merge_col='acause',
            cause_meta_df=self.cause_meta_df
        )
        # hack, this cause_id snuck in
        bridge_mapped.loc[
            bridge_mapped['acause'] == 'gyne_femaleinfert', 'cause_id'
        ] = 606
        report_if_merge_fail(bridge_mapped, 'cause_id', 'acause')

        # output diagnostic dataframe
        self.diag_df = bridge_mapped

        # drop unnecessary columns
        return self.clean_up(bridge_mapped)

    def needs_bridging(self, has_verbal_autopsy):
        """Check data type and code_system to see if the bridge map is needed.

        Returns:
            bool: True when the data has verbal autopsy, or the code system
            or the source is on one of the lists below.
        """
        code_systems_to_bridge_map = [
            "ICD9_detail", "ICD9_BTL", "ICD10_tabulated",
            "ICD8_detail", "ICD8A",
            "China_1991_2002", "India_SCD_states_rural",
            "India_MCCD_states_ICD10",
            "India_MCCD_states_ICD9", "India_SRS_states_report",
            "Russia_FMD_1989_1998",
            "ICD9_USSR_Tabulation", "INDEPTH_ICD10_VA",
            "India_Maharashtra_SCD", "India_CRS",
            "PHL_VSR_1999_2005"
        ]
        special_sources_to_bridge_map = [
            "Russia_FMD_ICD9",
            "India_SRS_states_report", "India_MCCD_Orissa_ICD10"
        ]
        # not all VA sources use a bridge map... something to think about
        # in the future, but not necessary right now
        return bool(
            has_verbal_autopsy
            or self.code_system in code_systems_to_bridge_map
            or self.source in special_sources_to_bridge_map
        )

    def get_file_name(self, has_verbal_autopsy):
        """Determine the file name needed based on the source or code system.

        Note: The default file name will be the name of the code system,
        with some exceptions. For some sources we have specified specific
        files to bridge map with, all other sources will use the file
        that matches its code_system.
        """
        source_to_sheet = {
            "India_MCCD_Orissa_ICD10": "India_MCCD_states_ICD10",
            "India_MCCD_Delhi_ICD10": "India_MCCD_states_ICD10",
            "Thailand_Public_Health_Statistics": "ICD10_tabulated",
            "India_SRS_states_report": "India_SRS_states_report",
            "UKR_databank_ICD10_tab": "ICD10_tabulated",
            "Russia_FMD_ICD9": "Russia_FMD_1989_1998",
        }
        if has_verbal_autopsy and (self.source != 'India_SRS_states_report'):
            file_name = 'INDEPTH_ICD10_VA'
        else:
            file_name = source_to_sheet.get(self.source, self.code_system)
        return file_name + '.csv'

    def redistribute_zz_bridge_codes(self, df, map_df):
        """Redistribute causes that bridge map to ZZ codes.

        A mini-redistribution, but only redistributes causes
        bridge mapped to zz codes. Only the ``deaths`` column is moved; the
        other value columns are left where they are.

        Raises:
            AssertionError: if the total of any value column changed within
                a group of the non-cause id columns.
        """
        grouping_cols = list(set(self.id_cols) - {'cause_id'})
        start_deaths = {col: df.groupby(grouping_cols)[col].sum()
                        for col in self.val_cols}

        zz_code_idxs = map_df['bridge_code'].str.startswith('ZZ-')
        # get the order to do the zz code redistribution in:
        # start on lowest level of hierarchy and work our way up
        zz_code_targets = (
            map_df
            .loc[zz_code_idxs, ['bridge_code']]
            .drop_duplicates()
            .assign(acause=lambda d: d['bridge_code'].str.replace('ZZ-', '_'))
            .merge(self.cause_meta_df, on='acause')
            .sort_values(['level', 'acause'], ascending=False)
            .loc[:, 'bridge_code']
            .tolist()
        )
        # don't distribute onto anything that maps to a zz code
        all_causes_to_zz_codes = set(map_df.loc[zz_code_idxs, 'acause'])

        for zz_code in zz_code_targets:
            child_cause_ids = get_all_related_causes(
                zz_code.strip().replace('ZZ-', '_'), self.cause_meta_df)
            child_causes = self.cause_meta_df.loc[
                self.cause_meta_df['cause_id'].isin(child_cause_ids),
                'acause'].tolist()

            acauses_to_redistribute = map_df.loc[
                map_df['bridge_code'] == zz_code, 'acause']
            to_redistribute = df['acause'].isin(acauses_to_redistribute)
            valid_child_causes = set(child_causes) - all_causes_to_zz_codes

            print_log_message('Found ZZ code: {}, deaths: {}'
                              .format(zz_code,
                                      df.loc[to_redistribute, 'deaths'].sum()))

            # distribute onto at least all combinations of these
            # this is to ensure everything in df[to_redistribute]
            # gets weights
            values_to_include = {
                'acause': valid_child_causes,
            }
            for col in grouping_cols:
                values_to_include[col] = df.loc[to_redistribute, col].unique()
            distributed = distribute(
                df[to_redistribute],
                based_on=df[df['acause'].isin(valid_child_causes)],
                distribute_over='acause',
                within=grouping_cols,
                value_col='deaths',
                values_to_include=values_to_include,
                base_value=0.001,  # this is mostly arbitrary
            )
            report_if_merge_fail(distributed, check_col='acause',
                                 merge_cols=grouping_cols)

            # what follows is an unfortunate side effect of having multiple
            # value columns in the data -- it makes the merging somewhat more
            # involved than simply appending distributed data to existing data
            # TODO: refactor this into a generic method in
            # redistribution_utils
            df = df.merge(
                distributed[grouping_cols + ['acause', 'deaths']],
                how='outer',
                on=grouping_cols + ['acause'],
                suffixes=('', '_new'),
            )
            # default to 0 deaths in all values where new variables / IDs
            # (i.e. new causes) are in the distributed data (right only)
            # and where distributed does not have data (i.e. other causes in
            # original data that weren't distributed onto) (left only)
            df[self.val_cols + ['deaths_new']] = \
                df[self.val_cols + ['deaths_new']].fillna(0)
            # Set values that were distributed away from their cause to 0.
            # This has the effect of moving deaths away from one cause to
            # another.
            df.loc[df['acause'].isin(acauses_to_redistribute), 'deaths'] = 0
            # now add distributed data to old
            df['deaths'] += df['deaths_new']
            df.drop(columns='deaths_new', inplace=True)

        # make sure deaths didn't move out of a
        # nid-etid-site-location-year-sex-age group
        for col in self.val_cols:
            end_deaths = df.groupby(grouping_cols)[col].sum()
            assert np.allclose(start_deaths[col], end_deaths), \
                "Dropped/added deaths during ZZ code redistribution: " + \
                "start {}: {}, end {}: {}".format(col, start_deaths[col],
                                                  col, end_deaths)
        return df

    def acause_to_bridge_code(self, df):
        """Replace the acause with the bridge code.

        Rows without a bridge code keep their acause. ``df`` is modified in
        place and returned.
        """
        # there might still be zz codes in the data because we aren't
        # performing zz code redistribution on the other value columns,
        # so if something is coded to i.e. _neo in the raw data, then
        # we keep it as _neo.
        bridged = df['bridge_code'].str.replace('ZZ-', '_')
        # Assign the column explicitly: an in-place Series.update() on the
        # selected column does not propagate back to df under pandas
        # Copy-on-Write. np.where works positionally, so no index alignment.
        df['acause'] = np.where(bridged.notnull(), bridged, df['acause'])
        return df

    def get_diagnostic_dataframe(self):
        """Return a diagnostic dataframe.

        Diagnostic dataframe shows all changes made due to bridge mapping.
        Maybe change this later so there is some sort of output.

        Returns:
            DataFrame or None: None (after printing a notice) when
            get_computed_dataframe has not been run yet.
        """
        if self.diag_df is None:
            print("No run of get computed dataframe yet")
        else:
            return self.diag_df

    def clean_up(self, df):
        """Group rogue duplicates."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df
inplace=True) code_system_type_dict = {1: "ICD10", 6: "ICD9"} df["code_system"] = code_system_type_dict[code_system_id] df["garbage_level"] = "" df["package_description"] = df["package_description"].str.lower() df = df[list(mcod_map)] return df if __name__ == '__main__': mcod_map = pd.read_excel("{}/mcause_map.xlsx".format( CONF.get_directory("process_inputs"))) # archive the current map mcod_map.to_excel( f"{CONF.get_directory('process_inputs')}/_archive/mcause_map_2020_07_04.xlsx", index=False) # ger rid of current inj mapping mcod_map = mcod_map.loc[~mcod_map["package_description"].str.contains( "y34|x59|ncode|nn", flags=re.IGNORECASE, regex=True)] dfs = pd.DataFrame() for code_system_id in [1, 6]: df = prep_all_inj_codes(code_system_id, mcod_map) dfs = dfs.append(df, ignore_index=True)
class Recoder(CodProcess):
    """Move deaths from one cause to another based on expert opinion.

    The recodes are applied in place on ``cause_id`` by :meth:`recode`, in a
    fixed order; later rules see the result of earlier ones. Afterwards
    secret causes are folded into their parents and duplicate rows created
    by the recodes are summed.
    """

    # columns that identify one row after recoding
    id_cols = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'age_group_id', 'sex_id', 'cause_id', 'site_id',
    ]
    # death-count columns that are summed when duplicates are collapsed
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    def __init__(self, cause_meta_df, source, code_system_id, data_type_id):
        """Store the dataset attributes that the recode rules branch on.

        Args:
            cause_meta_df: cause hierarchy dataframe; ``recode`` reads its
                ``cause_id``, ``path_to_top_parent`` and ``acause`` columns.
            source: source name of the dataset being processed.
            code_system_id: code system of the dataset.
            data_type_id: data type of the dataset.
        """
        self.source = source
        self.code_system_id = code_system_id
        self.data_type_id = data_type_id
        self.cause_meta_df = cause_meta_df
        self.conf = Configurator("standard")
        self.vr_indicators_path = self.conf.get_resource('vr_indicators')
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': self.conf.get_directory('db_cache'),
        }

    def get_computed_dataframe(self, df):
        """Return ``df`` recoded, conformed to reporting causes and collapsed.

        ``data_type_id`` is merged on first when missing because several
        recode rules filter on it. Dropping low quality data is de-activated
        here until it is decided how data drops will be executed (new
        preference is not uploading them, or running them through noise
        reduction).
        """
        if 'data_type_id' not in df.columns:
            df = add_nid_metadata(df, "data_type_id", **self.cache_options)
        df = self.recode(df)
        df = self.conform_secret_causes(df)
        df = self.clean_up(df)
        return df

    def get_diagnostic_dataframe(self):
        """Return diagnostics (none are produced; returns ``None``)."""
        return None

    def recode_sids(self, df):
        """Recode SIDS (686) to neonatal (380) outside 4/5-star locations.

        Locations listed in the ``four_star_locations`` resource keep their
        SIDS deaths; every other location has them moved (02/26/18).
        """
        star_sheet_path = self.conf.get_resource("four_star_locations")
        four_five_star_locs = pd.read_csv(star_sheet_path)['location_id'].unique()
        less_than_four_star = ~df['location_id'].isin(four_five_star_locs)
        is_sids = df['cause_id'] == 686
        df.loc[is_sids & less_than_four_star, 'cause_id'] = 380
        return df

    def clean_up(self, df):
        """Group rogue duplicates by summing ``val_cols`` over ``id_cols``."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df

    def conform_secret_causes(self, df):
        """Remove secret causes and conform to the reporting cause hierarchy.

        Rows whose cause is flagged ``secret_cause == 1`` (other than
        cc_code, 919) are moved to their ``parent_id``.

        Raises:
            AssertionError: if any row has no ``parent_id`` after the
                metadata merge, or if the row count changes.
        """
        df = add_cause_metadata(df,
                                add_cols=['secret_cause', 'parent_id'],
                                cause_meta_df=self.cause_meta_df,
                                **self.cache_options)
        # inj_suicide_pesti, inj_suicide_fire and inj_suicide_hang are sent
        # to 723 rather than to their own parent
        injuries_replace_parents = [722, 720, 719]
        replaced_injuries = df['cause_id'].isin(injuries_replace_parents)
        df.loc[replaced_injuries, 'parent_id'] = 723

        secret_causes = df['secret_cause'] == 1
        not_cc_code = df['cause_id'] != 919
        len_before = len(df)
        if df['parent_id'].isnull().values.any():
            raise AssertionError('There are missing parent cause_ids')
        df.loc[secret_causes & not_cc_code, 'cause_id'] = df['parent_id']
        len_after = len(df)
        if len_before != len_after:
            raise AssertionError(
                'The length of the dataframe has changed from {} to {}'.format(
                    len_before, len_after))
        df.drop(['parent_id', 'secret_cause'], axis=1, inplace=True)
        return df

    def drop_leukemia_subtypes(self, df):
        """Remove leukemia subtype deaths created by redistribution.

        Subtype rows that have redistributed deaths (``deaths_rd > 0``) but
        no raw deaths (``deaths_raw <= 0``) are recoded to the parent
        leukemia (487).
        """
        leuk_subtypes = get_all_related_causes('neo_leukemia',
                                               self.cause_meta_df)
        # the parent itself is not a subtype
        leuk_subtypes.remove(487)
        created_by_rd = (df['deaths_rd'] > 0) & (df['deaths_raw'] <= 0)
        df.loc[df['cause_id'].isin(leuk_subtypes) & created_by_rd,
               'cause_id'] = 487
        return df

    def recode(self, df):
        """Recode based on expert judgement.

        Applies every recode rule to ``cause_id`` in place, in order. Masks
        are evaluated at the point they are defined, so the order of the
        statements below is part of the behaviour. Requires a
        ``data_type_id`` column on ``df``.

        Raises:
            AssertionError: if the India SRS cirrhosis split changes the
                death totals or leaves null values.
        """
        cause_metadata_df = self.cause_meta_df
        cause_metadata_df = cause_metadata_df[[
            "cause_id", "path_to_top_parent", "acause"
        ]]

        # recode ckd except for ckd_other to cong_other in neonates
        ckd_cause_ids = get_all_related_causes('ckd', cause_metadata_df)
        ckd_cause_ids.remove(593)
        ckd_less_other = df['cause_id'].isin(ckd_cause_ids)
        neonate = df['age_group_id'].isin([2, 3])
        df.loc[ckd_less_other & neonate, 'cause_id'] = 652

        # recode resp_copd, resp_asthma, resp_other, resp_interstitial to lri
        # in neonates
        resp_ids = [509, 515, 516, 520]
        is_cert_resp_causes = df['cause_id'].isin(resp_ids)
        df.loc[is_cert_resp_causes & neonate, 'cause_id'] = 322

        # recode resp_asthma to lri in age_group_id 4 as well
        is_asthma = df['cause_id'] == 515
        df.loc[is_asthma & (df['age_group_id'] == 4), 'cause_id'] = 322

        # Drop any maternal cause below age 10 and above age 55
        # (recode to cc_code)
        maternal_cause_ids = get_all_related_causes(366, cause_metadata_df)
        maternal_cause_ids = df['cause_id'].isin(maternal_cause_ids)
        # ages not in the maternal age range
        non_maternal_ages = np.logical_not(df['age_group_id'].isin(
            [7, 8, 9, 10, 11, 12, 13, 14, 15, 22]))
        df.loc[maternal_cause_ids & non_maternal_ages, 'cause_id'] = 919

        # Drop alzheimers below age 40 (recode to cc_code);
        # dementia cause_id = 543
        alzheimers = df['cause_id'] == 543
        under_40 = df['age_group_id'].isin(range(1, 13, 1))
        df.loc[alzheimers & under_40, 'cause_id'] = 919

        # Recode congenital causes to cc_code in ages over 70
        # (stata: substr(acause, 1, 4) == "cong")
        cong_causes = get_all_related_causes('cong', cause_metadata_df)
        congenital = df['cause_id'].isin(cong_causes)
        over_70 = df['age_group_id'].isin([19, 20, 30, 31, 32, 235])
        df.loc[congenital & over_70, "cause_id"] = 919

        # Recode neonatal-aged hepatitis (and all sub-causes) to neonatal
        # (380) for code systems 7 and 9 (ICD9_USSR_Tabulated,
        # ICD10_tabulated) and to neonatal_hemolytic (384) otherwise
        hepatitis = get_all_related_causes(400, cause_metadata_df)
        hepatitis = df['cause_id'].isin(hepatitis)
        if self.code_system_id in [7, 9]:
            df.loc[hepatitis & neonate, "cause_id"] = 380
        else:
            df.loc[hepatitis & neonate, "cause_id"] = 384

        # inj_disaster_light to inj_othunintent 2/07/18
        inj_disaster_light = df['cause_id'] == 984
        df.loc[inj_disaster_light, 'cause_id'] = 716

        # ckd diabetes type to ckd all but icd10 2/07/18
        # added ICD9_detail to exception 5/15/18
        if self.code_system_id not in [1, 6]:
            ckd_diabetes = df['cause_id'].isin([997, 998])
            df.loc[ckd_diabetes, 'cause_id'] = 589

        # The remap of diabetes subtypes (975, 976) to the parent (587) for
        # code systems other than 1, 6, 9 was removed 7/2/2019 - the results
        # of the new unspecified diabetes regression are used for everything.

        # diabetes to type 1 under 15 everywhere 2/07/18
        diabetes_type_2 = df['cause_id'] == 976
        under_15 = df['age_group_id'] < 8
        df.loc[diabetes_type_2 & under_15, 'cause_id'] = 975

        # nutrition iron and iodine to cc_code for every data type 2/07/18
        iron_or_iodine = df['cause_id'].isin([388, 390])
        df.loc[iron_or_iodine, 'cause_id'] = 919

        # cvd_ihd move to cong_heart in under one year 2/07/18
        under_1 = df['age_group_id'] < 5
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & under_1, 'cause_id'] = 643

        if 686 in df.cause_id.unique():
            df = self.recode_sids(df)

        # Need to map _neo, _mental, _infect etc to cc_code 2/07/18
        df.loc[df.cause_id.isin([344, 409, 410, 542, 558, 669, 680, 961]),
               'cause_id'] = 919

        # usually we also have to map _inj to cc_code, but in some VA we have
        # other sources for splitting _inj we do not move to cc_code 3/26/2018
        if self.data_type_id not in [6, 7, 8]:
            df.loc[df['cause_id'] == 687, 'cause_id'] = 919

        # cvd_ihd to cvd_other in ages one to 14 years 2/07/18 bridge map.
        # The mask is recomputed because under-1 ihd rows were moved above.
        one_to_14 = df['age_group_id'].isin([5, 6, 7])
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & one_to_14, 'cause_id'] = 507

        # TODO test if the distinction between this and the above is
        # necessary, e.g. would the bridge map already map neonatal_hemolytic
        # to neonatal?

        # Do shared cancer recodes (previously in cancer_recodes.do)
        cancer_recodes = get_all_related_causes([
            411, 414, 423, 426, 429, 432, 435, 438, 441, 444, 450, 453, 456,
            459, 462, 465, 468, 474, 486, 483
        ], cause_metadata_df)
        cancer_recodes = df['cause_id'].isin(cancer_recodes)
        cancer_ages = df['age_group_id'].isin(range(2, 8, 1))
        df.loc[cancer_recodes & cancer_ages, "cause_id"] = 489

        # cause 483 gets the same target at every age unless the data are
        # code system 1
        not_icd10 = self.code_system_id != 1
        neo_meso = df['cause_id'] == 483
        df.loc[neo_meso & not_icd10, "cause_id"] = 489

        # Recode digest_hernia to cc_code if source is Ethiopia_AAMSP
        # added Ethiopia_subnational_AAMSP in GBD2017
        if self.source.endswith("AAMSP"):
            digest_hernia = df['cause_id'].isin([531])
            df.loc[digest_hernia, "cause_id"] = 919

        # in these years we split a garbage of homicide/suicide to
        # their causes proportionally, now we want to recode the years
        # that we don't want to use in the homicide/suicide model.
        if self.source == "Iran_Mohsen_special_ICD10":
            homicide_and_suicide = df['cause_id'].isin(
                [724, 725, 726, 727, 941, 718, 719, 720, 721, 722, 723])
            bad_years = df['year_id'].isin(range(2007, 2015))
            df.loc[bad_years & homicide_and_suicide, "cause_id"] = 919

        # Recode war subcauses to inj_homicide in Jamaica 2005 VR
        inj_war = get_all_related_causes(945, cause_metadata_df)
        is_inj_war = df['cause_id'].isin(inj_war)
        jamaica = df['location_id'] == 115
        year_2005 = df['year_id'] == 2005
        vr = df['data_type_id'] == 9
        df.loc[is_inj_war & jamaica & year_2005 & vr, 'cause_id'] = 724

        # Recode inj_mech_gun to inj_homicide for Jamaica 2006 VR
        # "In ICD10 2005 there a large number of deaths due to homicides,
        # but in 2006 many of these deaths have moved to unintentional
        # firearms. 2006 is missing homicides deaths." - so deaths are moved
        # from unintentional firearms to homicides.
        inj_mech_gun = df['cause_id'] == 705
        year_2006 = df['year_id'] == 2006
        df.loc[inj_mech_gun & year_2006 & jamaica & vr, 'cause_id'] = 724

        # Recode digest_ibd to digest for Suriname 1995-2012 ICD10
        # "Because NR has a very bad effect on IBD in Surinam please recode
        # all of data from 1995-2012 (ICD10) for "digest_ibd" to "digest" in
        # Suriname and keep them in recoding list for every upload"
        # TODO should this be more years than just 2012? like all of ICD10?
        if self.source == "ICD10":
            digest_ibd = df['cause_id'] == 532
            suriname = df['location_id'] == 118
            year_1995_2012 = df['year_id'].isin(range(1995, 2013, 1))
            df.loc[digest_ibd & suriname & year_1995_2012, 'cause_id'] = 526

        # Recode endo_procedural (624) to cause_id 708, writ-large.
        # "GBD2013 HACK: Endo-procedural to go to inj_medical just for this
        # round. In GBD2014 it will go to endo"
        endo_prodcedural = df['cause_id'] == 624
        df.loc[endo_prodcedural, 'cause_id'] = 708

        # Recode Schizophrenia to cc_code in Tibet:
        # "Because have very bad effect in Noise Reduction"
        schizo = df['cause_id'] == 559
        tibet = df['location_id'] == 518
        df.loc[schizo & tibet, 'cause_id'] = 919

        # Recode HIV and all sub-causes before 1980 to cc_code, writ-large
        hiv = get_all_related_causes(298, cause_metadata_df)
        hiv = df['cause_id'].isin(hiv)
        pre_1980 = df['year_id'] < 1980
        df.loc[hiv & pre_1980, 'cause_id'] = 919

        # Recode diabetes and all sub-causes to neonatal, if age is neonatal
        # "2-Any death assigned to Diabetes in neonatal period (age 0-28
        # days) in all data format (Except ICD9 and ICD10 detail) including
        # all MCCD, DSP, Russia format, VA have to recode to the neonatal
        # death"
        # TODO this should be an age restriction for GBD not a recode
        diabetes_causes = get_all_related_causes(587, cause_metadata_df)
        diabetes = df['cause_id'].isin(diabetes_causes)
        df.loc[neonate & diabetes, 'cause_id'] = 380

        # Recode cvd_stroke and all subcauses to cvd in Verbal Autopsy under
        # 20 years
        # "Any death in VA and SCD that assigned to the Stroke in under age
        # 20 years have to recode to all CVD"
        # Not done in bridge map; stata code does this for all VA despite
        # SCD comment.
        # NOTE(review): range(0, 8) stops at age_group_id 7, i.e. the same
        # cutoff as under_15 (< 8) above - confirm whether the next age
        # group was meant to be included for "under 20".
        under_20 = df['age_group_id'].isin(range(0, 8, 1))
        stroke = get_all_related_causes('cvd_stroke', cause_metadata_df)
        stroke_deaths = df['cause_id'].isin(stroke)
        va = df['data_type_id'] == 8
        # cvd cause_id is 491
        df.loc[under_20 & stroke_deaths & va, 'cause_id'] = 491

        # Recode inj_trans_road_pedal to cc_code if age over 95, for
        # everything. Request 1/20/2017: "remove inj_trans_road_pedal for
        # over 95 in all countries and years"
        # TODO should this be an age restriction? questionable...
        over_95 = df['age_group_id'] == 235
        inj_trans_road_pedal = df['cause_id'] == 691
        df.loc[over_95 & inj_trans_road_pedal, 'cause_id'] = 919

        # Recode mental_schizo everywhere. Request 1/31/2017: "get rid of all
        # mental_schizo as a cause of death and map to _mental"; the code
        # sends it to 919 (cc_code), using the mask computed above.
        # TODO should this be yld_only, then? questionable...
        # TODO if maintaining this, don't need the restriction sending
        # mental_schizo to cc_code in Tibet
        df.loc[schizo, 'cause_id'] = 919

        # Recode msk and all sub-causes to cc_code in all VA and SRS
        # (request 2/14/2017): this is in the bridge map already

        # Recode cvd_pvd to cvd in Russia_FMD_1999_2011
        # Russia 1999 2011 has a weird outlier for pvd, should be cvd
        # (02/13/2017)
        if self.source == "Russia_FMD_1999_2011":
            cvd_pvd = df['cause_id'] == 502
            df.loc[cvd_pvd, 'cause_id'] = 491

        # The VR recode of mental_drug deaths in under 15 to unintentional
        # poisoning (700), added 7/8/2015, was removed on request 2/26/2018.

        # Temp fix for self imposed redistribution error
        # move suicide and homicide in these years to cc_code
        if self.source == "Iran_Mohsen_special_ICD10":
            sui_homi_causes = [
                717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 941
            ]
            sui_homi = df['cause_id'].isin(sui_homi_causes)
            bad_years = df['year_id'].isin(range(2007, 2015))
            df.loc[sui_homi & bad_years, 'cause_id'] = 919

        # In India MCCD neonatal sepsis should only be in under 1 month
        # NOTE(review): the cause list is empty, so this mask never matches
        # and the recode below is currently a no-op - confirm the intended
        # neonatal sepsis cause_id(s).
        if "India_MCCD" in self.source:
            non_neonates = np.logical_not(df['age_group_id'].isin([2, 3]))
            neonatal_sepsis = df['cause_id'].isin([])
            df.loc[non_neonates & neonatal_sepsis, 'cause_id'] = 380

        # In India_SCD_states_rural we are trying to get rid of all the
        # redistribution artifacts
        if self.source == "India_SCD_states_rural":
            warnings.warn("Implement SCD rd artifact recode")

        # Recoding state actor violence to war for proper shocks tracking
        # in ICD9_BTL & ICD10: inj_war_execution > inj_war_war
        inj_war_execution = df['cause_id'] == 854

        if self.source == "ICD9_BTL":
            # Ecuador '80-'90
            ecuador = df['location_id'] == 122
            year_1980_1990 = df['year_id'].isin(range(1980, 1991, 1))
            df.loc[inj_war_execution & ecuador & year_1980_1990,
                   'cause_id'] = 855

            # inj_war_execution > inj_war_war for BIH from 1985-91
            bih = df['location_id'] == 44
            year_1985_1991 = df['year_id'].isin(
                [1985, 1986, 1987, 1988, 1989, 1990, 1991])
            df.loc[inj_war_execution & bih & year_1985_1991,
                   'cause_id'] = 855

            # in icd9_btl there are cancer recodes to be implemented here
            warnings.warn("BTL cancer recode needed")

        if self.source == "ICD10":
            # location 143, year 2008
            irq = df['location_id'] == 143
            year_2008 = df['year_id'] == 2008
            df.loc[inj_war_execution & year_2008 & irq, 'cause_id'] = 855

        # Cirrhosis and hepatitis in India SRS did not go very well (5/26/19)
        # "Move any death from SRS in the final stage due to cirrhosis to
        # hepatitis in under 15. Move 30% death from SRS in the final stage
        # due to cirrhosis to hepatitis in between 15-24"
        if self.source == "India_SRS_states_report":
            # There should be no cirrhosis subtypes in SRS, but include them
            # in case things change
            cirrhosis_ids = [521, 522, 523, 524, 971, 525]
            hepatitis_id = 400

            # Under 15: move everything
            under_15 = df['age_group_id'] < 8
            cirrhosis = df['cause_id'].isin(cirrhosis_ids)
            df.loc[under_15 & cirrhosis, 'cause_id'] = hepatitis_id

            # 15-24: split 70/30. Totals are recorded first so the split can
            # be verified afterwards.
            start_deaths = df[self.val_cols].sum(axis=0)

            # One pair of rows (keep 70% / move 30%) per age group and
            # cirrhosis cause. Collected in a list and concatenated once:
            # DataFrame.append no longer exists in pandas >= 2.0.
            split_frames = []
            for age_group_id in [8, 9]:
                for cirrhosis_id in cirrhosis_ids:
                    small_df = pd.DataFrame({
                        'new_cause_id': [cirrhosis_id, hepatitis_id],
                        'pct': [0.70, 0.30]
                    })
                    small_df['cause_id'] = cirrhosis_id
                    small_df['age_group_id'] = age_group_id
                    split_frames.append(small_df)
            split_df = pd.concat(split_frames, sort=True)

            # Merge in the proportions and split.
            # Do not apply the split retroactively - can't take away deaths
            # from cirrhosis in earlier phases if they aren't there yet, so
            # only 'deaths' is scaled and the other value columns are zeroed
            # on the newly created hepatitis rows.
            df = df.merge(split_df, how='left',
                          on=['age_group_id', 'cause_id'])
            matches = df.new_cause_id.notnull()
            df.loc[matches, 'cause_id'] = df['new_cause_id']
            df.loc[matches, 'deaths'] = df['deaths'] * df['pct']
            for col in ['deaths_raw', 'deaths_corr', 'deaths_rd']:
                df.loc[matches & (df['new_cause_id'] == hepatitis_id),
                       col] = 0
            df.drop(["new_cause_id", "pct"], axis='columns', inplace=True)

            # explicit raises so the checks survive `python -O`
            if not np.allclose(start_deaths, df[self.val_cols].sum(axis=0)):
                raise AssertionError(
                    'India SRS cirrhosis/hepatitis split changed death '
                    'totals')
            if not df.notnull().values.all():
                raise AssertionError(
                    'Null values after India SRS cirrhosis/hepatitis split')

        # We should not have congenital in older age groups in this study
        # (nid 413649). Congenital is created by the redistribution of
        # sepsis for this study: "Result of redistrbution on sepsis have to
        # be very low, if the problem is just this one drop result of
        # redistribution due to sepsis"
        # The larger question is if/when we should create causes in VA
        malawi_va_study = df['nid'] == 413649
        congenital = df.cause_id.isin(
            get_all_related_causes('cong', cause_metadata_df))
        df.loc[malawi_va_study & congenital, 'cause_id'] = 919

        if self.source == "ICD9_detail":
            if ((df['location_id'] == 43) & (df['year_id'] == 1997)).any():
                warnings.warn("Albania homicide recode needed")

        if self.source == "ICD9_USSR_Tabulated":
            warnings.warn("Missing some homicide fixes for TJK, ARM here.")

        df = self.drop_leukemia_subtypes(df)

        # mortuary, burial, self-reported COD, census/survey,
        # and tabulated hospital data should be reduced down to just
        # injuries, maternal, and cc_code
        if self.data_type_id in [1, 3, 5, 7]:
            maternal_causes = get_all_related_causes('maternal',
                                                     cause_metadata_df)
            injury_causes = get_all_related_causes('_inj', cause_metadata_df)
            maternal = df['cause_id'].isin(maternal_causes)
            inj = df['cause_id'].isin(injury_causes)
            df.loc[~(maternal | inj), 'cause_id'] = 919

            # for sibling history, we only want maternal and cc_code
            if self.data_type_id == 5:
                df.loc[~maternal, 'cause_id'] = 919

        return df
def run_phase(df, cause_set_version_id, location_set_version_id, data_type_id,
              env_run_id, source, nid, extract_type_id, remove_decimal,
              code_map_version_id, iso3):
    """Run the full pipeline, chaining together CodProcesses.

    Loads the cause/location/envelope/age metadata and the code maps, then
    takes ``df`` through location aggregation, sample size and cause
    fraction conversion, the data-type specific adjustments, cc_code
    removal, location splitting, cause aggregation, garbage re-attachment,
    the HIV/maternal adjustment, sample-size cleanup and, where the dataset
    qualifies, redistribution variance. Returns the resulting dataframe.
    """
    def _log_stats(frame):
        # one-line summary of the current dataframe in the log
        print_log_message(log_statistic(frame))

    # every cached getter below receives the same cache settings
    cache_kwargs = {
        'block_rerun': True,
        'cache_dir': Configurator('standard').get_directory('db_cache'),
        'force_rerun': False,
        'cache_results': False,
    }

    # ---- metadata and inputs from earlier phases -------------------------
    cause_meta_df = get_current_cause_hierarchy(
        cause_set_version_id=cause_set_version_id, **cache_kwargs)
    location_meta_df = get_current_location_hierarchy(
        location_set_version_id=location_set_version_id, **cache_kwargs)
    env_meta_df = get_env(env_run_id=env_run_id, **cache_kwargs)
    env_hiv_meta_df = get_env(
        env_run_id=env_run_id, with_hiv=True, **cache_kwargs)
    age_meta_df = get_ages(**cache_kwargs)

    code_system_id = int(get_value_from_nid(
        nid, 'code_system_id', extract_type_id=extract_type_id))
    cause_map = get_cause_map(
        code_map_version_id=code_map_version_id, **cache_kwargs)
    package_map = get_package_map(
        code_system_id=code_system_id, **cache_kwargs)

    disagg_df = get_phase_output("disaggregation", nid, extract_type_id)
    misdc_df = get_phase_output(
        "misdiagnosiscorrection", nid, extract_type_id)
    cause_package_hierarchy = get_cause_package_hierarchy(
        code_system_id, **cache_kwargs)

    is_police_or_survey = data_type_id in POLICE_SURVEY_DATA_TYPE

    if source == "Cancer_Registry":
        df = prune_cancer_registry_data(df, location_meta_df)

    # ---- location aggregation --------------------------------------------
    # Simple location -> national aggregation by default; the 'full'
    # aggregation is requested for India survey data only.
    print_log_message("Aggregating location to country level")
    aggregation_args = ('full',) if data_type_id == 7 and iso3 == 'IND' else ()
    df = LocationAggregator(df, location_meta_df).get_computed_dataframe(
        *aggregation_args)

    if is_police_or_survey:
        # special step to remove HIV from maternal data
        print_log_message("Removing HIV from cc_code for maternal data.")
        df = MaternalHIVRemover(
            df, env_meta_df, env_hiv_meta_df, source, nid
        ).get_computed_dataframe()

    # ---- deaths -> cause fractions ---------------------------------------
    print_log_message("Calculating sample size")
    df = calc_sample_size(df)
    _log_stats(df)

    print_log_message("Converting to cause fractions")
    has_sample = df['sample_size'] > 0
    df = df.loc[has_sample]
    df = convert_to_cause_fractions(
        df, ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw'])
    _log_stats(df)

    # ---- data-type specific adjustments ----------------------------------
    if data_type_id == VA_DATA_TYPE:
        print_log_message("Running VA Anemia adjustment")
        df = AnemiaAdjuster().get_computed_dataframe(df)

    if data_type_id == POLICE_DATA_TYPE and source == 'Various_RTI':
        df = RTIAdjuster(
            df, cause_meta_df, age_meta_df, location_meta_df
        ).get_computed_dataframe()

    if is_police_or_survey:
        # Rows with > 0 sample size would otherwise be dropped. Most common
        # in maternal data, but relevant anywhere the data hold only cc_code
        # plus one other cause and that cause has 0 deaths for some age/sex.
        causes_present = df.cause_id.unique()
        only_cc_plus_one = (
            len(causes_present) == 2 and CC_CODE in causes_present)
        if source in MATERNAL_SQUARED or only_cc_plus_one:
            print_log_message("Squaring maternal data")
            df = square_maternal_sources(df, cause_meta_df, age_meta_df)

    print_log_message("Dropping cc code")
    df = drop_cc_code(df)
    _log_stats(df)

    # ---- location split and cause aggregation ----------------------------
    print_log_message("Splitting locations.")
    df = EnvelopeLocationSplitter(
        df, env_meta_df, source).get_computed_dataframe()
    _log_stats(df)

    print_log_message("Aggregating causes")
    df = CauseAggregator(df, cause_meta_df, source).get_computed_dataframe()
    _log_stats(df)

    print_log_message("Adding parnt-mapped garbage to aggregated causes")
    garbage_adder = ParentMappedAggregatedGarbageAdder(
        nid, extract_type_id, source, cause_package_hierarchy, cause_meta_df,
        package_map, cause_map, remove_decimal, disagg_df, misdc_df)
    df = garbage_adder.get_computed_dataframe(df)

    print_log_message("Applying hiv-prevalance in pregnancy adjustment to "
                      "maternal deaths")
    df = HIVMatPAFs().get_computed_dataframe(
        df, cause_meta_df, location_meta_df)
    _log_stats(df)

    # TO DO
    # ** In the recode step for BTL some cancer deaths were moved to the
    # cancer parent. The squaring step created 0's. Get rid of the 0's in
    # country-years the recode was previously applied to.

    print_log_message(
        "Removing HIV and shocks from cause fraction denominator")
    df = SampleSizeCauseRemover(cause_meta_df).get_computed_dataframe(df)
    _log_stats(df)

    # NOTE(review): the rationale for this step is not documented here.
    df = conform_one_like_cf_to_one(df)

    print_log_message("Verifying cause fractions not null between 0 and 1")
    assert_valid_cause_fractions(df)

    # ---- redistribution variance, only for datasets that have it ---------
    if dataset_has_redistribution_variance(data_type_id, source):
        variance_estimator = RedistributionVarianceEstimator(
            nid, extract_type_id, cause_meta_df, remove_decimal,
            code_system_id, cause_map, package_map,
            code_map_version_id=code_map_version_id)
        df = variance_estimator.get_computed_dataframe(df, **cache_kwargs)

    return df
get_current_location_hierarchy ) from cod_prep.utils import ( print_log_message, report_duplicates, report_if_merge_fail, cod_timestamp ) from cod_prep.claude.claude_io import get_claude_data, makedirs_safely from cod_prep.claude.configurator import Configurator from save_proportions_for_tableau import SharedPackage CONF = Configurator() MODEL_DATA_CODE_SYSTEMS = [1, 6] RDP_REG_DIR = CONF.get_directory('rdp_regressions') def get_package_code_ids(regression_specification, code_system_id): """Returns code_ids for garbage codes in package for given code system""" package_description = regression_specification[ 'package_descriptions' ][code_system_id] packages = get_package_list(code_system_id) package_id = packages.loc[ packages['package_description'] == package_description, 'package_id' ] assert len(package_id) == 1 package_id = package_id.iloc[0]
write to file for uploading """ CONF = Configurator('standard') # sources containing maternal deaths that are noise reduced MATERNAL_NR_SOURCES = [ "Mexico_BIRMM", "Maternal_report", "SUSENAS", "China_MMS", "China_Child", ] NR_DIR = CONF.get_directory('nr_process_data') def get_malaria_noise_reduction_model_result(malaria_model_group, launch_set_id): """Read in the csv with saved malaria model result.""" malaria_dfs = [] for model_group in malaria_model_group: if model_group != "NO_NR": malaria_filepath = "FILEPATH".format(nr=NR_DIR, model_group=model_group, lsid=launch_set_id) df = just_keep_trying(pd.read_csv, args=[malaria_filepath], max_tries=100, seconds_between_tries=6,
class Recoder(CodProcess):
    """Recode deaths from one cause_id to another with fixed rules.

    ``recode`` rewrites ``cause_id`` in place, rule after rule; secret
    causes are then moved to their parents and duplicate rows are summed.
    """

    # columns that identify one row after recoding
    id_cols = [
        'nid', 'extract_type_id', 'location_id', 'year_id',
        'age_group_id', 'sex_id', 'cause_id', 'site_id'
    ]
    # death-count columns summed when duplicates are collapsed
    val_cols = ['deaths', 'deaths_rd', 'deaths_corr', 'deaths_raw']

    def __init__(self, cause_meta_df, source, code_system_id, data_type_id):
        """Store the dataset attributes the recode rules branch on."""
        self.source = source
        self.code_system_id = code_system_id
        self.data_type_id = data_type_id
        self.cause_meta_df = cause_meta_df
        self.conf = Configurator("standard")
        self.vr_indicators_path = self.conf.get_resource('vr_indicators')
        # passed through to the cached metadata helpers
        self.cache_options = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_results': False,
            'cache_dir': self.conf.get_directory('db_cache')
        }

    def get_computed_dataframe(self, df):
        """Return ``df`` after recode, secret-cause conforming and clean up."""
        # recode() filters on data_type_id, so add it when it is missing
        if 'data_type_id' not in df.columns:
            df = add_nid_metadata(df, "data_type_id", **self.cache_options)
        df = self.recode(df)
        df = self.conform_secret_causes(df)
        df = self.clean_up(df)
        return df

    def get_diagnostic_dataframe(self):
        """Return diagnostics (not implemented; returns None)."""
        pass

    def recode_sids(self, df):
        """Recode cause 686 to 380 for locations not in the star sheet."""
        path_to_4_stars_sheet = self.conf.get_resource("four_star_locations")
        four_five_star_locs = pd.read_csv(path_to_4_stars_sheet)
        four_five_star_locs = four_five_star_locs[['location_id']]
        four_five_star_locs = four_five_star_locs.location_id.unique()
        # every location that is absent from the sheet
        less_than_four_star = ~df['location_id'].isin(four_five_star_locs)
        is_sids = df['cause_id'] == 686
        df.loc[is_sids & less_than_four_star, 'cause_id'] = 380
        return df

    def clean_up(self, df):
        """Group rogue duplicates."""
        df = df.groupby(self.id_cols, as_index=False)[self.val_cols].sum()
        return df

    def conform_secret_causes(self, df):
        """Move secret causes (except 919) to their parent cause.

        Raises AssertionError when a parent_id is missing after the metadata
        merge or when the number of rows changes.
        """
        df = add_cause_metadata(df,
                                add_cols=['secret_cause', 'parent_id'],
                                cause_meta_df=self.cause_meta_df,
                                **self.cache_options)
        # these three causes are sent to 723 instead of their own parent
        injuries_replace_parents = [722, 720, 719]
        replaced_injuries = df['cause_id'].isin(injuries_replace_parents)
        df.loc[replaced_injuries, 'parent_id'] = 723
        secret_causes = df['secret_cause'] == 1
        not_cc_code = df['cause_id'] != 919
        len_before = len(df)
        if df['parent_id'].isnull().values.any():
            raise AssertionError('There are missing parent cause_ids')
        df.loc[secret_causes & not_cc_code, 'cause_id'] = df['parent_id']
        len_after = len(df)
        if len_before != len_after:
            raise AssertionError(
                'The length of the dataframe has changed from {} to {}'.format(
                    len_before, len_after))
        # the helper columns are only needed for the recode above
        df.drop(['parent_id', 'secret_cause'], axis=1, inplace=True)
        return df

    def drop_leukemia_subtypes(self, df):
        """Recode leukemia subtype rows with rd but no raw deaths to 487."""
        leuk_subtypes = get_all_related_causes('neo_leukemia', self.cause_meta_df)
        # 487 is the target itself, so take it out of the list to recode
        leuk_subtypes.remove(487)
        df.loc[(df['cause_id'].isin(leuk_subtypes)) &
               (df['deaths_rd'] > 0) &
               (df['deaths_raw'] <= 0), 'cause_id'] = 487
        return df

    def recode(self, df):
        """Apply every recode rule to ``cause_id`` in place and return df.

        The rules run in the order written and later masks see the result of
        earlier recodes, so the statement order is part of the behaviour.
        Requires a ``data_type_id`` column on ``df``.
        """
        # only these hierarchy columns are handed to get_all_related_causes
        cause_metadata_df = self.cause_meta_df
        cause_metadata_df = cause_metadata_df[[
            "cause_id", "path_to_top_parent", "acause"
        ]]

        # ckd causes other than 593 -> 652 in age groups 2 and 3
        ckd_cause_ids = get_all_related_causes('ckd', cause_metadata_df)
        ckd_cause_ids.remove(593)
        ckd_less_other = df['cause_id'].isin(ckd_cause_ids)
        neonate = df['age_group_id'].isin([2, 3])
        df.loc[ckd_less_other & neonate, 'cause_id'] = 652

        # selected respiratory causes -> 322 in age groups 2 and 3
        resp_ids = [509, 515, 516, 520]
        is_cert_resp_causes = df['cause_id'].isin(resp_ids)
        df.loc[is_cert_resp_causes & neonate, 'cause_id'] = 322

        # cause 515 -> 322 in age group 4 as well
        is_asthma = df['cause_id'] == 515
        df.loc[is_asthma & (df['age_group_id'] == 4), 'cause_id'] = 322

        # causes under 366 outside the listed age groups -> 919
        maternal_cause_ids = get_all_related_causes(366, cause_metadata_df)
        maternal_cause_ids = df['cause_id'].isin(maternal_cause_ids)
        non_maternal_ages = np.logical_not(df['age_group_id'].isin(
            [7, 8, 9, 10, 11, 12, 13, 14, 15, 22]))
        df.loc[maternal_cause_ids & non_maternal_ages, 'cause_id'] = 919

        # cause 543 in age groups 1-12 -> 919
        alzheimers = df['cause_id'] == 543
        under_40 = df['age_group_id'].isin(range(1, 13, 1))
        df.loc[alzheimers & under_40, 'cause_id'] = 919

        # 'cong' causes in the oldest age groups -> 919
        cong_causes = get_all_related_causes('cong', cause_metadata_df)
        congenital = df['cause_id'].isin(cong_causes)
        over_70 = df['age_group_id'].isin([19, 20, 30, 31, 32, 235])
        df.loc[congenital & over_70, "cause_id"] = 919

        # causes under 400 in age groups 2 and 3: target depends on the
        # code system (380 for 7 and 9, otherwise 384)
        hepatitis = get_all_related_causes(400, cause_metadata_df)
        hepatitis = df['cause_id'].isin(hepatitis)
        if self.code_system_id in [7, 9]:
            df.loc[hepatitis & neonate, "cause_id"] = 380
        else:
            df.loc[hepatitis & neonate, "cause_id"] = 384

        # 984 -> 716 everywhere
        inj_disaster_light = df['cause_id'] == 984
        df.loc[inj_disaster_light, 'cause_id'] = 716

        # 997/998 -> 589 except in code systems 1 and 6
        if self.code_system_id not in [1, 6]:
            ckd_diabetes = df['cause_id'].isin([997, 998])
            df.loc[ckd_diabetes, 'cause_id'] = 589

        # 975/976 -> 587 except in code systems 1, 6 and 9
        if self.code_system_id not in [1, 6, 9]:
            diabetes_subtypes = df['cause_id'].isin([975, 976])
            df.loc[diabetes_subtypes, 'cause_id'] = 587

        # 976 -> 975 in age groups below 8; where the block above ran there
        # are no 976 rows left, so this only has an effect for code systems
        # 1, 6 and 9
        diabetes_type_2 = df['cause_id'] == 976
        under_15 = df['age_group_id'] < 8
        df.loc[diabetes_type_2 & under_15, 'cause_id'] = 975

        # 388 and 390 -> 919 everywhere
        iron_or_iodine = df['cause_id'].isin([388, 390])
        df.loc[iron_or_iodine, 'cause_id'] = 919

        # 493 -> 643 in age groups below 5
        under_1 = df['age_group_id'] < 5
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & under_1, 'cause_id'] = 643

        # the star sheet is only read when cause 686 is present
        if 686 in df.cause_id.unique():
            df = self.recode_sids(df)

        # listed causes -> 919 everywhere
        df.loc[df.cause_id.isin([344, 409, 410, 542, 558, 669, 680, 961]),
               'cause_id'] = 919

        # 687 -> 919 unless the data type is 6, 7 or 8
        if self.data_type_id not in [6, 7, 8]:
            df.loc[df['cause_id'] == 687, 'cause_id'] = 919

        # 493 -> 507 in age groups 5-7; the mask is recomputed because the
        # rows below age group 5 were already moved above
        one_to_14 = df['age_group_id'].isin([5, 6, 7])
        cvd_ihd = df['cause_id'] == 493
        df.loc[cvd_ihd & one_to_14, 'cause_id'] = 507

        # listed cancers and their related causes -> 489 in age groups 2-7
        cancer_recodes = get_all_related_causes([
            411, 414, 423, 426, 429, 432, 435, 438, 441, 444, 450, 453, 456,
            459, 462, 465, 468, 474, 486, 483
        ], cause_metadata_df)
        cancer_recodes = df['cause_id'].isin(cancer_recodes)
        cancer_ages = df['age_group_id'].isin(range(2, 8, 1))
        df.loc[cancer_recodes & cancer_ages, "cause_id"] = 489

        # 483 -> 489 at every age unless the code system is 1
        not_icd10 = self.code_system_id != 1
        neo_meso = df['cause_id'] == 483
        df.loc[neo_meso & not_icd10, "cause_id"] = 489

        # 531 -> 919 for sources whose name ends in "AAMSP"
        if self.source.endswith("AAMSP"):
            digest_hernia = df['cause_id'].isin([531])
            df.loc[digest_hernia, "cause_id"] = 919

        # NOTE(review): the source name compared here is the empty string,
        # so this branch only runs when self.source == "" - confirm the
        # intended source name.
        if self.source == "":
            homicide_and_suicide = df['cause_id'].isin(
                [724, 725, 726, 727, 941, 718, 719, 720, 721, 722, 723])
            bad_years = df['year_id'].isin(range(2007, 2015))
            # listed causes in 2007-2014 -> 919
            df.loc[bad_years & homicide_and_suicide, "cause_id"] = 919

        # causes under 945 -> 724 for location 115, year 2005, data type 9
        inj_war = get_all_related_causes(945, cause_metadata_df)
        is_inj_war = df['cause_id'].isin(inj_war)
        jamaica = df['location_id'] == 115
        year_2005 = df['year_id'] == 2005
        vr = df['data_type_id'] == 9
        df.loc[is_inj_war & jamaica & year_2005 & vr, 'cause_id'] = 724

        # 705 -> 724 for the same location and data type in 2006
        inj_mech_gun = df['cause_id'] == 705
        year_2006 = df['year_id'] == 2006
        df.loc[inj_mech_gun & year_2006 & jamaica & vr, 'cause_id'] = 724

        # 532 -> 526 for location 118, years 1995-2012, source "ICD10"
        if self.source == "ICD10":
            digest_ibd = df['cause_id'] == 532
            suriname = df['location_id'] == 118
            year_1995_2012 = df['year_id'].isin(range(1995, 2013, 1))
            df.loc[digest_ibd & suriname & year_1995_2012, 'cause_id'] = 526

        # 624 -> 708 everywhere
        endo_prodcedural = df['cause_id'] == 624
        df.loc[endo_prodcedural, 'cause_id'] = 708

        # 559 -> 919 for location 518; the same mask is reused further down
        # to recode 559 everywhere
        schizo = df['cause_id'] == 559
        tibet = df['location_id'] == 518
        df.loc[schizo & tibet, 'cause_id'] = 919

        # causes under 298 before 1980 -> 919
        hiv = get_all_related_causes(298, cause_metadata_df)
        hiv = df['cause_id'].isin(hiv)
        pre_1980 = df['year_id'] < 1980
        df.loc[hiv & pre_1980, 'cause_id'] = 919

        # causes under 587 in age groups 2 and 3 -> 380
        diabetes_causes = get_all_related_causes(587, cause_metadata_df)
        diabetes = df['cause_id'].isin(diabetes_causes)
        df.loc[neonate & diabetes, 'cause_id'] = 380

        # cvd_stroke causes -> 491 in data type 8 for age groups 0-7
        # NOTE(review): range(0, 8) stops at age_group_id 7, the same cutoff
        # as under_15 (< 8) above, although the variable is named under_20 -
        # confirm the intended age range.
        under_20 = df['age_group_id'].isin(range(0, 8, 1))
        stroke = get_all_related_causes('cvd_stroke', cause_metadata_df)
        stroke_deaths = df['cause_id'].isin(stroke)
        va = df['data_type_id'] == 8
        df.loc[under_20 & stroke_deaths & va, 'cause_id'] = 491

        # 691 -> 919 in age group 235
        over_95 = df['age_group_id'] == 235
        inj_trans_road_pedal = df['cause_id'] == 691
        df.loc[over_95 & inj_trans_road_pedal, 'cause_id'] = 919

        # 559 -> 919 everywhere (mask computed before the recodes above)
        df.loc[schizo, 'cause_id'] = 919

        # 502 -> 491 for this source only
        if self.source == "Russia_FMD_1999_2011":
            cvd_pvd = df['cause_id'] == 502
            df.loc[cvd_pvd, 'cause_id'] = 491

        # NOTE(review): empty source name again, see the note above; this
        # list additionally contains 717.
        if self.source == "":
            sui_homi_causes = [
                717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 941
            ]
            sui_homi = df['cause_id'].isin(sui_homi_causes)
            bad_years = df['year_id'].isin(range(2007, 2015))
            df.loc[sui_homi & bad_years, 'cause_id'] = 919

        # NOTE(review): the cause list is empty, so the mask never matches
        # and this recode is currently a no-op - confirm the intended ids.
        if "India_MCCD" in self.source:
            non_neonates = np.logical_not(df['age_group_id'].isin([2, 3]))
            neonatal_sepsis = df['cause_id'].isin([])
            df.loc[non_neonates & neonatal_sepsis, 'cause_id'] = 380

        # placeholder: only warns, nothing is recoded
        if self.source == "India_SCD_states_rural":
            warnings.warn("Implement SCD rd artifact recode")

        # 854 -> 855 for specific source/location/year combinations
        inj_war_execution = df['cause_id'] == 854

        if self.source == "ICD9_BTL":
            # location 122, 1980-1990
            ecuador = df['location_id'] == 122
            year_1980_1990 = df['year_id'].isin(range(1980, 1991, 1))
            df.loc[inj_war_execution & ecuador & year_1980_1990,
                   'cause_id'] = 855

            # location 44, 1985-1991
            bih = df['location_id'] == 44
            year_1985_1991 = df['year_id'].isin(
                [1985, 1986, 1987, 1988, 1989, 1990, 1991])
            df.loc[inj_war_execution & bih & year_1985_1991,
                   'cause_id'] = 855

            # placeholder: only warns, nothing is recoded
            warnings.warn("BTL cancer recode needed")

        if self.source == "ICD10":
            # location 143, 2008
            irq = df['location_id'] == 143
            year_2008 = df['year_id'] == 2008
            df.loc[inj_war_execution & year_2008 & irq, 'cause_id'] = 855

        # placeholders: only warn, nothing is recoded
        if self.source == "ICD9_detail":
            if ((df['location_id'] == 43) & (df['year_id'] == 1997)).any():
                warnings.warn("Albania homicide recode needed")

        if self.source == "ICD9_USSR_Tabulated":
            warnings.warn("Missing some homicide fixes for TJK, ARM here.")

        df = self.drop_leukemia_subtypes(df)

        # data types 1, 3, 5 and 7 keep only 'maternal' and '_inj' causes;
        # everything else goes to 919
        if self.data_type_id in [1, 3, 5, 7]:
            maternal_causes = get_all_related_causes('maternal',
                                                     cause_metadata_df)
            injury_causes = get_all_related_causes('_inj', cause_metadata_df)
            maternal = df['cause_id'].isin(maternal_causes)
            inj = df['cause_id'].isin(injury_causes)
            df.loc[~(maternal | inj), 'cause_id'] = 919

            # data type 5 keeps only the 'maternal' causes
            if self.data_type_id == 5:
                df.loc[~maternal, 'cause_id'] = 919

        return df
class HIVMatPAFs(CodProcess):
    """Split maternal cause fractions using maternal/HIV proportion files.

    Rows for cause 366 (the maternal cause, per the variable names used
    below) are restricted by age, sex and year, merged with per-year
    proportion files, and split into an adjusted cause-366 row and a
    cause-741 (maternal HIV) row. The adjusted rows then replace the
    original maternal rows in the incoming data.
    """

    calc_cf_col = 'cf'
    all_cf_cols = ['cf', 'cf_raw', 'cf_corr', 'cf_rd']

    def __init__(self):
        self.configurator = Configurator('standard')
        self.cache_dir = self.configurator.get_directory('db_cache')
        self.maternal_hiv_props_path = \
            self.configurator.get_directory('maternal_hiv_props')

    def get_computed_dataframe(self, df, cause_meta_df, location_meta_df):
        """Return ``df`` with maternal rows replaced by adjusted rows.

        Args:
            df: incoming data; must carry the cause fraction columns in
                ``all_cf_cols`` plus ``sample_size``.
            cause_meta_df: cause metadata, used to look up the age range of
                cause 366.
            location_meta_df: accepted for interface compatibility; not
                used by this method.

        Returns:
            ``df`` unchanged when it holds no qualifying maternal rows,
            otherwise the adjusted data collapsed over every column that
            is not a cause fraction or ``sample_size``.
        """
        restricted_maternal_df = \
            self.restrict_to_maternal_data(df, cause_meta_df)
        if restricted_maternal_df is None:
            return df

        appended_pafs = self.append_maternal_pafs(
            restricted_maternal_df.year_id.unique())
        merged_data = self.merge_data_and_proportions(
            restricted_maternal_df, appended_pafs)
        percent_maternal = self.generate_percentages(merged_data)
        split_maternal = self.generate_splits(percent_maternal)
        hiv_cfs = self.create_maternal_hiv_cfs(split_maternal)
        cleaned = self.clean_adjusted_data(hiv_cfs)
        final = self.append_adjusted_orig(df, restricted_maternal_df, cleaned)

        # Collapse on every identifying column: cause fractions are summed
        # and sample_size is averaged.
        group_cols = [
            col for col in final.columns
            if col not in self.all_cf_cols and col not in ['sample_size']
        ]
        final = final.groupby(group_cols, as_index=False).agg({
            'sample_size': 'mean',
            'cf': 'sum',
            'cf_raw': 'sum',
            'cf_corr': 'sum',
            'cf_rd': 'sum'
        })
        return final

    def restrict_to_maternal_data(self, df, cause_meta_df):
        """Restrict incoming dataframe to only maternal data.

        Keeps rows with cause 366, sex_id 2, year_id >= 1980 and an age
        inside the ``yll_age_start``/``yll_age_end`` range that
        ``cause_meta_df`` lists for cause 366.

        Returns:
            The matching rows, or ``None`` when there are none.
        """
        df = df.copy()

        # get age start and age end for maternal ages
        maternal_metadata = cause_meta_df.loc[cause_meta_df['cause_id'] == 366]
        age_start = maternal_metadata['yll_age_start']
        assert len(age_start) == 1
        age_start = age_start.iloc[0]
        age_end = maternal_metadata['yll_age_end']
        assert len(age_end) == 1
        age_end = age_end.iloc[0]

        data = add_age_metadata(df, add_cols=['simple_age'],
                                merge_col='age_group_id',
                                force_rerun=False, block_rerun=True,
                                cache_results=False,
                                cache_dir=self.cache_dir)
        data = data.rename(columns={'simple_age': 'age'})

        # Build every condition from ``data`` so the boolean masks share one
        # index; mixing in ``df`` breaks if add_age_metadata re-indexes.
        keep = (
            (data['cause_id'] == 366)
            & (data['age'] >= age_start)
            & (data['age'] <= age_end)
            & (data['sex_id'] == 2)
            & (data['year_id'] >= 1980)
        )
        # drop() returns a new frame, avoiding an in-place edit of a slice.
        maternal_data = data.loc[keep].drop('age', axis=1)

        if maternal_data.empty:
            return None
        return maternal_data

    def append_maternal_pafs(self, years):
        """Read in proportions.

        Args:
            years: iterable of years; each is cast to ``int`` to build the
                file name ``maternal_hiv_props_<year>.csv`` under
                ``self.maternal_hiv_props_path``.

        Returns:
            The concatenated contents of every file that exists, with the
            ``year`` column renamed to ``year_id``. Years without a file are
            skipped; if no file exists an empty DataFrame is returned.
        """
        frames = []
        for year in years:
            path = "{}maternal_hiv_props_{}.csv".format(
                self.maternal_hiv_props_path, int(year))
            # Read exactly the file whose existence was just checked.
            if os.path.isfile(path):
                frames.append(pd.read_csv(path))
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        props = pd.concat(frames) if frames else pd.DataFrame()
        props = props.rename(columns={'year': 'year_id'})
        return props

    def duplicate_national_props(self, props_df, loc_df):
        """Copy national proportions onto selected subnational locations.

        Subnational locations (level > 3 in ``loc_df``) whose national
        parent is listed in ``self.need_subnational_props`` receive a copy
        of the parent's rows from ``props_df``.

        NOTE(review): ``need_subnational_props`` is not set in ``__init__``
        here; confirm it is provided by the base class or the caller.
        """
        # copy() so the parent_id assignment below does not write to a view
        # of loc_df.
        subnational = loc_df.loc[
            loc_df['level'] > 3,
            ['location_id', 'parent_id', 'level', 'path_to_top_parent']
        ].copy()

        # Russia sub nationals are level 5 while other countries are level 4
        subnational.loc[
            subnational['level'] == 5, 'parent_id'] = \
            subnational['path_to_top_parent'].str.split(',').str[3].astype(int)

        # only keep rows with the needed sub national locations
        subnational = subnational.loc[subnational['parent_id'].isin(
            self.need_subnational_props)]

        # drop level 4 sub national location_ids for Russia
        subnational = subnational.loc[~((subnational['parent_id'] == 62) &
                                        (subnational['level'] == 4))]
        subnational = subnational[['location_id', 'parent_id']]
        subnational = subnational.rename(columns={
            'location_id': 'child_location_id',
            'parent_id': 'location_id'
        })

        # create sub national maternal_hiv proportions from national
        subnational = props_df.merge(subnational, on='location_id')
        subnational = subnational.drop('location_id', axis=1)
        subnational = subnational.rename(
            columns={'child_location_id': 'location_id'})
        props_df = pd.concat([props_df, subnational])
        assert not props_df.duplicated().any(), 'please check maternal'\
            ' proportions, there are duplicates'
        return props_df

    def merge_data_and_proportions(self, data, props):
        """Merge restricted maternal data and proportions.

        Left-merges ``props`` onto ``data`` by location, age group and year,
        so data rows without a matching proportion are kept with nulls.
        """
        merged_data = data.merge(props,
                                 on=['location_id', 'age_group_id', 'year_id'],
                                 how='left')
        # NOTE(review): this only fails when every value in the merged frame
        # is null, so it does not prove the proportions merged. Kept as-is
        # because the intended strictness is not clear from this file.
        assert merged_data.notnull().values.any(), 'maternal proportions '\
            'were not successfully merged with incoming data'
        return merged_data

    def generate_percentages(self, df):
        """Derive ``pct_maternal`` and validate the three percentages.

        ``pct_maternal`` is ``1 - pct_hiv - pct_maternal_hiv``. Missing
        values default to 1 for ``pct_maternal`` and 0 for the other two.
        ``df`` is modified in place and returned.

        Raises:
            AssertionError: if any percentage is non-positive or missing,
                the three do not sum to 1 within 1e-4,
                ``pct_maternal_hiv_vr`` exceeds .13, or cause 741 is
                already present.
        """
        df['pct_maternal'] = 1 - df['pct_hiv'] - df['pct_maternal_hiv']
        df['pct_maternal'] = df['pct_maternal'].fillna(1)
        df['pct_hiv'] = df['pct_hiv'].fillna(0)
        df['pct_maternal_hiv'] = df['pct_maternal_hiv'].fillna(0)

        pct_cols = ['pct_maternal', 'pct_hiv', 'pct_maternal_hiv']
        assert (df['pct_maternal'] > 0).all()
        # Every percentage must be present, not just one of them.
        assert df[pct_cols].notnull().values.all(), \
            'there are missing percentages'
        # Compare each row's deviation from 1 to the tolerance; the earlier
        # form compared the boolean result of all() to the tolerance.
        total = df['pct_maternal'] + df['pct_hiv'] + df['pct_maternal_hiv']
        assert ((total - 1).abs() < .0001).all(), \
            'percentages do not sum to 1'
        assert (df['pct_maternal_hiv_vr'] <= .13).all()
        assert not (df['cause_id'] == 741).any()
        return df

    def generate_splits(self, df):
        """Flag which rows get the full split and set their percentages.

        Rows with ``data_type_id`` 7 or 5 get ``split_maternal`` = 1 and
        keep their percentages. All other rows get ``split_maternal`` = 0,
        ``pct_maternal`` = 1, ``pct_hiv`` = 0 and ``pct_maternal_hiv`` taken
        from ``pct_maternal_hiv_vr``; that helper column is then dropped.
        """
        df = add_nid_metadata(
            df,
            add_cols='data_type_id',
            block_rerun=True,
            cache_dir=self.cache_dir,
            force_rerun=False,
        )

        df.loc[df['data_type_id'].isin([7, 5]), 'split_maternal'] = 1
        df.loc[df['split_maternal'].isnull(), 'split_maternal'] = 0

        not_split = df['split_maternal'] == 0
        df.loc[not_split, 'pct_maternal'] = 1
        df.loc[not_split, 'pct_maternal_hiv'] = df['pct_maternal_hiv_vr']
        df.loc[not_split, 'pct_hiv'] = 0

        df = df.drop('pct_maternal_hiv_vr', axis=1)
        return df

    def create_maternal_hiv_cfs(self, df):
        """Produce one cause-741 and one cause-366 row per input row.

        The 741 copy scales ``cf`` by ``pct_maternal_hiv`` and zeroes
        ``cf_raw``, ``cf_corr`` and ``cf_rd``. The 366 copy scales ``cf`` by
        ``pct_maternal`` and leaves the other cause fractions untouched.
        """
        maternal_hiv_df = df.copy()
        maternal_hiv_df['cf'] = maternal_hiv_df['cf'] * \
            maternal_hiv_df['pct_maternal_hiv']
        maternal_hiv_df['cause_id'] = 741
        maternal_hiv_df['cf_raw'] = 0
        maternal_hiv_df['cf_corr'] = 0
        maternal_hiv_df['cf_rd'] = 0

        maternal_df = df.copy()
        maternal_df['cf'] = maternal_df['cf'] * maternal_df['pct_maternal']
        maternal_df['cause_id'] = 366

        return pd.concat([maternal_hiv_df, maternal_df], ignore_index=True)

    def clean_adjusted_data(self, df):
        """Fold the adjusted rows down to one row per demographic and cause.

        For rows with ``split_maternal`` == 0, the cause-741 rows are also
        re-labelled as cause 366 and appended, so their ``cf`` is added to
        the cause-366 total as well. The result is summed over the
        identifying columns and ``cf`` is capped at 1.

        Raises:
            AssertionError: if the non-split rows do not contain exactly
                causes 741 and 366, or any summed ``cf`` is >= 1.1.
        """
        va_vr = df.loc[df['split_maternal'] == 0]
        if len(va_vr) > 0:
            assert {741, 366} == set(va_vr.cause_id.unique())
            # copy() so re-labelling does not write to a view of df.
            va_vr = va_vr.loc[va_vr['cause_id'] != 366].copy()
            va_vr['cause_id'] = 366
            df = pd.concat([df, va_vr], ignore_index=True)

        df = df.groupby([
            'nid', 'extract_type_id', 'location_id', 'year_id', 'site_id',
            'age_group_id', 'sex_id', 'sample_size', 'cause_id'
        ], as_index=False)[self.all_cf_cols].sum()

        # Small overshoot is tolerated and clipped; anything larger is an
        # error.
        assert (df['cf'] < 1.1).all()
        df.loc[df['cf'] > 1, 'cf'] = 1
        return df

    def append_adjusted_orig(self, orig, maternal_data, adjusted):
        """Remove original maternal data and append on adjusted.

        Rows of ``orig`` that also appear in ``maternal_data`` (matched on
        all shared columns) are dropped, then ``adjusted`` is appended.
        """
        data = orig.merge(maternal_data, how='left', indicator=True)
        data = data.loc[data['_merge'] != 'both'].drop('_merge', axis=1)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        data = pd.concat([data, adjusted], ignore_index=True)
        return data