Code Example #1
    def assert_df_is_square(self):
        """
        Assert that the dataframe has all locations and is square on age, sex,
        and cause.

        Raises:
            AssertionError if not true
        """
        df = self.mktscan_codcorr.copy()
        locations = get_location_metadata(location_set_id=35, gbd_round_id=4)\
                                           .query('level >= 3')[['location_id']]
        locations['join_col'] = 1
        causes = df[['cause_id']].drop_duplicates().reset_index(drop=True)
        causes['join_col'] = 1
        sexes = df[['sex_id']].drop_duplicates().reset_index(drop=True)
        sexes['join_col'] = 1
        ages = df[['age_group_id']].drop_duplicates().reset_index(drop=True)
        ages['join_col'] = 1
        # cross join on the constant join_col to build the fully square frame
        square = locations.merge(causes).merge(sexes).merge(ages)
        square = square.drop('join_col', axis=1)
        m = square.merge(df, how='inner')
        assert len(m) == len(square), \
               'the dataset is not square or is missing some locations'
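As an aside, the join_col trick above (cross joining every dimension on a constant column and then comparing row counts) is easy to reproduce on toy data. The sketch below is illustrative only and uses made-up IDs rather than anything from the GBD hierarchy.

import pandas as pd

locations = pd.DataFrame({'location_id': [10, 11], 'join_col': 1})
causes = pd.DataFrame({'cause_id': [500, 501], 'join_col': 1})
sexes = pd.DataFrame({'sex_id': [1, 2], 'join_col': 1})

# cross join on the constant column, then drop it
square = locations.merge(causes).merge(sexes).drop('join_col', axis=1)
assert len(square) == 2 * 2 * 2  # every combination is present

# a frame missing one combination fails the inner-merge length check
df = square.iloc[:-1].copy()
assert len(square.merge(df, how='inner')) != len(square)  # not square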
Code Example #2
File: gbd_hosp_prep.py  Project: cheth-rowe/ihmexp
def map_to_country(df):
    """
    much of our location data is subnational, but we sometimes want to tally things by country
    this function will map from any subnational location id to its parent country
    """
    pre = df.shape[0]
    cols = df.shape[1]

    locs = get_location_metadata(location_set_id=35)
    countries = locs.loc[locs.location_type == 'admin0',
                         ['location_id', 'location_ascii_name']].copy()
    countries.columns = ["merge_loc", "country_name"]

    df = df.merge(locs[['location_id', 'path_to_top_parent']],
                  how='left',
                  on='location_id')
    df = pd.concat([df, df.path_to_top_parent.str.split(",", expand=True)],
                   axis=1)

    if df[3].isnull().any():
        warnings.warn(
            "There are locations missing from the loc set 35 hierarchy, I'm going to break"
        )

    df['merge_loc'] = df[3].astype(int)
    df = df.merge(countries, how='left', on='merge_loc')

    to_drop = ['path_to_top_parent', 0, 1, 2, 3, 4, 5, 6, 'merge_loc']

    to_drop = [d for d in to_drop if d in df.columns]
    df.drop(to_drop, axis=1, inplace=True)

    assert df.shape[0] == pre
    assert df.shape[1] == cols + 1
    assert df.country_name.isnull().sum() == 0,\
        "Something went wrong {}".format(df[df.country_name.isnull()])
    return df
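The mapping hinges on path_to_top_parent being a comma separated lineage running global, super region, region, country, and so on, which is why index 3 of the split is the country-level id. A minimal standalone sketch with invented IDs:

import pandas as pd

locs = pd.DataFrame({
    'location_id': [4, 5],
    'path_to_top_parent': ['1,2,3,4', '1,2,3,4,5'],  # 4 = country, 5 = subnational
})
split = locs.path_to_top_parent.str.split(",", expand=True)
locs['country_id'] = split[3].astype(int)
print(locs[['location_id', 'country_id']])  # both rows map to country 4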
Code Example #3
def run_cod_age_sex_splitting(df, conn_def, cause_set_version_id, pop_run_id):
    cause_metadata = get_cause_metadata(
        cause_set_version_id=cause_set_version_id)
    possible_causes = cause_metadata['cause_id'].unique().tolist()
    for cause_id in df['cause_id'].unique().tolist():
        assert cause_id in possible_causes, "Cause ID {} not in hierarchy".format(
            cause_id)
    loc_meta = get_location_metadata(gbd_round_id=6, location_set_id=21)
    possible_locs = loc_meta['location_id'].tolist()
    df = df.loc[df['location_id'].isin(possible_locs), :]
    df = df.loc[df['best'] > 0, :]
    df['hi_best_ratio'] = df['high'] / df['best']
    df['lo_best_ratio'] = df['low'] / df['best']

    df = df.reset_index(drop=True)
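    # tag each row with a unique id so the high/low ratios can be merged back
    # onto the split results afterwards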
    df['unique_join'] = df.index
    df_merge_later = df.loc[:,
                            ['unique_join', 'hi_best_ratio', 'lo_best_ratio']]
    df = df.drop(labels=['high', 'low', 'hi_best_ratio', 'lo_best_ratio'],
                 axis=1)
    splitter = AgeSexSplitter(cause_set_version_id=cause_set_version_id,
                              pop_run_id=pop_run_id,
                              distribution_set_version_id=62,
                              id_cols=['unique_join'],
                              value_column='best')
    split_df = splitter.get_computed_dataframe(df=df,
                                               location_meta_df=loc_meta)
    split_df = pd.merge(left=split_df,
                        right=df_merge_later,
                        on=['unique_join'],
                        how='left')
    split_df['low'] = split_df['best'] * split_df['lo_best_ratio']
    split_df['high'] = split_df['best'] * split_df['hi_best_ratio']
    split_df = split_df.drop(
        labels=['unique_join', 'lo_best_ratio', 'hi_best_ratio'], axis=1)
    return split_df
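The surrounding bookkeeping (stash high/best and low/best before splitting, then multiply them back onto the split 'best' values) can be shown in isolation. The 'split' below is only a stand-in that duplicates and halves rows, not the real AgeSexSplitter.

import pandas as pd

df = pd.DataFrame({'best': [100.0], 'low': [80.0], 'high': [130.0]})
df['lo_best_ratio'] = df['low'] / df['best']
df['hi_best_ratio'] = df['high'] / df['best']

# stand-in for the age/sex split: one row becomes two rows with half the best
split = pd.concat([df, df], ignore_index=True)
split['best'] = split['best'] / 2

# re-derive low/high so the uncertainty ratios survive the split
split['low'] = split['best'] * split['lo_best_ratio']
split['high'] = split['best'] * split['hi_best_ratio']
print(split[['best', 'low', 'high']])  # ratios to best are still 0.8 and 1.3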
Code Example #4
def copy_draws(draws_dir, meid):
     locs = []
     for f in glob.glob(f"{draws_dir}/{meid}/*.csv"):
         locs.append(int( f.rsplit("/")[-1][:-4] ) )
     study_locs = db.get_demographics("epi")["location_id"]
     loc_h = db.get_location_metadata(35)
     missing = [l for l in study_locs if l not in locs]
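     # below: build a zero-draw template from an existing location file;
     # missing national (level 3) locations get all-zero draws, while missing
     # subnational (level 4) locations copy their parent country's draws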

     zero_draws = pd.read_csv(f'{draws_dir}/{meid}/101.csv')
     draw_cols = zero_draws.columns[zero_draws.columns.str.contains("draw")]
     zero_draws[draw_cols] = zero_draws[draw_cols] * 0.0

     print(len(missing))
     for place in missing:
         if loc_h.loc[loc_h.location_id == place, "level"].values[0] == 3:
             zero_draws['location_id'] = place
             zero_draws.to_csv(f'{draws_dir}/{meid}/{place}.csv')
         elif loc_h.loc[loc_h.location_id == place, "level"].values[0] == 4:
             parent = loc_h.loc[loc_h.location_id == place, "parent_id"].values[0]
             draws = pd.read_csv(f'{draws_dir}/{meid}/{parent}.csv')
             draws['location_id'] = place
             draws.to_csv(f'{draws_dir}/{meid}/{place}.csv')
         print(place)
     return None
Code Example #5
File: core.py  Project: zhouxm4/ihme-modeling
def get_location_hierarchy(location_set_id):
    result_df = get_location_metadata(location_set_id)
    return result_df[[
        'location_id', 'location_name', 'path_to_top_parent', 'parent_id',
        'level', 'is_estimate', 'most_detailed', 'sort_order'
    ]]
Code Example #6
from db_queries import get_location_metadata
from fbd_core import YearRange, argparse
from fbd_core.file_interface import FBDPath, open_xr
from datetime import datetime

EXT_YEAR = 2095

#  Height for different cell types
CELL_HT = {"title": 3, "location": 1, "stage": 0, "data_cols": 2}

# dict: Python dictionary for mapping indentation levels to their
# corresponding cause levels. Used for formatting the 'Cause' column
# in the table.
INDENT_MAP = {0: "", 1: "  ", 2: "    ", 3: "      "}
# Query gbd shared tables and get locations needed
GBD_LOC_DF = get_location_metadata(gbd_round_id=5, location_set_id=35)


def check_locs_array(df):
    """Function used to find and programmatically add in those locations that
    are in the GBD database but not in UNPD and WITT data"""
    return df["location_id"].isin(GBD_LOC_DF["location_id"])


def floating_style(list_nums):
    """Convert the decimal point in the UI to a lancet style floating single
    decimal for both past and future data

    ARGS:
    list_nums (list):
        list containing numbers to be converted to floating style period
Code Example #7
import pandas as pd
import subprocess

from db_queries import get_location_metadata

locs = get_location_metadata(location_set_id=22)
locs = locs.location_id.unique().tolist()

covid = 261
covname = 'vita_supp'
meid = 2640
measid = 5

for loc in locs:
    job_name = "covariate_{}".format(loc)
    call = ('qsub -l mem_free=10.0G -pe multi_slot 5'
            ' -cwd -P PROJECT -o'
            ' FILEPATH'
            ' -e FILEPATH -N {0}'
            ' FILEPATH'
            ' dismod_to_cov.py'
            ' {1} {2} {3} {4} {5}'.format(job_name, str(int(loc)),
                                          str(int(measid)), str(int(meid)),
                                          str(int(covid)), str(covname)))
    subprocess.call(call, shell=True)
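As an aside, the same submission can be made without shell=True by handing subprocess.call an argument list, which sidesteps shell quoting. This is only a sketch reusing the loop variables and FILEPATH placeholders above, not the project's code.

call_args = ['qsub', '-l', 'mem_free=10.0G', '-pe', 'multi_slot', '5',
             '-cwd', '-P', 'PROJECT',
             '-o', 'FILEPATH', '-e', 'FILEPATH',
             '-N', job_name,
             'FILEPATH', 'dismod_to_cov.py',
             str(int(loc)), str(int(measid)), str(int(meid)),
             str(int(covid)), covname]
subprocess.call(call_args)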
Code Example #8
def apply_corrections(df, run_id, cf_model_type):
    """
    Applies the marketscan correction factors to the hospital data at the
    icg level.  The corrections are merged on by 'age_start', 'sex_id',
    'icg_id' and 'location_id'.  Reads in the corrections from 3 static csv.

    Parameters:
        df: Pandas DataFrame
            Must be aggregated and collapsed to the icg level
        run_id: (int or str)
            Identifies which clinical run we're using, ie 1, 2, 'test'
    """

    assert "icg_id" in df.columns, "'icg_id' must exist."

    start_columns = df.columns


    if cf_model_type == 'rmodels':
        corr_files = glob.glob("FILEPATH"\
                               "FILEPATH".format(run_id))
        id_cols = ['age_start', 'sex_id', 'cf_location_id', 'icg_id', 'icg_name']
    elif cf_model_type == 'mr-brt':

        corr_files = glob.glob("FILEPATH".format(run_id))
        id_cols = ['age_start', 'sex_id', 'icg_id', 'icg_name']
    else:
        assert False, "{} is not a recognized correction factor type".format(cf_model_type)

    idx = -4

    assert corr_files, "There are no correction factor files"

    corr_list = []
    cf_names = []
    for f in corr_files:

        draw_name = os.path.basename(f)[:idx]

        draw_name = draw_name[5:]
        cf_names.append(draw_name)

        dat = pd.read_csv(f)


        dat.rename(columns={'mean_' + draw_name: draw_name}, inplace=True)
        if "Unnamed: 0" in dat.columns:
            dat.drop("Unnamed: 0", 1, inplace = True)
        pre_rows = dat.shape[0]

        assert dat.shape[0] == pre_rows, "The number of rows changed"


        if draw_name == 'prevalence' and cf_model_type == 'rmodels':
            locs = get_location_metadata(location_set_id = 35)
            locs = pd.concat([locs, locs.path_to_top_parent.str.split(",", expand=True)], axis=1)
            locs = locs[locs[3].notnull()]
            locs['cf_location_id'] = locs[3].astype(int)
            locs = locs[['cf_location_id', 'super_region_id']].drop_duplicates()

            dat.rename(columns={'cf_location_id': 'super_region_id'}, inplace=True)
            dat = dat.merge(locs, how='left', on='super_region_id')
            dat.drop('super_region_id', axis=1, inplace=True)

        corr_list.append(dat)

        del dat


    # merge the per-correction-factor dataframes into one wide frame on the
    # shared id columns
    correction_factors = functools.reduce(
        lambda x, y: pd.merge(x, y, on=id_cols, how='outer'), corr_list)

    if 'sex' in correction_factors.columns:

        correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)


    # switch from age group id to age_start/age_end
    df = hosp_prep.group_id_start_end_switcher(df)


    id_cols = [f + "_id" if f == "sex" else f for f in id_cols]

    pre_shape = df.shape[0]


    # derive the country-level id (cf_location_id) for each row from
    # path_to_top_parent so correction factors keyed by cf_location_id can be
    # merged onto subnational data
    locs = get_location_metadata(location_set_id=35)[['location_id', 'path_to_top_parent']]
    locs = pd.concat([locs, locs.path_to_top_parent.str.split(",", expand=True)], axis=1)
    locs = locs[locs[3].notnull()]
    locs['cf_location_id'] = locs[3].astype(int)
    locs = locs[['cf_location_id', 'location_id']]
    df = df.merge(locs, how='left', on='location_id')



    # merge the correction factors onto the hospital data
    df = df.merge(correction_factors, how='left', on=id_cols)

    assert pre_shape == df.shape[0], ("You unexpectedly added rows while "
        "merging on the correction factors. Don't do that!")


    for col in ['super_region_id', 'model_prediction', 'cf_location_id']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

    # apply each correction factor to the raw mean
    for level in cf_names:
        df["mean_" + level] = df["mean_raw"] * df[level]


    # switch from age_start/age_end back to age_group_id
    df = hosp_prep.group_id_start_end_switcher(df)

    # drop the raw correction factor columns; only the adjusted means are kept
    df.drop(cf_names, axis=1, inplace=True)

    assert set(start_columns).issubset(set(df.columns)), """
        Some columns that were present at the start are missing now"""

    return df
Code Example #9
def add_columns_and_upload(bundle_ids, out_dir, status_dir, in_path):
	""" Description:
			uploads data under each respective bundle ID and NID.

		Args:
			bundle_ids (list)
	"""
	
	# organize modelable entity IDs, bundle IDs, cause IDs, and NID
	IDs = pd.DataFrame({'bundle_id':[285, 286, 287, 288, 289, 290],
						'cause_id':[493, 498, 520, 492, 499, 385],
						'nid':[250478, 250479, 250480, 250481, 250482, 250483],
						'me_id':[2414, 2415, 2416, 2417, 2418, 2419],
						})
			   
	# Find the respective cause ID, NID, and ME ID for the given bundle ID/s.
	IDs = IDs.query('bundle_id in {}'.format(bundle_ids))
	cause_ids = IDs['cause_id'].tolist()
	nids = IDs['nid'].tolist()
	me_ids = IDs['me_id'].tolist()

	if not os.path.exists(out_dir):
		os.makedirs(out_dir)
	if not os.path.exists(status_dir):
		os.makedirs(status_dir)

	# Bring in the new HF proportion inputs.
	new_df = pd.read_csv('{in_path}heart_failure_target_props_subnat.csv'.format(
														in_path=in_path))													

	# Filter out Sub-Saharan Africa.
	super_regions = get_location_metadata(location_set_id=35)[['location_id','super_region_id']]
	new_df = new_df.merge(super_regions, on='location_id', how='inner')
	new_df = new_df.query('super_region_id != 166')
	new_df.drop('super_region_id', axis=1, inplace=True)
	
	count = 0
	for bundle_id, nid, cause_id, me_id in zip(bundle_ids, nids, cause_ids, me_ids):
		
		#### For debugging ####
		fix = True
		#fix = False
		#### For debugging ####
		
		print(cause_id)
		print(nid)
		print(me_id)
		
		if fix:
				
			# Fill in some necessary columns for the merge/replacement of pre-existing input data.
			new_inputs = new_df.query('cause_id == {cause_id}'.format(cause_id=cause_id))
			
			new_inputs.drop('cause_id', axis=1, inplace=True)
			new_inputs['measure_id'] = 18
			new_inputs['nid'] = nid
			new_inputs['bundle_id'] = bundle_id
			new_inputs.rename(columns={'hf_target_prop':'mean', 'std_err_adj':'standard_error'}, inplace=True)
			
			# recode age groups to age range
			q0 = ('SELECT age_group_id, age_group_years_start AS age_start, '
				  'age_group_years_end AS age_end '
				  'FROM shared.age_group')
			age_df = query(q0, conn_def="shared")
			age_df['age_end'] = age_df.apply(age_fix, axis=1)
			new_inputs = new_inputs.merge(age_df, on='age_group_id', how='inner')
			new_inputs.drop('age_group_id', axis=1,
						   inplace=True)
			
			# recode sex IDs to sex names
			new_inputs.rename(columns = {'sex_id':'sex'}, inplace=True)
			sexes = new_inputs['sex']
			sexes = sexes.apply(sex_fix)
			new_inputs['sex'] = sexes
			
			# set years (put year_end as 2015 for merging purposes, recode to 2016)
			new_inputs['year_start'] = 1990
			new_inputs['year_end'] = 2016
				
			# fill in "seqs"
			new_inputs = assign_row_nums(new_inputs, bundle_id, nid, me_id)
			
			# Write the upload sheet as an Excel sheet labeled "extraction" -- this is
			# the required format by the Epi Uploader 
			writer = pd.ExcelWriter('{out_dir}new_inputs_{bundle_id}.xlsx'.format(
																	out_dir=out_dir,
																	bundle_id=bundle_id), engine='xlsxwriter')		   
			new_inputs.to_excel(writer, sheet_name='extraction', index=False, encoding='utf-8')
			writer.save()
			
			print(new_inputs.shape)
			count += 1
			print("{0} main/composite etiology inputs ready to upload to {0} bundle IDs".format(count))
Code Example #10
def assign_row_nums(df, bundle_id, nid, me_id):
	"""Fills in missing seqs in input dataframe

		Args:
			df (object): pandas dataframe object of input data
			engine (object): ihme_databases class instance with dUSERt
				engine set
			me_id (int): modelable_entity_id for the dataset in memory

		Returns:
			Returns a copy of the dataframe with the seqs filled in,
			increment strarting from the max of the database for the given
			modelable_entity.
		"""
	
	# variable used for indicating if Epi database row deletion is necessary
	delete_rows = False
	
	# necessary columns
	needed_cols = ['year_start',
				   'year_end',
				   'age_start',
				   'age_end',
				   'sex',
				   'location_id',
				   'mean',
				   'standard_error',
				   'measure_id',
				   'nid',
				   'bundle_id']

	index_cols = ['location_id',
				  'year_start',
				  'year_end',
				  'age_start',
				  'age_end',
				  'sex',
				  'measure_id',
				  'nid',
				  'bundle_id']
	
	# time stamp for upload metadata
	Time = timestamp()
	
	# Query that pulls the data for the unique bundle ID	
	q = ('''SELECT seq, location_id, year_start, year_end, age_start, age_end, 
	sex_id, measure_id, nid, bundle_id 
	FROM epi.bundle_dismod 
	WHERE bundle_id={bundle_id} AND nid={nid};'''.format(bundle_id=bundle_id,
               	                                         nid=nid))
	
	# execute query
	data = query(q, conn_def="epi")
	
	# get row numbers for all rows including for other NIDS
	all_seqs = data['seq']
	
	# recode sex_id to be sex names
	data.rename(columns = {'sex_id':'sex'}, inplace=True)
	sexes = data['sex']
	sexes = sexes.apply(sex_fix)
	data['sex'] = sexes
	
	# recode year to be current latest year
	data['year_end'] = 2016
	
	# get location names
	locations_df = get_location_metadata(location_set_id=35)[['location_id',
	                                                          'location_name']]
	
	# if the data pulled has zero rows, then make new row numbers
	if len(data) == 0:
		#df['seq'] = range(1,len(df)+ 1)
		df['seq'] = np.nan
		
		# drop all unneeded columns
		df = df[needed_cols + ['seq']]
		#df = df[needed_cols]
		
		# append location names
		df = df.merge(locations_df, on='location_id', how='inner')
	else:
		# make an identifier "new" for the data to check the merge
		df['new'] = 1
		
		# perform an outer merge of the new data on old data on location_id, 
		# and year_start
		df = df.merge(data, on=index_cols, how='outer')
		
		# find all the rows where "seq" is null -- these are rows that need to be inserted
		null_df = df[df['seq'].isnull()]
		
		# find all the rows where "new" is null -- these are rows that need to be deleted
		no_match = df[df['new'].isnull()]
		
		print "LENGTH", len(df) 
		
		# take all the rows where all the index columns matched (the inner merge)
		# these are the rows to be updated
		df = df[(df['seq'].notnull())&(df['mean'].notnull())&(df['new'].notnull())]
		
		# drop all unnecessary columns
		df = df[needed_cols + ['seq']]
		
		print(df.seq.unique())

		# if the merge wasn't perfect (there are unmatched rows), handle them
		print("LENGTH", len(null_df))
		print("LENGTH", len(no_match))
		print("LENGTH", len(df))
		if len(null_df) != 0 or len(no_match) != 0:
			# drop the null row numbers (null "seq").
			null_df = null_df[needed_cols]
			
			# Append location name.
			null_df = null_df.merge(locations_df, on='location_id', how='inner')
			
			# If the number of rows to be deleted is greater than the number of rows that need to be
			# inserted then 
			if len(no_match) > len(null_df):
				# get the row numbers of the rows that need to be deleted, the leftovers rows will be replaced by
				# those to be inserted
				replace_seqs = no_match['seq'].tolist()[:len(null_df)]
				
				for seq in set(replace_seqs):
					assert seq not in df.seq.unique(), "seq {} is a duplicate.".format(seq)
				
				get_rid = no_match.query('seq not in {}'.format(replace_seqs)).copy()
				
				# make all columns of get_rid empty except seq; these rows are
				# flagged for deletion from the Epi database
				blank_cols = [
					'bundle_id', 'nid', 'location_id', 'sex', 'mean',
					'standard_error', 'measure_id', 'year_start', 'year_end',
					'age_start', 'age_end', 'unit_type', 'unit_type_value',
					'measure_issue', 'uncertainty_type',
					'uncertainty_type_value', 'extractor',
					'representative_name', 'urbanicity_type', 'response_rate',
					'sampling_type', 'recall_type', 'recall_type_value',
					'case_name', 'case_definition', 'case_diagnostics',
					'note_modeler', 'cv_hospital', 'cv_marketscan',
					'cv_low_income_hosp', 'cv_high_income_hosp', 'is_outlier',
					'cases', 'measure', 'sample_size', 'effective_sample_size',
					'source_type', 'underlying_nid', 'input_type',
					'design_effect', 'unit_value_as_published', 'date_inserted',
					'last_updated', 'inserted_by', 'last_updated_by', 'upper',
					'lower']
				for col in blank_cols:
					get_rid[col] = np.nan
				
				# flip the "delete rows" indicator to True
				delete_rows = True

			# otherwise the rows to be deleted are replaced until new rows need to be inserted entirely:
			# Make the row numbers blank
			else:
				null_df['seq'] = np.nan
				null_df.reset_index(inplace=True)
				null_df.drop('index', axis=1, inplace=True)
				replace_seqs = no_match['seq'].tolist()
				null_df.loc[0:len(replace_seqs)-1, 'seq'] = replace_seqs
			
			# and append them to those being updated
			df = df.append(null_df)

	# check if row nums assigned properly
	
	print(len(df[df.seq.notnull()]))
	print(len(df[df.seq.isnull()]))
	
	assert not any(df[df.seq.notnull()].seq.duplicated()), "Duplicate row numbers assigned"

	# fill in columns required by the Epi Uploader
	df['unit_type'] = "Person"
	df['unit_type_value'] = 2.0
	df['measure_issue'] = 0.0
	df['uncertainty_type'] = "Standard error"
	#df['uncertainty_type_id'] = 1
	df['uncertainty_type_value'] = np.nan
	df['extractor'] = "USER"
	df['representative_name'] = "Nationally and subnationally representative"
	df['urbanicity_type'] = "Unknown"
	df['response_rate'] = np.nan
	df['sampling_type'] = np.nan
	df['recall_type'] = "Point"
	df['recall_type_value'] = 1.0
	df['case_name'] = np.nan
	df['case_definition'] = np.nan
	df['case_diagnostics'] = np.nan
	df['note_modeler'] = 'Proportion generated from CODEm deaths using Marketscan data'
	df['cv_hospital'] = 0
	df['cv_marketscan'] = 1
	df['cv_low_income_hosp'] = 0
	df['cv_high_income_hosp'] = 0
	df['is_outlier'] = 0
	df['cases'] = np.nan
	df['measure'] = "proportion"
	df['sample_size'] = np.nan
	df['effective_sample_size'] = np.nan
	df['source_type'] = "Mixed or estimation"
	df['underlying_nid'] = np.nan
	df['input_type'] = "extracted"
	df['design_effect'] = np.nan
	df['unit_value_as_published'] = 1
	df['date_inserted'] = Time
	df['last_updated'] = Time
	df['inserted_by'] = "USERNAME"
	df['last_updated_by'] = "USERNAME"
	df['upper'] = np.nan
	df['lower'] = np.nan
	
	# Query the Epi database for modelable entity names
	q1 = '''SELECT modelable_entity_name
			FROM epi.modelable_entity
			WHERE modelable_entity_id={};'''.format(me_id)
	me_name = str(query(q1, conn_def="epi").loc[0,'modelable_entity_name'])
	
	df['modelable_entity_id'] = me_id
	df['modelable_entity_name'] = me_name
	
	# If the "delete rows" indicator is on,
	if delete_rows:
		# then append the rows set up to be deleted: leaving only NID, bundle ID, and seq (row number)
		df = df.append(get_rid)
	
	return df
Code Example #11
def run_master(root_dir,
               envr,
               sweep_lt,
               sweep_yld,
               sweep_hale,
               prep_lt,
               prep_yld,
               calc_hale,
               summarize,
               upload_hale,
               n_draws,
               loc_set_id,
               year_id,
               yld_version,
               local,
               test_location,
               custom_lt,
               log_dir='DIRECTORY'):
    ###############################################
    # Start jobmon and launch different jobs. Also
    # set up directories, and run get_population
    # to cache pop for compile_yld file
    ###############################################
    if not os.path.isdir(log_dir):
        os.mkdir(log_dir)

    if local:
        out_dir = root_dir
    else:
        out_dir = 'DIRECTORY'

    parameter_csv.run_param(envr,
                            yld_version,
                            loc_set_id,
                            year_id,
                            gbd_round_id=GBD_ROUND_ID)

    param_sheet = pd.read_csv('%s/inputs/parameters.csv' % root_dir)
    param_sheet = param_sheet.loc[param_sheet['status'] == 'best']

    hale_version = param_sheet['hale_version'].item()
    mort_version = param_sheet['mort_run'].item()
    print('HALE VERSION IS {}'.format(hale_version))
    print('MORT VERSION IS {}'.format(mort_version))
    print('YLD VERSION IS {}'.format(yld_version))

    prog_dir = '%s/v%s' % (out_dir, hale_version)
    draw_dir = '%s/draws' % prog_dir
    summ_dir = '%s/summaries' % prog_dir

    for direc in [prog_dir, draw_dir, summ_dir]:
        if not os.path.isdir(direc):
            os.mkdir(direc)
        os.chmod(direc, 0o777)

    if custom_lt is not None:
        lt_in = custom_lt
    else:
        lt_in = ("DIRECTORY")

    lt_tmp = '%s/lt' % draw_dir
    lt_dir = '%s/lt' % summ_dir
    yld_tmp = '%s/yld' % draw_dir
    yld_dir = '%s/yld' % summ_dir
    hale_tmp = '%s/results' % draw_dir
    hale_dir = '%s/results' % summ_dir

    sweep([lt_tmp, lt_dir], sweep_lt)
    sweep([yld_tmp, yld_dir], sweep_yld)
    sweep([hale_tmp, hale_dir], sweep_hale)

    err = glob('{}/*.e*'.format(log_dir))
    out = glob('{}/*.o*'.format(log_dir))
    ps = glob('{}/*.p*'.format(log_dir))
    for log in err + out + ps:
        os.remove(log)

    if test_location is not None:
        locations = [test_location]
    else:
        locations = []
        for location_set in loc_set_id:
            location_meta = get_location_metadata(location_set_id=location_set,
                                                  gbd_round_id=GBD_ROUND_ID)
            location_meta = location_meta.loc[
                location_meta['location_id'] != 44620]
            locs = location_meta['location_id'].unique().tolist()
            locations = locations + locs
        locations = list(set(locations))

    year_draws = list(zip(year_id, n_draws))

    d_str = "[%m/%d/%Y %H:%M:%S]"
    wf = Workflow('HALE_{}'.format(datetime.now().strftime(d_str)),
                  project='proj_hale',
                  stderr=log_dir,
                  stdout=log_dir)

    print('Building DAG')
    if prep_lt:
        lt_task = {}
        for location in locations:
            for year, draws in year_draws:
                args = [
                    '--lt_in', lt_in, '--lt_tmp', lt_tmp, '--location',
                    location, '--year', year, '--n_draws', draws
                ]
                script = os.path.join(root_dir, '01_compile_lt.py')
                name = 'lt_{}_{}_prep'.format(location, year)
                lt_task[(location, year)] = PythonTask(script=script,
                                                       args=args,
                                                       name=name,
                                                       slots=4,
                                                       mem_free=8,
                                                       max_attempts=3,
                                                       tag='lt_prep')
                wf.add_task(lt_task[(location, year)])

    if prep_yld:
        population = get_population(location_id=locations,
                                    year_id=year_id,
                                    age_group_id='all',
                                    sex_id='all',
                                    gbd_round_id=GBD_ROUND_ID)
        population.drop('run_id', axis=1, inplace=True)
        population.set_index('location_id', inplace=True)
        population.to_csv('%s/inputs/pop.csv' % root_dir)

        yld_task = {}
        for location in locations:
            for year, draws in year_draws:
                args = [
                    '--yld_tmp', yld_tmp, '--root_dir', root_dir, '--location',
                    location, '--yld_version', yld_version, '--year', year,
                    '--n_draws', draws
                ]
                script = os.path.join(root_dir, '02_compile_yld.py')
                name = 'yld_{}_{}_prep'.format(location, year)
                yld_task[(location, year)] = PythonTask(script=script,
                                                        args=args,
                                                        name=name,
                                                        slots=4,
                                                        mem_free=8,
                                                        max_attempts=3,
                                                        tag='yld_prep')
                wf.add_task(yld_task[(location, year)])

    if calc_hale:
        hale_task = {}
        for location in locations:
            for year in year_id:
                if prep_yld and prep_lt:
                    upstream_tasks = [
                        lt_task[(location, year)], yld_task[(location, year)]
                    ]
                elif prep_yld:
                    upstream_tasks = [yld_task[(location, year)]]
                elif prep_lt:
                    upstream_tasks = [lt_task[(location, year)]]
                else:
                    upstream_tasks = None
                args = [
                    '--hale_tmp', hale_tmp, '--lt_tmp', lt_tmp, '--yld_tmp',
                    yld_tmp, '--location', location, '--year', year
                ]
                script = os.path.join(root_dir, '03_calc_hale.py')
                name = 'hale_{}_{}_calc'.format(location, year)
                hale_task[(location,
                           year)] = PythonTask(script=script,
                                               args=args,
                                               name=name,
                                               slots=4,
                                               mem_free=8,
                                               max_attempts=3,
                                               tag='hale_calc',
                                               upstream_tasks=upstream_tasks)
                wf.add_task(hale_task[(location, year)])

    if summarize:
        summary_task = {}
        for location in locations:
            if calc_hale:
                upstream_tasks = [
                    hale_task[(location, year)] for year in year_id
                ]
            else:
                upstream_tasks = None
            args = [
                '--lt_tmp', lt_tmp, '--lt_dir', lt_dir, '--yld_tmp', yld_tmp,
                '--yld_dir', yld_dir, '--hale_tmp', hale_tmp, '--hale_dir',
                hale_dir, '--location', location
            ]
            script = os.path.join(root_dir, '04_calc_summaries.py')
            name = 'summary_{}_calc'.format(location)
            summary_task[location] = PythonTask(script=script,
                                                args=args,
                                                name=name,
                                                slots=4,
                                                mem_free=8,
                                                max_attempts=3,
                                                tag='summarize',
                                                upstream_tasks=upstream_tasks)
            wf.add_task(summary_task[location])

    if upload_hale:
        if summarize:
            upstream_tasks = [summary_task[loc] for loc in locations]
        else:
            upstream_tasks = None
        args = [
            '--hale_version', hale_version, '--hale_dir', hale_dir, '--envr',
            envr
        ]
        script = os.path.join(root_dir, '05_upload_hale.py')
        name = 'upload_hale'
        upload_task = PythonTask(script=script,
                                 args=args,
                                 name=name,
                                 slots=12,
                                 mem_free=24,
                                 max_attempts=3,
                                 tag='upload',
                                 upstream_tasks=upstream_tasks)
        wf.add_task(upload_task)

    print("executing workflow")
    integer_result = wf.execute()
    if integer_result:
        raise RuntimeError("Workflow failure")
    print("FINISHED")
Code Example #12
File: 00_run_all.py  Project: zhouxm4/ihme-modeling
                          process_timeout=process_timeout,
                          path_to_python_binary=path_to_python_binary,
                          upstream_tasks=upstream_tasks)


# Set up logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Start up DAG
d = datetime.datetime.now()
dag_name = "mort_u5_{}_{}".format(version_id, d.strftime("%Y%m%d%H%M"))
dag = TaskDag(name=dag_name)

# Get locations
location_hierarchy = get_location_metadata(location_set_id=21, gbd_round_id=5)
location_hierarchy = location_hierarchy.loc[
    (location_hierarchy['level'] >= 3)
    & (location_hierarchy['location_id'] != 6)]
ihme_loc_dict = make_ihme_loc_id_dict(location_hierarchy)

all_files = glob.glob((draws_dir + "*").format(version_id))

# Create tasks
u5_tasks = {}
for location_id in location_hierarchy['location_id'].tolist():
    output_file = (draws_dir + "{}.csv").format(version_id, location_id)
    if output_file not in all_files:
        print(output_file)
        ihme_loc_id = ihme_loc_dict[location_id]
        u5_tasks[location_id] = generate_u5_task(location_id, ihme_loc_id,
Code Example #13
    args = vars(parser.parse_args())

    params_dir = args["params_dir"]
    draws_dir = args["draws_dir"]
    interms_dir = args["interms_dir"]
    logs_dir = args["logs_dir"]
else:
    params_dir  = f"{data_root}/{cause}/FILEPATH"
    draws_dir   = f"{data_root}/{cause}/FILEPATH"
    interms_dir = f"{data_root}/{cause}/FILEPATH"
    logs_dir    = f"{data_root}/{cause}/FILEPATH"

### Define Constants
gbd_round_id = 7
decomp_step = "iterative"
loc_h = db.get_location_metadata(35)

### ======================= MAIN EXECUTION ======================= ###

### LOAD DRAWS 
draws_df = pd.read_csv(f'{interms_dir}/FILEPATH')
draw_cols = draws_df.columns[draws_df.columns.str.contains('draw')]


### TRIM 2019 TO ENDEMIC LOCS
eth_subs = loc_h.loc[loc_h.parent_id == 179, ['location_id', 'location_name']]
end_locations = loc_h.loc[loc_h.location_name.isin(['Chad', 'Mali', 'South Sudan']), 
                                                ['location_id', 'location_name']]
end_locations = end_locations.append(eth_subs)
draws_df.loc[(~draws_df.location_id.isin(end_locations.location_id.unique())
              & (draws_df.year_id == 2019)
Code Example #14
def outpatient_elmo(df, gbd_round_id, make_right_inclusive=True):
    """
    Function that prepares data for upload to the epi database.  Adds a lot of
    columns, renames a lot of columns.

    Args:
        df (Pandas DataFrame) contains outpatient data at the bundle level.
        make_right_inclusive: (bool) This switch changes values in the
            'age_demographer' column and the 'age_end' column.

            If True, 'age_demographer' column will be set to 1. age_end will be
            made have values ending in 4s and 9s. For example, these age groups
            would be 5-9, 10-14, ... That means that an age_end is inclusive.
            That is, a value of 9 in age_end means that 9 is included in the
            range.

            If False, then 'age_demographer' will be set to 0 and age_end will
            be right exclusive.  age_end will have values ending in 5s and 0s,
            like 5-10, 10-15, ... That is, a value of 10 in age_end would not
            include 10. It would be ages up to but not including 10.

    Returns:
        Data formatted and ready for uploading to Epi DB.
    """

    if make_right_inclusive:
        assert (df.loc[df.age_end > 1, 'age_end'].values % 5 == 0).all(),\
            """age_end appears not to be a multiple of 5, indicating that
               subtracting 1 is a bad move"""

        df.loc[df.age_end > 1,
               'age_end'] = df.loc[df.age_end > 1, 'age_end'] - 1

        df['age_demographer'] = 1
    else:

        assert (df.loc[df.age_end > 1, 'age_end'].values % 5 != 0).all(),\
            """age_end appears to be a multiple of 5, indicating that
               setting age_demographer to 0 is a bad move."""
        df['age_demographer'] = 0

    df.loc[df.age_end == 1, 'age_demographer'] = 0

    df = df.drop(['source', 'facility_id', 'metric_id'], axis=1)

    df.rename(columns={
        'representative_id': 'representative_name',
        "val_inj_corrected": "cases_inj_corrected",
        'val_corrected': 'cases_corrected',
        'val': 'cases_uncorrected',
        'population': 'sample_size',
        'sex_id': 'sex'
    },
              inplace=True)

    representative_dictionary = {
        -1: "Not Set",
        0: "Unknown",
        1: "Nationally representative only",
        2: "Representative for subnational " + "location only",
        3: "Not representative",
        4: "Nationally and subnationally " + "representative",
        5: "Nationally and urban/rural " + "representative",
        6: "Nationally, subnationally and " + "urban/rural representative",
        7: "Representative for subnational " + "location and below",
        8: "Representative for subnational " + "location and urban/rural",
        9:
        "Representative for subnational " + "location, urban/rural and below",
        10: "Representative of urban areas only",
        11: "Representative of rural areas only"
    }
    df.replace({'representative_name': representative_dictionary},
               inplace=True)

    df['source_type'] = 'Facility - outpatient'
    df['urbanicity_type'] = 'Unknown'
    df['recall_type'] = 'Not Set'
    df['unit_type'] = 'Person'
    df['unit_value_as_published'] = 1
    df['is_outlier'] = 0
    df['sex'].replace([1, 2], ['Male', 'Female'], inplace=True)
    df['measure'].replace(["prev", "inc"], ["prevalence", "incidence"],
                          inplace=True)

    df['mean'] = np.nan
    df['upper'] = np.nan
    df['lower'] = np.nan
    df['seq'] = np.nan
    df['underlying_nid'] = np.nan
    df['sampling_type'] = np.nan
    df['recall_type_value'] = np.nan
    df['uncertainty_type'] = np.nan
    df['uncertainty_type_value'] = np.nan
    df['input_type'] = np.nan
    df['standard_error'] = np.nan
    df['effective_sample_size'] = np.nan
    df['design_effect'] = np.nan
    df['response_rate'] = np.nan
    df['extractor'] = "USERNAME and USERNAME"

    loc_map = get_location_metadata(location_set_id=35,
                                    gbd_round_id=gbd_round_id)
    loc_map = loc_map[['location_id', 'location_name']]
    df = df.merge(loc_map, how='left', on='location_id')

    bundle_name_df = query("SQL", conn_def='epi')

    pre_shape = df.shape[0]
    df = df.merge(bundle_name_df, how="left", on="bundle_id")
    assert df.shape[0] == pre_shape, "added rows in merge"
    assert df.bundle_name.notnull().all().all(), 'bundle name df has nulls'

    print("DONE WITH ELMO")
    return (df)
Code Example #15
import pandas as pd

from db_queries import get_cod_data
from db_queries import get_location_metadata

gbd_year = 2019
needed_years = range(1980,(gbd_year+1)) # years used in dataframe

lm = get_location_metadata(location_set_id=22, gbd_round_id=6)

df = get_cod_data(cause_id='618', gbd_round_id=7, decomp_step='step2') # grab the cod data for other hemog
df = df[df['data_type']=='Vital Registration'] # subset cod data to only include VR sources

df = df.merge(lm, on='location_id', how='left')
df = df[df['developed']==u'1']

df = df[['cause_id', 'location_id', 'year', 'age_group_id', 'sex', 'rate']] #subset relevant columns of data
df = df.groupby(['location_id', 'year', 'age_group_id', 'sex']).mean() #take the mean CSMR across location/year/age/sex combos
df = df.reset_index()

pooled_dfs = [] #make an empty list to fill with dataframes of pooled years

'''
loop through each year in the years list, 
1) define the set of years being pooled to this year
2) grab chunks of the df used for pooling
3) take the mean across these years
4) once pooled, update the entry in the year column
5) add the pooled year dataframe to the list
'''
for y in needed_years:
Code Example #16
File: step04n_lt_run.py  Project: cheth-rowe/ihmexp
def main(ecode, ncode, platform, year, decomp, version, flat_version):
    toc = time.time()

    dems = db.get_demographics(gbd_team="epi", gbd_round_id=help.GBD_ROUND)
    dm_settings = os.path.join(paths.SHARE_DIR, 'dismod_settings')
    version = version.rstrip()
    dm_dir = os.path.join(paths.DATA_DIR, decomp, inj_info.ECODE_PARENT[ecode],
                          str(version), "dismod_ode", ecode)
    metaloc = db.get_location_metadata(location_set_id=35,
                                       gbd_round_id=help.GBD_ROUND)

    filepath = write_path(ecode, ncode, platform, year, decomp, version)
    locations = help.ihme_loc_id_dict(metaloc, dems['location_id'])

    alldata = []
    value_in = os.path.join(dm_dir, "value_in",
                            "value_in_{}_{}.csv".format(ncode, platform))
    draw_in = os.path.join(dm_settings, "draw_in.csv")
    plain_in = os.path.join(dm_settings, "plain_in.csv")
    effect_in = os.path.join(dm_settings, "effect_in.csv")

    v_in = pd.read_csv(value_in)

    num_locs = len(locations)
    loc_pos = 0
    initime = help.start_timer()
    for locn in locations:
        loc_pos = loc_pos + 1

        for sex in [1, 2]:

            start = help.start_timer()

            if float(v_in.loc[v_in['name'] == 'eta_incidence',
                              'value'][0]) == 0:
                result = pd.DataFrame({'age_group_id': dems['age_group_id']})
                result = result.assign(**{d: 0 for d in help.drawcols()})
                result = help.convert_from_age_group_id(result)
            else:
                data_in = os.path.join(
                    dm_dir, "data_in", locations[locn], str(year), str(sex),
                    ecode, "data_in_{}_{}.csv".format(ncode, platform))

                if ncode in inj_info.EMR_NCODES:
                    rate_in_name = "rate_in_emr.csv"
                else:
                    rate_in_name = "rate_in_no_emr.csv"
                rate_in = os.path.join(paths.DATA_DIR, 'flats',
                                       str(flat_version), 'rate_in', str(year),
                                       str(sex), locations[locn], rate_in_name)

                draw_out_dir = os.path.join(dm_dir,
                                            "prev_results", locations[locn],
                                            str(year), str(sex))
                draw_out = os.path.join(
                    draw_out_dir,
                    "prevalence_{}_{}.csv".format(ncode, platform))
                if not os.path.exists(draw_out_dir):
                    try:
                        os.makedirs(draw_out_dir)
                    except OSError as e:
                        if e.errno != os.errno.EEXIST:
                            raise
                        pass

                result = run_model_injuries(draw_in, data_in, value_in,
                                            plain_in, rate_in, effect_in,
                                            draw_out, 1000)

            result['location_id'] = locn
            result['platform'] = platform

            result['year_id'] = year
            result['sex_id'] = sex

            alldata.append(result)
            help.end_timer(start)
            sys.stdout.flush()  # write to log file
        total_time = (time.time() - initime) / 60.

    final = pd.concat(alldata)

    write_results(final, ecode, ncode, platform, year, decomp, version)
    tic = time.time()
Code Example #17
def get_most_detailed(location_set, gbd_round):
    location_df = get_location_metadata(location_set_id=location_set,
                                        gbd_round_id=gbd_round)
    location_df = location_df[location_df['most_detailed'] == 1]
    location_list = location_df['location_id'].tolist()
    return location_list
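A hypothetical call site, assuming the caller wants the most detailed locations for the same location set and GBD round used elsewhere in these examples:

most_detailed_locs = get_most_detailed(location_set=35, gbd_round=6)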
Code Example #18
def main():
    drawdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
               'splits/02_exposure_data/final_forecast')
    filepattern = '*.csv'
    files = glob(os.path.join(drawdir, filepattern))

    me_name_to_meid = {
        'stunting_mild': 10557,
        'stunting_moderate': 10556,
        'stunting_severe': 8949,
        'underweight_mild': 10561,
        'underweight_moderate': 10560,
        'underweight_severe': 2540,
        'wasting_mild': 10559,
        'wasting_moderate': 10558,
        'wasting_severe': 8945
    }

    # Location Metadata
    locs = get_location_metadata(location_set_id=35, gbd_round_id=4)
    locs = locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]
    # Generate locations to keep: 188 + parents
    locs_to_keep = create_locs_to_keep(locs)
    custom_loc_df = locs[locs.location_id.isin(locs_to_keep)]
    # create main custom tree
    print('{} creating custom tree'.format(pretty_now()))
    custom_tree = create_custom_tree(custom_loc_df)
    index_cols = ['age_group_id', 'sex_id', 'year_id', 'location_id']
    data_cols = ['lower', 'mean', 'upper']
    # Create SDI trees
    sdi_locs = get_location_metadata(location_set_id=40, gbd_round_id=4)
    sdi_locs = sdi_locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]
    sdi_ids = [44635, 44634, 44639, 44636, 44637]
    sdi_trees = []
    for _id in sdi_ids:
        print('{} creating sdi tree for {}'.format(pretty_now(), _id))
        thisdf = sdi_locs[sdi_locs.parent_id == _id]
        thisdf = thisdf[thisdf.location_id.isin(locs_to_keep + [_id])]
        sdi_trees.append(create_custom_tree(thisdf))

    # get population
    pops = get_pop()

    for _file in files:
        # Define me_name
        me_name = parse_me_name(_file)
        meid = me_name_to_meid[me_name]
        print('{} processing file: {}'.format(pretty_now(), meid))
        df = pd.read_csv(_file)
        df.rename(columns={
            'worse': 'lower',
            'reference': 'mean',
            'better': 'upper'
        },
                  inplace=True)
        df = df[[
            'location_id', 'age_group_id', 'sex_id', 'year_id', 'lower',
            'mean', 'upper'
        ]]
        df['modelable_entity_id'] = meid
        # Remove bad locations
        bad_locs = [298, 305, 349, 351, 376, 385, 422, 433, 434, 4636, 4749]
        df = df[~df.location_id.isin(bad_locs)]
        # convert to counts
        print('{} convert to counts before aggregation'.format(pretty_now()))
        df = df.merge(pops, on=index_cols, how='left')
        for i in data_cols:
            df[i] = df[i] * df['pop_scaled']
        # aggregate all trees
        print('{} agg custom loc tree'.format(pretty_now()))
        agg_results = agg_hierarchy(custom_tree,
                                    df,
                                    index_cols,
                                    data_cols,
                                    dimension='location_id')
        for sdi_tree in sdi_trees:
            print('{} agg sdi tree for: {}'.format(pretty_now(),
                                                   sdi_tree.root))
            this_agg = agg_hierarchy(sdi_tree,
                                     df,
                                     index_cols,
                                     data_cols,
                                     dimension='location_id')
            this_agg = this_agg[this_agg.location_id.isin(sdi_ids)]
            agg_results = agg_results.append(this_agg)

        # copy data and set metric id to 1 for counts
        print('{} copy counts to new df'.format(pretty_now()))
        agg_counts = agg_results.copy()
        agg_counts = agg_counts[agg_counts.sex_id.isin([1, 2])]
        sex_agg = agg_sexes(agg_counts, pops)
        agg_counts = agg_counts.append(sex_agg)
        agg_counts['metric_id'] = 1
        # convert back to rate space
        print('{} converting back to rate space'.format(pretty_now()))
        agg_results = agg_results.merge(pops, on=index_cols, how='left')
        for i in data_cols:
            agg_results[i] = agg_results[i] / agg_results['pop_scaled']
        agg_results['metric_id'] = 3
        print('{} append counts to rate df'.format(pretty_now()))
        agg_results = agg_results.append(agg_counts)

        outdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_'
                  'age_splits/03_exposure_loc_aggregates/02_forecast_'
                  'prevalence/{}'.format(meid))
        outfile = '{}_prevalence_estimates.csv'.format(meid)
        print('{} saving as csv'.format(pretty_now()))
        agg_results.to_csv(os.path.join(outdir, outfile), index=False)
        print('{} finished processing meid: {}'.format(pretty_now(), meid))
Code Example #19
def match_to_gbd_locations(in_df,
                           location_set_id=21,
                           fuzzy_match=True,
                           fm_top_cutoff=90,
                           fm_dist_cutoff=20):
    print("Beginning direct string matching to GBD locations...")
    # Get location metadata for matching
    gbd_meta = get_location_metadata(location_set_id=location_set_id)
    ## Add some columns to the input dataframe without suffixes
    for col in ['admin1', 'admin2', 'admin3', 'location']:
        in_df["{}__short".format(col)] = in_df[col].apply(remove_suffixes)
    for col in ['location_name', 'location_name_short', 'location_ascii_name']:
        gbd_meta["{}__short".format(col)] = gbd_meta[col].apply(
            remove_suffixes)
    # Build the list of columns that will be matched, in order
    in_df_cols = list()
    meta_cols = list()
    for in_named_col in ['admin1', 'admin2', 'admin3', 'location']:
        for meta_named_col in [
                'location_name', 'location_name_short', 'location_ascii_name'
        ]:
            for suffix1 in ["", "__short"]:
                for suffix2 in ["", "__short"]:
                    in_df_cols.append("{}{}".format(in_named_col, suffix1))
                    meta_cols.append("{}{}".format(meta_named_col, suffix2))
    in_df_cols = in_df_cols + [
        "location_id", "iso", "country", "country", "country"
    ]
    meta_cols = meta_cols + [
        "location_id", "ihme_loc_id", "location_name", "location_name_short",
        "location_ascii_name"
    ]
    ## Iteratively merge, adding only NaN columns on each merge
    # Subset to most detailed for a first run
    meta_most_detailed = gbd_meta.loc[gbd_meta['most_detailed'] == 1, :]
    # FIRST, run only the most detailed locations
    in_df['location_id_matched'] = np.nan
    joined_df = unified_location_column(in_df,
                                        match_df=meta_most_detailed,
                                        location_columns=in_df_cols,
                                        match_columns=meta_cols,
                                        match_column_to_add='location_id',
                                        new_column_name="location_id_matched")
    # NEXT, run on all locations to catch any not-most-detailed location matches
    joined_df = unified_location_column(in_df,
                                        match_df=gbd_meta,
                                        location_columns=in_df_cols,
                                        match_columns=meta_cols,
                                        match_column_to_add='location_id',
                                        new_column_name="location_id_matched")
    # Use the new location data to join on the 'most-detailed' column
    joined_df = pd.merge(
        left=joined_df,
        right=(gbd_meta.loc[:, ['location_id', 'most_detailed']].rename(
            columns={
                'location_id': 'location_id_matched',
                'most_detailed': 'already_located'
            })),
        on='location_id_matched',
        how='left')
    # If fuzzy_match is true, try using fuzzy string matching to match countries
    #  to their subnational locations
    if fuzzy_match:
        print("Beginning fuzzy matching to GBD locations...")
        joined_df = fuzzy_match_subnationals(
            in_df=joined_df,
            loc_metadata=gbd_meta,
            top_score_cutoff=fm_top_cutoff,
            dist_to_second_score_cutoff=fm_dist_cutoff)
    # Cleanup
    joined_df.loc[joined_df['location_id'].isnull(),
                  'location_id'] = joined_df.loc[
                      joined_df['location_id'].isnull(), 'location_id_matched']
    joined_df = joined_df.drop([
        'admin1__short', 'admin2__short', 'admin3__short',
        'location_id_matched'
    ],
                               axis=1)
    return joined_df
Code Example #20
def main(ecode, ncode, platform, version):
    
    start = help.start_timer()
    
    parent = inj_info.ECODE_PARENT[ecode]
    flat_version = versions.get_env(parent, version)
    
    # get demographics
    print("1. Getting demographic, location, and long-term probabilities...")
    dems = db.get_demographics(gbd_team = "epi", gbd_round_id=help.GBD_ROUND)
    metaloc = db.get_location_metadata(location_set_id=35, gbd_round_id=help.GBD_ROUND)
    locations = help.ihme_loc_id_dict(metaloc, dems['location_id'])
    
    # get long-term probabilities that will be used and long-term standardized-mortality ratios
    lt_probs = calculate_measures.long_term_probs_combined(ncode=ncode)
    smr = load_measures.smr(ncode)
    
    # define DisMod ODE input directory
    dm_out_dir = os.path.join("FILEPATH")
    
    # make the sub-directory for data in files:
    folder = os.path.join("FILEPATH")
    if not os.path.exists(folder):
        try:
            os.makedirs(folder)
        except OSError as e:
            if e.errno != os.errno.EEXIST:
                raise
            pass
    
    print("2. Looping through years and sexes to make rate-in and data-in files.")

    value_data = []

    for year in dems["year_id"]:
        for sex in dems["sex_id"]:
            measures = {}
            print('Working on year {} sex {}'.format(year, sex))

            incidence = calculate_measures.long_term_incidence(ecode, version, ncode, platform, year, sex, lt_probs)
            inc_mean = incidence.mean(dim='draw')
            # if the value is less than one in a trillion, set it to 0. Otherwise, DisMod can have an overflow issue
            # where it sets prevalence to 100%
            inc_summary = xr.merge([inc_mean.where(inc_mean > .000000000001, 0).rename('meas_value'),
                                    incidence.std(dim='draw').rename('meas_stdev')])
            measures['incidence'] = inc_summary
            if ncode in inj_info.EMR_NCODES:
                emr = calculate_measures.emr(smr, year, sex, flat_version)
                emr_summary = xr.merge([emr.mean(dim='draw').rename('meas_value'),
                                        emr.std(dim='draw').rename('meas_stdev')])
                measures['mtexcess'] = emr_summary
            
            print('Making data in')
            data = make_data_in(measures, ecode, version, ncode, platform, locations, year, sex)

            value_data.append(data)

            sys.stdout.flush()
                        
    print("Finished making data in files.")
    print("4. Now making the value-in file with the saved data from data in process...")
    
    make_value_in(value_data, ecode, ncode, platform, dm_out_dir)
    
    help.end_timer(start)
Code Example #21
def apply_corrections(df, use_modified):
    """
    Applies the marketscan correction factors to the hospital data at the
    bundle level.  The corrections are merged on by 'age_start', 'sex_id',
    and 'bundle_id'.

    With the new cf uncertainty our process has been updated and this only
    applies to the sources with full care coverage.

    Parameters:
        df: Pandas DataFrame
            Must be aggregated and collapsed to the bundle level.
    """

    assert "bundle_id" in df.columns, "'bundle_id' must exist."
    assert "nonfatal_cause_name" not in df.columns, (
        "df cannot be at the baby ", "sequelae level")

    start_columns = df.columns

    # get a list of files, 1 for each type of CF
    if use_modified:
        corr_files = glob.glob(root + r"{FILEPATH}/mod_*.csv")
        idx = -4
        id_cols = ['age_start', 'sex_id', 'cf_location_id', 'bundle_id']

    else:
        corr_files = glob.glob(root + r"{FILEPATH}/*sm.csv")
        idx = -6
        id_cols = ['age_start', 'sex', 'bundle_id']

    corr_list = []  # to append the CF DFs to
    cf_names = []  # to apply the cfs
    for f in corr_files:
        # pull out the name of the correction type
        draw_name = os.path.basename(f)[:idx]
        if use_modified:
            draw_name = draw_name[4:]
        cf_names.append(draw_name)
        # read in a file
        dat = pd.read_csv(f)
        # rename the mean draw name cols back to just draw name
        if use_modified:
            dat.rename(columns={'mean_' + draw_name: draw_name}, inplace=True)
        if "Unnamed: 0" in dat.columns:
            dat.drop("Unnamed: 0", 1, inplace=True)
        pre_rows = dat.shape[0]

        # only need to take the mean if it's not modeled/modified CF data
        if not use_modified:
            # get the draw col names
            draw_cols = dat.filter(regex=draw_name).columns
            assert len(draw_cols) == 1000, "wrong number of draw cols"

            # create the single mean value from all the draws
            dat[draw_name] = dat[draw_cols].mean(axis=1)
            # drop the draw cols
            dat.drop(draw_cols, axis=1, inplace=True)

        assert dat.shape[0] == pre_rows, "The number of rows changed"
        corr_list.append(dat)

        del dat

    # merge the dataframes in the list together
    correction_factors = functools.reduce(
        lambda x, y: pd.merge(x, y, on=id_cols), corr_list)

    if 'sex' in correction_factors.columns:
        # rename columns to match df
        correction_factors.rename(columns={'sex': 'sex_id'}, inplace=True)

    # switch from age group id to age start/end
    df = hosp_prep.group_id_start_end_switcher(df)

    # switch from sex to sex id in our identifier columns
    id_cols = [f + "_id" if f == "sex" else f for f in id_cols]

    pre_shape = df.shape[0]
    if not use_modified:
        # merge corr factors onto data
        df = df.merge(correction_factors, how='left', on=id_cols)

    if use_modified:
        # merge the country id (aka cf_location_id) onto the data so that the later CF merge works
        locs = get_location_metadata(location_set_id=35)[[
            'location_id', 'path_to_top_parent'
        ]]
        locs = pd.concat(
            [locs, locs.path_to_top_parent.str.split(",", expand=True)],
            axis=1)
        locs = locs[locs[3].notnull()]
        locs['cf_location_id'] = locs[3].astype(int)
        locs = locs[['cf_location_id', 'location_id']]
        df = df.merge(locs, how='left', on='location_id')

        # merge CFs onto hosp data
        df = df.merge(correction_factors, how='left', on=id_cols)

    assert pre_shape == df.shape[0], (
        "You unexpectedly added rows while "
        "merging on the correction factors. Don't do that!")

    # drop unneeded cols
    for col in ['super_region_id', 'model_prediction', 'cf_location_id']:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)

    # apply the mean, smoothed corr factors without the env to covered sources
    full_coverage_sources = ["UK_HOSPITAL_STATISTICS"]

    # apply the corrections.
    for level in cf_names:
        df.loc[df.source.isin(full_coverage_sources), "mean_" + level] = \
            df.loc[df.source.isin(full_coverage_sources), "mean_raw"] *\
            df.loc[df.source.isin(full_coverage_sources), level]

    # switch from age_start and age_end back to age_group_id
    df = hosp_prep.group_id_start_end_switcher(df)

    # drop the CF cols. We'll add them manually later for all sources
    df.drop(cf_names, axis=1, inplace=True)

    assert set(start_columns).issubset(set(df.columns)), """
        Some columns that were present at the start are missing now"""

    return (df)
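Read in isolation, two pieces of the function above are easy to miss: the functools.reduce merge that stitches the per-CF files into one frame, and the path_to_top_parent split that yields a country-level cf_location_id. A minimal, self-contained sketch with illustrative toy values and CF column names (none of the numbers below come from the real correction-factor files):

# Minimal sketch only; column names and values are illustrative.
import functools
import pandas as pd

# toy per-CF frames keyed on the same id columns used above
id_cols = ['age_start', 'sex_id', 'bundle_id']
cf_indv = pd.DataFrame({'age_start': [0, 5], 'sex_id': [1, 1], 'bundle_id': [28, 28],
                        'indv_cf': [0.9, 0.8]})
cf_inc = pd.DataFrame({'age_start': [0, 5], 'sex_id': [1, 1], 'bundle_id': [28, 28],
                       'incidence': [1.4, 1.2]})

# merge every CF frame together on the shared id columns
correction_factors = functools.reduce(
    lambda x, y: pd.merge(x, y, on=id_cols), [cf_indv, cf_inc])

# derive a country-level id from path_to_top_parent: after splitting on commas,
# column 3 holds the country (level 3) for any location at or below that level
locs = pd.DataFrame({'location_id': [4749, 102],
                     'path_to_top_parent': ['1,64,73,95,4749', '1,64,100,102']})
locs = pd.concat([locs, locs.path_to_top_parent.str.split(',', expand=True)], axis=1)
locs = locs[locs[3].notnull()]
locs['cf_location_id'] = locs[3].astype(int)
print(correction_factors)
print(locs[['location_id', 'cf_location_id']])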
Code example #22
        logging.info("Creating draw source and sink.")
        draw_dir = os.path.join(parent_dir, 'aggregated/{}'.format(df_type))
        input_pattern = '{measure_id}_{location_id}_{year_id}.h5'
        source_config = {'draw_dir': draw_dir, 'file_pattern': input_pattern}
        draw_source = DrawSource(source_config)

        output_pattern = '{measure_id}_{location_id}_{year_id}.h5'
        sink_config = {
            'draw_dir': draw_dir,
            'file_pattern': output_pattern,
            'h5_tablename': 'draws'
        }
        draw_sink = DrawSink(sink_config)

        # Apply regional scalar transform
        region_locs = get_location_metadata(gbd_round_id=GBD.GBD_ROUND_ID,
                                            location_set_id=35)
        region_locs = region_locs[region_locs.level == 2].location_id.tolist()
        draw_sink.add_transform(apply_regional_scalars,
                                region_locs=region_locs,
                                parent_dir=parent_dir)
        draw_sink.add_transform(transform_add_measure, measure_id=measure_id)

        # create operator
        logging.info("Reading regional scalars from flatfiles.")
        index_cols = [col for col in index_cols if col != 'location_id']
        operator = Sum(index_cols, draw_cols)

        # Aggregate
        logging.info("Instantiate aggregator.aggregators.AggMemEff.")
        aggregator = AggMemEff(draw_source=draw_source,
                               draw_sink=draw_sink,
Code example #23
df.groupby("year_start").agg({"contacts": "sum", "patients": "sum"}).reset_index()

# check that the diagnosis col is already fully uppercase
assert (df.diagnosis.str.upper() == df.diagnosis).all()

# make sure nulls aren't introduced
county_nulls = df.county.isnull().sum()
# manually adjust the county names to fit the spelling in the IHME location table
df.loc[df.county == "Finnmark", 'county'] = "Finmark"
df.loc[df.county == "Hedmark", 'county'] = "Hedemark"
df.loc[df.county.isin(["Nord-Trondelag", "Sor-Trondelag"]), 'county'] = "Trondelag"

# share of patients with a missing county
df[df.county.isnull()].patients.sum() / float(df.patients.sum())

locs = get_location_metadata(QUERY)
loc_subnats = locs.loc[locs.parent_id == 90, ['location_ascii_name', 'location_id']]
loc_subnats.head(2)
assert set(df.county.unique()) - set(loc_subnats.location_ascii_name.unique()) == set([np.nan])
assert set(loc_subnats.location_ascii_name.unique()) - set(df.county.unique()) == set()

# drop national location id 
df.drop('location_id', axis=1, inplace=True)
df.head(2)

pre = df.shape[0]
df = df.merge(loc_subnats, how='left', left_on='county', right_on='location_ascii_name')
assert pre == df.shape[0]
assert county_nulls == df.county.isnull().sum()

print "shape is {}".format(df.shape)
Code example #24
    the lowest life expectancies among these ten large countries, from XX·X
    (XX·X–XX·X) to XX·X (XX·X–XX·X) years. See appendix 2 (section 3) for
    additional results. 
"""

import xarray as xr
import pandas as pd
import sys

from db_queries import get_location_metadata
from fbd_core.file_interface import FBDPath, open_xr, save_xr
from fbd_core.etl import compute_summaries, expand_dimensions

import settings as sett

LOCS = get_location_metadata(location_set_id=35, gbd_round_id=5)
SUPER_REGS = LOCS[LOCS.level == 1]
NATS = LOCS[LOCS.level == 3]

lex_past_vers = sett.PAST_VERSIONS["lex"].version
lex_past_dir = "/5/past/life_expectancy/"
lex_past_path = FBDPath(lex_past_dir + lex_past_vers)
print(lex_past_vers)

lex_fut_vers = sett.BASELINE_VERSIONS["lex"].version
lex_fut_dir = "/5/future/life_expectancy/"
lex_fut_path = FBDPath(lex_fut_dir + lex_fut_vers)
print(lex_fut_vers)

pop_past_vers = sett.PAST_VERSIONS["population"].version
pop_past_dir = "/5/past/population/"
Code example #25
def setup_for_shiny(df, out_path):
    """
	Description:
		Prepares the final result of the '00_prep_hf_mktscan_parallel.py' for
		a diagnostic visualization.
	Args:
		df (object): pandas dataframe object of input data
		engine (object): ihme_databases class instance with default
			engine set
		me_id (int): modelable_entity_id for the dataset in memory

	Returns:
		Returns a copy of the dataframe with the seqs filled in,
		incrementing, starting from the max of the database for the given
		modelable_entity.
	"""

    # columns necessary for creating/appending the necessary aggregates and for
    # adding columns with metadata useful for diagnostics (e.g. location name)
    index_cols = [
        'hf_target_prop', 'std_err_adj', 'sex_id', 'cause_id', 'age_group_id'
    ]

    # columns used for creating aggregates for the region and super region
    # proportions.
    group_cols = ['sex_id', 'cause_id', 'age_group_id']

    # columns used in the final dataset.
    final_cols = [
        'hf_target_prop', 'std_err_adj', 'location_id', 'location_ascii_name',
        'sex_id', 'cause_id', 'age_group_id', 'age_group_name', 'cause_name'
    ]

    locations = get_location_metadata(location_set_id=35)\
                                                       [['location_id',
                                                         'location_ascii_name']]
    ages = get_ids('age_group')
    causes = get_ids('cause')

    # Exclude composite etiologies for input diagnostics
    df = df.query('cause_id not in (520, 385, 499)')

    # location metadata
    df = df.merge(locations, on='location_id', how='inner')

    # add column with age group names
    df = df.merge(ages, on='age_group_id', how='inner')

    # To make the age progression linear and consecutive recode some of the
    # age_groups.
    df['age_group_id'] = df['age_group_id'].replace(to_replace=28, value=4)
    df.sort_values(by='age_group_id', axis=0, ascending=True, inplace=True)

    # add column with cause names
    df = df.merge(causes, on='cause_id', how='inner')

    # drop unnecessary columns
    df = df[final_cols]

    df.rename(columns={'hf_target_prop': 'proportion'}, inplace=True)
    # the error column is 'std_err_adj' at this point; renaming 'hf_target_prop'
    # a second time would be a no-op after the rename above
    df.rename(columns={'std_err_adj': 'standard_error'}, inplace=True)

    # write the diagnostic input data to csv
    df.to_csv("{}hf_inputs.csv".format(out_path),
              index=False,
              encoding='utf-8')
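A hypothetical usage sketch for the function above; the input file and output directory are placeholders, not paths from the original pipeline.

# Hypothetical usage; the paths below are placeholders.
import pandas as pd

hf_props = pd.read_csv("FILEPATH/hf_target_props.csv")  # prepped output of 00_prep_hf_mktscan_parallel.py
setup_for_shiny(hf_props, out_path="FILEPATH/diagnostics/")  # writes hf_inputs.csv into out_path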
Code example #26
df['representative_id'] = 3  


locs = get_location_metadata(location_set_id=9, gbd_round_id=5)
loc_id = locs.loc[locs.location_name == "Jordan", "location_id"]
loc_id = loc_id.tolist()[0]
df['location_id'] = loc_id
assert (df.location_id == 144).all(),\
    "loc id check failed"  


df['age_group_unit'] = 1
df['source'] = 'JOR_ABHD'


df['code_system_id'] = 2

df['year_start'] = 2016  
df['year_end'] = 2016
Code example #27
                     index=[df2.index.values],
                     aggfunc='first')

# calculate proportion of those who received care for their injury
df3['proportion'] = df3['mean']['0100'] / df3['mean']['0000']

df3['sample_size_both'] = df3['sample_size']['0000']
df3.reset_index(inplace=True)

df4 = df3[['index', 'proportion', 'sample_size_both']]
df4.columns = ['demo', 'data', 'sample_size']

df4[['nid', 'ihme_loc_id', 'age_start',
     'age_end']] = pd.DataFrame(df4['demo'].tolist(), index=df4.index)

locs = db.get_location_metadata(location_set_id=35)
df4 = df4.merge(locs[['ihme_loc_id', 'location_id']])

df4.drop(['demo', 'ihme_loc_id'], axis=1, inplace=True)

# prep additional columns for ST-GPR
df4['measure'] = 'proportion'
df4['is_outlier'] = 0
df4['variance'] = ''
df4['sex_id'] = 3
df4['year_id'] = 2003

# get rid of any implausible proportions
df5 = df4[df4['data'] <= 1]

# apply offset in order to model in logit space
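The snippet is truncated here, so the actual offset is not shown. As a rough sketch only (the offset size is an assumption, not the value used above), a common way to keep proportions strictly inside (0, 1) before a logit transform:

# Rough sketch only; the original offset code is not shown above.
import numpy as np

offset = 1e-4  # assumed value; the real offset used is not shown
df5['data_offset'] = df5['data'].clip(lower=offset, upper=1 - offset)
df5['logit_data'] = np.log(df5['data_offset'] / (1 - df5['data_offset']))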
Code example #28
File: future_pop.py  Project: atheis4/gates_viz
def main():
    popdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    popfile = 'forecast_under_5_pops.csv'

    print('{} read in raw pop data from csv'.format(pretty_now()))
    popdf = pd.read_csv(os.path.join(popdir, popfile))

    age_ranges = ['mean_12_to_23', 'mean_2_to_4']
    age_map = {age_ranges[0]: 238, age_ranges[1]: 34}

    popdf = melt_age_cols(popdf)
    popdf = popdf[popdf.age_group.isin(age_ranges)]
    popdf['age_group_id'] = popdf.age_group.map(age_map)

    for col in ['location_id', 'sex_id', 'age_group_id']:
        popdf[col] = popdf[col].astype(int)

    popdf.rename(columns={'population': 'pop_scaled'}, inplace=True)

    # Location Metadata
    locs = get_location_metadata(location_set_id=35, gbd_round_id=4)
    locs = locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]
    # Generate locations to keep: 188 + parents
    locs_to_keep = create_locs_to_keep(locs)
    custom_loc_df = locs[locs.location_id.isin(locs_to_keep)]

    # create main custom tree
    print('{} creating custom tree'.format(pretty_now()))
    custom_tree = create_custom_tree(custom_loc_df)

    index_cols = ['age_group_id', 'sex_id', 'year_id', 'location_id']
    data_cols = ['pop_scaled']

    # aggregate up standard custom tree
    print('{} aggregate pop from custom tree'.format(pretty_now()))
    aggpop = agg_hierarchy(custom_tree, popdf, index_cols, data_cols,
                           'location_id')

    # SDI locations
    sdi_locs = get_location_metadata(location_set_id=40, gbd_round_id=4)
    sdi_locs = sdi_locs[[
        'location_id', 'parent_id', 'location_name', 'level',
        'location_name_short', 'map_id', 'location_type', 'is_estimate'
    ]]
    sdi_ids = [44635, 44634, 44639, 44636, 44637]

    sdi_df_list = []
    for _id in sdi_ids:
        print('{} processing sdi: {}'.format(pretty_now(), _id))
        thisdf = sdi_locs[sdi_locs.parent_id == _id]
        thisdf = thisdf[thisdf.location_id.isin(locs_to_keep + [_id])]
        thistree = create_custom_tree(thisdf)
        print('{} aggregate pop from {} tree'.format(pretty_now(), _id))
        thisaggpop = agg_hierarchy(thistree, popdf, index_cols, data_cols,
                                   'location_id')
        thisaggpop = thisaggpop[thisaggpop.location_id == _id]
        aggpop = pd.concat([aggpop, thisaggpop])

    sexagg = agg_sexes(aggpop)
    aggpop = pd.concat([aggpop, sexagg])

    outdir = ('/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
              'splits/01_populations')
    outfile = os.path.join(outdir, 'future_pop.h5')

    print('{} output'.format(pretty_now()))
    aggpop.to_hdf(outfile,
                  'data',
                  mode='w',
                  format='table',
                  data_columns=index_cols)
    print('{} fin'.format(pretty_now()))
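Because the output above is written in table format with data_columns set, it can be queried column-wise when read back. A brief sketch (the filter values are illustrative):

# Sketch: reading the table-format HDF back with a filter on the indexed data columns.
import pandas as pd

pops = pd.read_hdf(
    '/FILEPATH_TO/Child Growth Failure/Gates_CGF_Viz/custom_age_'
    'splits/01_populations/future_pop.h5',
    key='data',
    where='age_group_id==238 & sex_id==3')  # illustrative filter
print(pops.head())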
Code example #29
File: squeeze_main.py  Project: zhouxm4/ihme-modeling
	# submit squeeze job for each cause and form a string of job names
	job_string = ''
	for i in [1990, 1995, 2000, 2005, 2010, 2016]: 
		year_id = i
		job_name = "squeeze_{0}_{1}".format(year_id, cause_name)
		job_string = job_string + ',' + job_name
		call = ('qsub -l mem_free=20.0G -pe multi_slot 10'
					' -cwd -P proj_custom_models'
					' -o {FILEPATH}'
					' -e {FILEPATH} -N {4}'
					' cluster_shell.sh squeeze.py \'{0}\' {1} {2} {3}'.format(json.dumps(me_map), out_dir, year_id, cause_name,job_name))
		#print call
		subprocess.call(call, shell=True)

	# get location_metadata for graphing step
	loc_df = get_location_metadata(location_set_id=35, gbd_round_id=4)
	loc_df.to_csv(os.path.join(out_dir, 'graphs', 'location_metadata.csv'), encoding='utf-8')

	# graph
	# need the -hold_jid + job_string flag to hold jobs until the squeezes are done
	# only graphing three estimation years now
	for i in [1990, 2005, 2016]: 
		year_id = i
		#job_string= "no_holds"
		call = ('qsub  -hold_jid {2} -cwd -P proj_custom_models '
					' -o {FILEPATH}'
					' -e {FILEPATH} -N {0}_graph_{1}'
					' r_shell.sh congenital_stacked_bar.R {0} {1}'.format(cause_name, year_id, job_string))
		
		subprocess.call(call, shell=True)
	
Code example #30
def overlay_or_snap_points(point_df,
                           poly_df,
                           location_set_id=21,
                           snap_points=True,
                           update_snapped_points=True):
    '''
    This function takes a geopandas Points GeoDataFrame and Polygons
    GeoDataFrame, then assigns all rows in the Points GeoDataFrame to a single
    polygon in the Polygons GeoDataFrame. It checks for exact overlap, then
    snaps points that fall outside of any polygon (or points that already have 
    identifying information that indicates they do not belong in their current
    polygon).

    Inputs:
      point_df (geopandas GeoDataFrame): The points GeoDataFrame
      poly_df (geopandas GeoDataFrame): The polygons GeoDataFrame
      location_set_id (int): The value of the GBD location set that will be 
        used to determine which most detailed locations align with which (not
        necessarily most detailed) parents
      snap_points (bool): Whether or not to snap points in addition to the overlay
      update_snapped_points (bool): If true, drop the old set of points and 
        update the 'geometry' field of the points gdf to the new, snapped points

    Outputs:
      all_geolocated (geopandas GeoDataFrame): The points GeoDataFrame, where
        a new field "overlay_loc_id" indicates the polygon that the point
        overlaps with or has been snapped to
    '''
    # Input data validation
    assert np.all([
        type(i) is gpd.geodataframe.GeoDataFrame for i in [point_df, poly_df]
    ]), ("The point_df and poly_df"
         " should both be geopandas GeoDataFrames")
    # Copy the original data to allow for in-place changes
    poly_df = poly_df.copy()
    point_df = point_df.copy()
    # Rename the polygon field 'location_id' so it does not overlap with the
    #  points field 'location_id'
    poly_df = poly_df.rename(columns={'location_id': 'overlay_loc_id'})
    # Get location metadata for the known location set
    meta = get_location_metadata(location_set_id=location_set_id)
    # Create a dictionary of the most detailed descendents for each location
    descendents = construct_descendants_dict(meta)
    # Create a field that will be used to validate whether a point has been placed
    #  within a valid geometry
    point_df['known_loc_tag'] = 1
    reference_locations = [int(i) for i in list(descendents.keys())]
    if 'location_id_matched' in point_df.columns:
        point_df.loc[~np.isnan(point_df['location_id_matched']),
                     'known_loc_tag'] = point_df.loc[
                         ~np.isnan(point_df['location_id_matched']),
                         'location_id_matched'].apply(lambda x: 1 if int(
                             x) not in reference_locations else int(x))
    ## Overlay points

    print("* * * * STARTING FIRST OVERLAY * * * * at {}".format(dt.now()))

    all_overlaid = overlay_polygons(points_df=point_df,
                                    polys_df=poly_df,
                                    polys_cols_to_join=['overlay_loc_id'])
    # Check if there were any UIDs on the border that might be duplicated
    border_uids_df = all_overlaid.loc[:, ['uid']]
    border_uids_df['count'] = 1
    border_uids_df = (border_uids_df.groupby(by='uid').sum().reset_index(
        drop=False))
    border_uids = (border_uids_df.loc[border_uids_df['count'] == 2,
                                      'uid'].tolist())
    print("  WARNING: The following UIDs are being duplicated at this stage:")
    print("  {}".format(border_uids))
    print("  These should be assigned beforehand to avoid duplication.\n")

    print("* * * * DONE WITH FIRST OVERLAY * * * * at {}".format(dt.now()))

    # Subset out points that have not been matched to a geography or were
    #  matched to an impossible geometry per the 'valid geometry' field
    # use pd.notnull rather than an identity check against np.nan, which is
    # unreliable for missing values produced by the overlay join
    all_overlaid['good_match'] = all_overlaid.apply(
        lambda row: pd.notnull(row['overlay_loc_id']) and
        (row['overlay_loc_id'] in descendents[row['known_loc_tag']]),
        axis=1)
    # If we don't want to snap points, then return the points here
    if not (snap_points):
        return all_overlaid
    # Otherwise, continue on to snapping
    overlaid_good = all_overlaid.loc[all_overlaid['good_match'], :].copy()
    needs_snapping = all_overlaid.loc[~all_overlaid['good_match'], :].copy()
    print("  {} points need to be snapped.".format(needs_snapping.shape[0]))

    print("* * * * * * * * CHECK DF SIZE * * * * * * * *")
    print("  {} points were good.".format(overlaid_good.shape[0]))
    print("  {} combined.".format(all_overlaid.shape[0]))

    # Iterate through each parent geometry, getting the best fit out of all
    #  descendants of that parent geometry. Afterwards, concatenate the results
    #  from all parents into a single dataframe
    print("* * * * STARTING SNAPPING * * * * at {}".format(dt.now()))

    snapped_sub_dfs = list()
    for parent_loc in needs_snapping['known_loc_tag'].dropna().unique().tolist(
    ):
        possible_snap_polys = poly_df.loc[(
            poly_df['overlay_loc_id'].isin(descendents[int(parent_loc)])), :]
        if len(possible_snap_polys) == 0:
            warnings.warn(
                "All location tagging failed for parent location: {}".format(
                    parent_loc))
            continue
        points_to_snap = needs_snapping.loc[needs_snapping['known_loc_tag'] ==
                                            parent_loc, :]
        snapped_sub = snap_points_to_polys_df(
            needs_snapping=points_to_snap,
            polys_df=possible_snap_polys,
            polys_location_col='overlay_loc_id',
            descendents=descendents)
        snapped_sub_dfs.append(snapped_sub)
    snapped = pd.concat(snapped_sub_dfs)
    # Update with the new, snapped points as the geometry
    snap_geom = [
        sly.geometry.Point(xy)
        for xy in zip(snapped['snapped_lon'], snapped['snapped_lat'])
    ]
    snapped = snapped.drop(labels=['geometry', 'snapped_lon', 'snapped_lat'],
                           axis=1)
    snapped = gpd.GeoDataFrame(snapped, crs={'PASSWORD'}, geometry=snap_geom)
    print("\n* * * * DONE WITH SNAPPING * * * * at {}".format(dt.now()))
    print("\n* * * * * * * * CONFIRM SNAP WORKED * * * * * * * *")
    print("  The snapped df now has {} rows (should be same)".format(
        snapped.shape[0]))
    print("  There are {} rows that still don't have a loc_id.".format(
        snapped.loc[
            snapped['overlay_loc_id'].apply(lambda x: x == ''), :].shape[0]))

    # Snapping will add the column "snap_dist"
    #  to the geodataframe. Make these consistent with the overlaid df and
    #  concatenate
    overlaid_good['snap_dist'] = 0
    all_geolocated = pd.concat([overlaid_good, snapped])

    # Add a field giving the location name that each point is now assigned to
    meta_names = meta.loc[:, ['location_id', 'location_ascii_name']]
    meta_names.rename(columns={
        'location_id': 'overlay_loc_id',
        'location_ascii_name': 'overlay_loc_name'
    },
                      inplace=True)
    all_geolocated = all_geolocated.merge(meta_names,
                                          on="overlay_loc_id",
                                          how='left')
    # Delete the field that was used to determine valid locations for snapping
    all_geolocated.drop(labels=['known_loc_tag'], axis=1, inplace=True)

    print("* * * * * * * * CHECK DF SIZE PRESERVED * * * * * * * *")
    print("  {} rows at the end (should be same as beginning).".format(
        all_geolocated.shape[0]))
    # Return the dataframe
    return (all_geolocated)
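A hypothetical usage sketch for overlay_or_snap_points; the shapefile paths are placeholders, the points layer is assumed to carry a 'uid' column (and optionally 'location_id_matched'), and the polygons layer a 'location_id' column.

# Hypothetical usage; file paths and column contents are placeholders.
import geopandas as gpd

points = gpd.read_file("FILEPATH/survey_points.shp")     # needs a 'uid' column
polys = gpd.read_file("FILEPATH/analysis_polygons.shp")  # needs a 'location_id' column

geolocated = overlay_or_snap_points(points, polys,
                                    location_set_id=21,
                                    snap_points=True)
print(geolocated[['uid', 'overlay_loc_id', 'overlay_loc_name', 'snap_dist']].head())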