def check_envelope(envelope_data, eligible_location_ids, eligible_year_ids,
                   eligible_sex_ids, eligible_age_group_ids):
    """Validate the envelope draws and their ID coverage.

    Two checks are run, in order:

    1. Every value in the draw columns ``env_0`` .. ``env_999`` must be
       greater than or equal to 0.
    2. Every combination of the eligible location, year, age group and sex
       IDs must have at least one row in ``envelope_data``.

    Progress is reported with ``print``. On the first failed check an error
    line is printed and the process is terminated with ``sys.exit(1)``.

    ``envelope_data`` is not modified: the checks work on derived objects
    instead of adding scratch columns to the caller's DataFrame.

    Args:
        envelope_data: DataFrame holding the columns ``location_id``,
            ``year_id``, ``sex_id``, ``age_group_id`` and ``env_0`` ..
            ``env_999``.
        eligible_location_ids: location IDs that must be present.
        eligible_year_ids: year IDs that must be present.
        eligible_sex_ids: sex IDs that must be present.
        eligible_age_group_ids: age group IDs that must be present.

    Raises:
        SystemExit: with code 1 when a draw is negative or an expected ID
            combination is missing.
    """
    id_columns = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    # Make sure data is above 0
    print("Making sure all draws are above or equal to 0")
    data_columns = ['env_{}'.format(x) for x in range(1000)]
    # Row-wise minimum first, then the minimum of those (NaNs are skipped).
    minimum = envelope_data[data_columns].min(axis=1).min()
    print("Minimum envelope value: {}".format(minimum))
    if minimum < 0:
        print("ERROR: Draw/pop values in envelope that are less than 0")
        sys.exit(1)

    # Make sure all unique IDs are present
    print("Checking for unique IDs in envelope")
    # NOTE(review): expand_id_set is defined elsewhere; it is presumed to
    # cross the template with each ID list -- confirm against its definition.
    uid_template = pd.DataFrame(eligible_location_ids, columns=['location_id'])
    uid_template = expand_id_set(uid_template, eligible_year_ids, 'year_id')
    uid_template = expand_id_set(uid_template, eligible_age_group_ids,
                                 'age_group_id')
    uid_template = expand_id_set(uid_template, eligible_sex_ids, 'sex_id')

    # Flag the ID combinations that exist on a private copy of the ID
    # columns; after the left merge, a null flag marks a template row with no
    # matching envelope row.
    present = envelope_data[id_columns].copy()
    present['_check'] = 1
    merged = pd.merge(uid_template, present, on=id_columns, how='left')
    if merged['_check'].isnull().any():
        print("ERROR: Missing unique IDs from envelope")
        sys.exit(1)
    print("No missing unique IDs in envelope")
def check_pred_ex(eligible_location_ids, eligible_year_ids, eligible_sex_ids,
                  eligible_age_group_ids, fail=True):
    """Fetch the pred_ex life table, validate it and return its version id.

    The table is pulled with ``get_life_table(location_set_id=35,
    life_table_parameter_id=6)`` and checked as follows:

    * At least one non-null ``process_version_map_id`` must exist. This is
      always fatal: the error is logged and the process exits with status 1.
    * Exactly one ``process_version_map_id`` must exist.
    * No value in the ``mean`` column may be below 0.
    * Every combination of the eligible location, year, age group and sex
      IDs must have at least one row.

    A failure of any of the last three checks is logged as a warning; the
    process then exits with status 1 only when ``fail`` is true.

    Args:
        eligible_location_ids: location IDs that must be present.
        eligible_year_ids: year IDs that must be present.
        eligible_sex_ids: sex IDs that must be present.
        eligible_age_group_ids: age group IDs that must be present.
        fail: when true (the default), a validation failure terminates the
            process; when false it is only logged.

    Returns:
        int: the first non-null ``process_version_map_id`` in the life table.

    Raises:
        SystemExit: with code 1 when no version id exists, or when another
            check fails and ``fail`` is true.
    """
    logger = logging.getLogger('error_check.check_pred_ex')
    id_columns = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    pred_ex_data = get_life_table(location_set_id=35,
                                  life_table_parameter_id=6)

    # NOTE(review): expand_id_set is defined elsewhere; it is presumed to
    # cross the template with each ID list -- confirm against its definition.
    uid_template = pd.DataFrame(eligible_location_ids, columns=['location_id'])
    uid_template = expand_id_set(uid_template, eligible_year_ids, 'year_id')
    uid_template = expand_id_set(uid_template, eligible_age_group_ids,
                                 'age_group_id')
    uid_template = expand_id_set(uid_template, eligible_sex_ids, 'sex_id')

    # Read the version ids up front, from the unmerged data, so the return
    # value cannot be affected by NaN rows introduced by the left merge below.
    version_ids = pred_ex_data['process_version_map_id'].dropna().unique()

    # all of CoDCorrect will break if there is no version id at all
    if len(version_ids) == 0:
        logger.error('Failed to get life_table version: {}'.format(
            "No version id uploaded for pred_ex"))
        sys.exit(1)

    try:
        # non-yll CoDcorrect will be fine and deaths will still be calculated
        if len(version_ids) != 1:
            raise ValueError("More than one life table version returned")

        # Make sure data is above 0
        logger.info("Making sure all draws are above or equal to 0")
        data_columns = ['mean']
        minimum = pred_ex_data[data_columns].min(axis=1).min()
        logger.info("Minimum pred_ex value: {}".format(minimum))
        if minimum < 0:
            raise ValueError('ERROR: Draw/pop values in pred_ex that are less '
                             'than 0')

        # Make sure all unique IDs are present
        logger.info("Checking for unique IDs in pred_ex")
        # A null flag after the left merge marks a template row with no
        # matching life table row.
        present = pred_ex_data[id_columns].copy()
        present['_check'] = 1
        merged = pd.merge(uid_template, present, on=id_columns, how='left')
        if merged['_check'].isnull().any():
            raise ValueError("ERROR: Missing unique IDs from pred_ex")
        logger.info("No missing unique IDs in pred_ex")
    except (AssertionError, ValueError) as e:
        logger.warning("Failed to validate pred_ex: {}".format(e))
        if fail:
            # Non-zero status so schedulers and callers see the failure.
            sys.exit(1)

    return int(version_ids[0])
def check_pred_ex(pred_ex_data, eligible_location_ids, eligible_year_ids,
                  eligible_sex_ids, eligible_age_group_ids, fail=True):
    """Validate an already-loaded pred_ex life table.

    Two checks are run, in order:

    1. No value in the ``mean`` column may be below 0.
    2. Every combination of the eligible location, year, age group and sex
       IDs must have at least one row in ``pred_ex_data``.

    A failed check is logged as a warning on the
    ``error_check.check_pred_ex`` logger; the process then exits with
    status 1 only when ``fail`` is true.

    ``pred_ex_data`` is not modified: the checks work on derived objects, so
    no scratch columns are added to the caller's DataFrame.

    Args:
        pred_ex_data: DataFrame holding the columns ``location_id``,
            ``year_id``, ``sex_id``, ``age_group_id`` and ``mean``.
        eligible_location_ids: location IDs that must be present.
        eligible_year_ids: year IDs that must be present.
        eligible_sex_ids: sex IDs that must be present.
        eligible_age_group_ids: age group IDs that must be present.
        fail: when true (the default), a validation failure terminates the
            process; when false it is only logged.

    Raises:
        SystemExit: with code 1 when a check fails and ``fail`` is true.
    """
    id_columns = ['location_id', 'year_id', 'sex_id', 'age_group_id']

    # NOTE(review): expand_id_set is defined elsewhere; it is presumed to
    # cross the template with each ID list -- confirm against its definition.
    uid_template = pd.DataFrame(eligible_location_ids, columns=['location_id'])
    uid_template = expand_id_set(uid_template, eligible_year_ids, 'year_id')
    uid_template = expand_id_set(uid_template, eligible_age_group_ids,
                                 'age_group_id')
    uid_template = expand_id_set(uid_template, eligible_sex_ids, 'sex_id')

    logger = logging.getLogger('error_check.check_pred_ex')
    try:
        # non-yll CoDcorrect will be fine and deaths will still be calculated
        # Make sure data is above 0
        logger.info("Making sure all draws are above or equal to 0")
        data_columns = ['mean']
        minimum = pred_ex_data[data_columns].min(axis=1).min()
        logger.info("Minimum pred_ex value: {}".format(minimum))
        if minimum < 0:
            raise ValueError('ERROR: Draw/pop values in pred_ex that are less '
                             'than 0')

        # Make sure all unique IDs are present
        logger.info("Checking for unique IDs in pred_ex")
        # Flag existing ID combinations on a private copy of the ID columns;
        # a null flag after the left merge marks a template row with no
        # matching life table row.
        present = pred_ex_data[id_columns].copy()
        present['_check'] = 1
        merged = pd.merge(uid_template, present, on=id_columns, how='left')
        if merged['_check'].isnull().any():
            raise ValueError("ERROR: Missing unique IDs from pred_ex")
        logger.info("No missing unique IDs in pred_ex")
    except (AssertionError, ValueError) as e:
        logger.warning("Failed to validate pred_ex: {}".format(e))
        if fail:
            sys.exit(1)
# Set the eligible locations, years, sexes, and ages that will appear in the input data eligible_age_group_ids = range(2, 22) eligible_sex_ids = [1, 2] eligible_cause_ids = cause_data.ix[cause_data['level'] > 0, 'cause_id'].tolist() eligible_year_ids = range(1980, 2016) eligible_location_ids = location_data.ix[ location_data['is_estimate'] == 1, 'location_id'].tolist() # Pull Space-Time (Geographic) restrictions spacetime_restrictions = get_spacetime_restrictions() # Create a DataFrame of all eligible cause, age, sex combinations eligible_data = pd.DataFrame(eligible_cause_ids, columns=['cause_id']) eligible_data = expand_id_set(eligible_data, eligible_age_group_ids, 'age_group_id') eligible_data = expand_id_set(eligible_data, eligible_sex_ids, 'sex_id') # Add a restriction variable to the eligible DataFrame to factor in age-sex restrictions of causes eligible_data['restricted'] = True for cause_id in eligible_cause_ids: non_restricted_age_group_ids = get_eligible_age_group_ids( cause_metadata[cause_id]['yll_age_start'], cause_metadata[cause_id]['yll_age_end']) non_restricted_sex_ids = get_eligible_sex_ids( cause_metadata[cause_id]['male'], cause_metadata[cause_id]['female']) eligible_data.ix[(eligible_data['cause_id'] == cause_id) & ( (eligible_data['age_group_id']. isin(non_restricted_age_group_ids)) &
# Read in config variables eligible_year_ids = config['eligible_year_ids'] index_columns = config['index_columns'] index_columns.remove('measure_id') data_columns = config['data_columns'] envelope_index_columns = config['envelope_index_columns'] envelope_column = config['envelope_column'] raw_data_columns = (['model_version_id'] + [envelope_column] + index_columns + data_columns) # Make eligible data for data logging.info("Make eligible data list") eligible_data = eligible_data.loc[ eligible_data['sex_id'] == int(sex_id)] eligible_data = expand_id_set(eligible_data, eligible_year_ids, 'year_id') eligible_data['location_id'] = int(location_id) # Merge on space-time restrictions spacetime_restriction_data['spacetime_restriction'] = True eligible_data = pd.merge(eligible_data, spacetime_restriction_data, on=['location_id', 'year_id', 'cause_id'], how='left') # Apply space-time restrictions eligible_data.loc[eligible_data['spacetime_restriction'] == True, 'restricted'] = True eligible_data = eligible_data.loc[:, ['cause_id', 'age_group_id', 'sex_id', 'restricted', 'level', 'parent_id', 'year_id',