Example No. 1
    def get_parentmapped_garbage(self, df):

        assert 'code_id' in df.columns, \
            "Need a code_id to map to packages, but columns " \
            "were: {}".format(df.columns)

        package_id_to_parent_id = self.cause_package_hierarchy.set_index(
            'package_id', verify_integrity=True)['parent_id']

        value_to_package_id = self.package_map.set_index(
            'value', verify_integrity=True)['package_id']

        df = add_code_metadata(df, 'value', code_map=self.code_map)
        if self.remove_decimal:
            df['value'] = df['value'].str.replace(".", "", regex=False)

        df['package_id'] = df['value'].map(value_to_package_id)
        df['parent_id'] = df['package_id'].map(package_id_to_parent_id)

        # still need to know number of deaths in non-garbage in some
        # applications, e.g. to get the sample size, so keep those rows
        # under parent_id -1
        df.loc[df['package_id'].isnull(), 'parent_id'] = -1
        # keep only the parent package as the cause for aggregation
        df['cause_id'] = df['parent_id']
        df = df.groupby(AGGREGATION_IDX_COLS, as_index=False)['deaths'].sum()
        return df
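
A minimal, self-contained sketch of the two-step map above, using toy stand-ins for self.package_map and self.cause_package_hierarchy (the real lookups come from those attributes):

import pandas as pd

# toy lookups: code value -> package_id, package_id -> parent_id
value_to_package_id = pd.Series({'R99': 1, 'Y34': 2})
package_id_to_parent_id = pd.Series({1: 100, 2: 200})

df = pd.DataFrame({'value': ['R99', 'Y34', 'A00'], 'deaths': [5, 3, 7]})
df['package_id'] = df['value'].map(value_to_package_id)
df['parent_id'] = df['package_id'].map(package_id_to_parent_id)
# 'A00' maps to no package, so it keeps its deaths under parent_id -1
df.loc[df['package_id'].isnull(), 'parent_id'] = -1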
Example No. 2
def check_vr_raw_causes(df):
    """Check for common mistakes in cause formatting for VR data."""
    if len(df.loc[df['data_type_id'].isin([9, 10])]) > 0:
        # only have checks for ICD10 and ICD9 detail data at the moment;
        # to add more checks, remove this if block
        if len(df.loc[df['code_system_id'].isin([1, 6])]) > 0:
            code_system_ids = df['code_system_id'].unique()
            for code_system_id in code_system_ids:
                # reset per code system so a warning doesn't leak across loops
                message = ""
                cs_df = df.query("code_system_id == {}".format(code_system_id))
                cs_df = add_code_metadata(cs_df,
                                          'value',
                                          code_system_id=code_system_id)
                if code_system_id == 6:
                    # check for N codes
                    ncode_df = cs_df.loc[cs_df['value'].str.contains('^[89]')]
                    if len(ncode_df) > 0:
                        message += "!!CONFIRM OR CHANGE TO E CODES!! \nNature of injury"\
                            " codes will be mapped to garbage \n{}".format(ncode_df.head())
                if code_system_id == 1:
                    ucode_df = cs_df.loc[cs_df['value'].str.startswith('U0')]
                    if len(ucode_df) > 0:
                        message += "\nThese codes should only be in US data"\
                            " \n{}".format(ucode_df)
                    stcode_df = cs_df.loc[cs_df['value'].str.contains('^[ST]')]
                    if len(stcode_df) > 0:
                        message += "\nData contain S/T codes that will"\
                            " mostly be mapped to garbage \n{}".format(stcode_df)
                if message != "":
                    warnings.warn(message)
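
A self-contained illustration of what the three string checks above flag, on toy code values:

import pandas as pd

icd9 = pd.Series(['800.1', '950', 'E810'])
print(icd9[icd9.str.contains('^[89]')])     # N codes: 800.1, 950

icd10 = pd.Series(['U07.1', 'S02.0', 'T30', 'I21'])
print(icd10[icd10.str.startswith('U0')])    # US-only codes: U07.1
print(icd10[icd10.str.contains('^[ST]')])   # S/T codes: S02.0, T30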
Example No. 3
def calculate_cc_code(df, env_meta_df, code_map):

    df_cc = df.copy()

    # groupby everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()

    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'],
                              merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    assert df_cc.notnull().values.all()

    # append together
    df = pd.concat([df, df_cc], ignore_index=True)

    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())
    df = df.drop(['mean_env', 'value'], axis=1)

    return df
Example No. 4
def calculate_cc_code(df, env_meta_df, code_map):
    """Calculate total deaths denominator.

    Note: This step is usually done in formatting. Moving this calculation
    after age/sex splitting should return more accurate results for data that
    has a mix of known, detailed age groups and unknown ages.
    """
    df_cc = df.copy()

    # groupby everything except cause + code_id
    group_cols = [
        'location_id', 'year_id', 'sex_id', 'age_group_id', 'nid',
        'extract_type_id', 'site_id'
    ]
    df_cc = df_cc.groupby(group_cols, as_index=False).deaths.sum()

    # merge on envelope
    df_cc = add_envelope(df_cc, env_df=env_meta_df)
    df_cc['value'] = 'cc_code'
    df_cc = add_code_metadata(df_cc, ['code_id'],
                              merge_col='value',
                              code_map=code_map)
    report_if_merge_fail(df_cc, ['code_id'], ['value'])
    df_cc['cause_id'] = 919
    df_cc['deaths'] = df_cc['mean_env'] - df_cc['deaths']
    assert df_cc.notnull().values.all()

    # append together
    df = pd.concat([df, df_cc], ignore_index=True)

    assert np.isclose(df['deaths'].sum(), df.mean_env.sum())
    df = df.drop(['mean_env', 'value'], axis=1)

    return df
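
The cc_code arithmetic in miniature: within one demographic group, the appended row holds the envelope minus the observed deaths, so the group total matches the envelope (919 is the cc_code cause_id used above):

# one demographic group: envelope = 100, mapped deaths = 60 + 25
observed = 60 + 25
mean_env = 100
cc_code_deaths = mean_env - observed   # 15, stored with cause_id 919
assert observed + cc_code_deaths == mean_env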
Example No. 5
    def assert_valid_mappings(self, df, code_system_id):
        """Test that the mapping worked.

        Runs a suite of assertions to make sure that mapping was successful.
        Args:
            df (DataFrame): with at least code_id and cause_id
        Returns:
            None
        Raises:
            AssertionError: Any condition fails
        """
        # add code value from cached code map
        print("Adding value")
        df = add_code_metadata(df, ['value'],
                               code_system_id,
                               force_rerun=False,
                               block_rerun=True,
                               cache_dir=self.cache_dir)
        report_if_merge_fail(df, 'value', 'code_id')
        # get acause from cached cause hierarchy
        print("Adding acause")
        df = add_cause_metadata(df, ['acause'],
                                cause_set_version_id=self.cause_set_version_id,
                                force_rerun=False,
                                block_rerun=True,
                                cache_dir=self.cache_dir)
        report_if_merge_fail(df, 'acause', 'cause_id')

        # Test that all causes starting with 'acause_' are mapped correctly.
        # acause_cvd, for example, should be mapped to 'cvd' (not 'cvd_ihd').
        # 'acause__gc_X59' should be mapped to '_gc', etc.
        print("Checking implied acauses")
        check_df = df.loc[df['value'].str.startswith('acause_')].copy()
        check_df['implied_acause'] = \
            check_df['value'].str.replace('acause_', '', n=1, regex=False)

        check_df.loc[check_df['value'].str.contains("acause__gc"),
                     'implied_acause'] = "_gc"
        bad_df = check_df.loc[check_df['acause'] != check_df['implied_acause']]
        if len(bad_df) > 0:
            bad_stuff = bad_df[['value', 'acause']].drop_duplicates()
            raise AssertionError(
                "These code values do not match their acause: "
                "\n{}".format(bad_stuff))

        print("Checking for bad values")
        # assert incorrect acauses are gone
        bad_acauses = [
            'acause_digest_gastrititis', 'acause_hiv_tb', 'acause_tb_drug'
        ]

        bad_values = df.loc[df['value'].isin(bad_acauses)].value.unique()
        if len(bad_values) > 0:
            raise AssertionError(
                "Found these bad code values in the data: {}".format(
                    bad_values))
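
A sketch of the implied-acause derivation on toy values; only the first 'acause_' prefix is stripped, and garbage codes are special-cased:

import pandas as pd

values = pd.Series(['acause_cvd', 'acause__gc_X59'])
implied = values.str.replace('acause_', '', n=1, regex=False)
# -> ['cvd', '_gc_X59']; garbage codes are then forced to plain '_gc'
implied[values.str.contains('acause__gc')] = '_gc'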
Example No. 6
    def get_computed_dataframe(self, df):
        """Return mapped dataframe."""
        # list of all cause columns
        raw_cause_cols = MCoDMapper.get_code_columns(df)
        df = MCoDMapper.fix_icd_codes(df, raw_cause_cols, self.code_system_id)

        print_log_message("Mapping underlying cause/primary diagnosis")
        cause_map = get_cause_map(code_map_version_id=self.code_map_version_id,
                                  **self.cache_options)
        code_map = MCoDMapper.prep_cause_map(cause_map)
        df['cause_mapped'] = df['cause'].map(code_map)

        print_log_message(
            "Trimming ICD codes and remapping underlying cause/primary diagnosis"
        )
        df = MCoDMapper.trim_and_remap(df, {'cause': 'cause_mapped'}, code_map,
                                       self.code_system_id)
        report_if_merge_fail(df, 'cause_mapped', 'cause')

        # merge on the cause_id for the underlying cause
        df = df.rename(columns={'cause_mapped': 'code_id'})
        df['code_id'] = df['code_id'].astype(int)
        df = add_code_metadata(df,
                               'cause_id',
                               code_map_version_id=self.code_map_version_id,
                               **self.cache_options)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print_log_message("Mapping chain causes")
        # get the special intermediate cause map
        int_cause_map = self.prep_int_cause_map()
        df = MCoDMapper.map_cause_codes(df, int_cause_map, self.int_cause)

        print_log_message("Trimming ICD codes and remapping chain causes")
        int_cause_cols = [x for x in df.columns if self.int_cause in x]
        int_cause_col_dict = MCoDMapper.prep_raw_mapped_cause_dictionary(
            raw_cause_cols, int_cause_cols)
        df = MCoDMapper.trim_and_remap(df, int_cause_col_dict, int_cause_map,
                                       self.code_system_id)

        print_log_message(
            "Identifying rows with intermediate cause of interest")
        df = self.capture_int_cause(df, int_cause_cols)
        if not self.drop_p2:
            df = self.set_part2_flag(df)

        return df
Example No. 7
    def get_computed_dataframe(self, df, code_system_id):
        """Map code id to cause id."""
        # make special cause adjustments
        df = self.special_cause_reassignment(df, code_system_id)
        print_log_message("Merging with cause map")
        # get code metadata from a file already cached
        df = add_code_metadata(df, ['cause_id'],
                               code_system_id,
                               code_map=self.code_map)
        report_if_merge_fail(df, 'cause_id', 'code_id')

        print("Asserting it's all good")
        self.assert_valid_mappings(df, code_system_id)
        df = self.drop_unnecessary_causes(df, self.unnecessary_causes)
        print("Collapsing")
        df = self.collapse_and_sum_by_deaths(df)
        return df
Example No. 8
def assign_code_to_created_target_deaths(df, code_system_id, cause_meta_df):
    created = df[df['_merge'] == 'right_only']
    original = df[df['_merge'] != 'right_only']
    created = add_cause_metadata(created,
                                 'acause',
                                 cause_meta_df=cause_meta_df)
    created['value'] = created['acause'].apply(lambda x: 'acause_' + x)
    created.drop(['code_id', 'acause'], axis=1, inplace=True)
    created = add_code_metadata(created,
                                'code_id',
                                code_system_id=code_system_id,
                                merge_col='value',
                                cache_dir=CONF.get_directory('db_cache'),
                                force_rerun=False,
                                block_rerun=True)
    report_if_merge_fail(created, 'code_id', ['value'])
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    df = pd.concat([original, created])
    df.drop(['_merge', 'value'], axis=1, inplace=True)
    return df
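
The '_merge' column consumed above is evidently produced by an upstream pandas merge with indicator=True; a toy illustration of how 'right_only' isolates the rows a merge created:

import pandas as pd

left = pd.DataFrame({'cause_id': [100]})
right = pd.DataFrame({'cause_id': [100, 200]})
merged = left.merge(right, on='cause_id', how='outer', indicator=True)
created = merged[merged['_merge'] == 'right_only']   # the cause_id 200 row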
Example No. 9
def add_packages(df, code_system_id, remove_decimal, package_dir):
    '''
    Assign map value to garbage based on package
    '''
    df = add_code_metadata(df, ['value'],
                           code_system_id=code_system_id,
                           force_rerun=False,
                           block_rerun=True)
    df['value'] = clean_icd_codes(df['value'], remove_decimal)
    df = assign_packages(df, code_system_id, remove_decimal, package_dir)
    df.drop('value', axis=1, inplace=True)
    assert len(df.loc[(df.cause_id != 743) &
                      (df.map_id.str.contains('_p_', na=False))]) == 0, \
        'Code(s) mapped to both a cause and a package'
    bad_garbage = df.loc[(df.cause_id == 743)
                         & ~(df.map_id.str.contains('_p_', na=False))]
    assert len(bad_garbage) == 0, \
        'Code(s) mapped to garbage but not a package: {}'.format(bad_garbage)

    return df
Example No. 10
    def add_map_ids(self, df):
        '''Assign map value to garbage based on package id.'''
        df = add_code_metadata(df, ['value'],
                               code_map_version_id=self.code_map_version_id,
                               **self.block_rerun)
        df['value'] = clean_icd_codes(df['value'], self.remove_decimal)
        # we do this extra step in downloading packages for ICD10, ICD9_detail
        if self.code_system_id in [1, 6]:
            df = remove_five_plus_digit_icd_codes(
                df, code_system_id=self.code_system_id, trim=True)
        df = self.assign_packages(df)
        # some checks
        garbage_cause_id = (df.cause_id == 743)
        garbage_map_id = (df.map_id.str.contains('_p_', na=False))
        bad_codes = df.loc[~garbage_cause_id & garbage_map_id,
                           ['value', 'map_id', 'cause_id']].drop_duplicates()
        assert len(bad_codes) == 0, \
            'Code(s) mapped to both a cause and a package: {}'.format(bad_codes)
        bad_garbage = df.loc[garbage_cause_id & ~garbage_map_id,
                             ['value', 'map_id']].drop_duplicates()
        assert len(bad_garbage) == 0, \
            'Code(s) mapped to garbage but not a package: {}'.format(bad_garbage)
        df.drop('value', axis=1, inplace=True)
        return df
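
The two consistency masks in isolation: garbage (cause_id 743) must carry a package map_id containing '_p_', and nothing else may; na=False keeps unmapped map_ids out of the package mask:

import pandas as pd

df = pd.DataFrame({'cause_id': [743, 500, 743],
                   'map_id': ['_p_1', '500', None]})
garbage = df.cause_id == 743
packaged = df.map_id.str.contains('_p_', na=False)
print(df.loc[~garbage & packaged])   # causes holding a package: empty
print(df.loc[garbage & ~packaged])   # garbage without a package: the None row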
Example No. 11
    def get_computed_dataframe(self):

        keep_cols = self.df.columns

        if not self.country_needs_correction():
            print_log_message("Country doesn't need hiv correction")
            self.diag_df = None
            return self.df

        print_log_message("Getting rates df")
        rates_df = self.get_rates_df(self.cause_meta_df)
        if self.correct_garbage:
            df = add_code_metadata(self.df,
                                   add_cols=['value'],
                                   code_system_id=self.code_system_id,
                                   force_rerun=False,
                                   block_rerun=True,
                                   cache_dir=self.cache_dir)
            df = self.identify_sepsis_gc(df, self.code_system_id)
            df = self.identify_injury_gc(df, self.code_system_id)
            df = self.identify_hivrd_gc(df, self.code_system_id)
            group_cols = [
                x for x in keep_cols if x not in ['code_id', 'deaths']
            ]
            df_by_code = df.copy()
            df_by_cause = df.groupby(group_cols,
                                     as_index=False)['deaths'].sum()
        else:
            df_by_cause = self.df
        df = add_population(df_by_cause, pop_df=self.pop_df)
        print_log_message("Flagging correct dem groups for "
                          "{0} rows of data".format(len(df)))
        df = flag_correct_dem_groups(df,
                                     self.code_system_id,
                                     self.cause_meta_df,
                                     self.loc_meta_df,
                                     self.age_meta_df,
                                     rates_df,
                                     self.reference_ages,
                                     self.move_gc_age_restrictions,
                                     self.value_cols,
                                     self.pop_col,
                                     self.cause_selections_path,
                                     correct_garbage=self.correct_garbage)
        cause_to_targets_map = self.get_cause_to_targets_map(
            self.cause_meta_df)
        print_log_message("Identifying positive excess")
        df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                      self.reference_ages, self.loc_meta_df,
                                      self.cause_meta_df, self.value_cols,
                                      self.pop_col, self.correct_garbage)
        if self.correct_garbage:
            df = self.calculate_garbage_positive_excess(
                df, df_by_code, group_cols)
            print_log_message("Moving excess to target")
            df = move_excess_to_target(df, self.value_cols,
                                       cause_to_targets_map,
                                       self.correct_garbage)
            computed_df = assign_code_to_created_target_deaths(
                df, self.code_system_id, self.cause_meta_df)
        else:
            print_log_message("Moving excess to target")
            computed_df = move_excess_to_target(df, self.value_cols,
                                                cause_to_targets_map,
                                                self.correct_garbage)
        self.diag_df = computed_df
        return computed_df[keep_cols]
Example No. 12
    def set_restricted_cause(self, df):
        """Run a set of manual replacements, according to expert opinion."""

        # based on first letter of icd code, certain values should be filled in
        mapping_icd10 = {
            'A': 'B99.9',
            'B': 'B99.9',
            'C': 'D49.9',
            'D': 'D49.9',
            'I': 'I99.9',
            'J': 'J98.9',
            'K': 'K92.9',
            'V': 'Y89',
            'Y': 'Y89'
        }

        # add value field
        df = add_code_metadata(df, ['value'], self.code_system_id,
                               **self.standard_cache_options)
        report_if_merge_fail(df, 'value', 'code_id')
        df = df.rename(columns={'value': 'raw_cause'})

        # generate new column called "restricted_cause"
        # ZZZ is the default for all code systems
        raw_causes = self.prep_code_metadata()
        assert "ZZZ" in raw_causes.raw_cause.unique(), \
            "ZZZ must be in the map"
        df['restricted_cause'] = "ZZZ"
        df['restricted_code_id'] = raw_causes.query(
            "raw_cause == 'ZZZ'")["code_id"].values[0]
        df['restricted_cause_id'] = raw_causes.query(
            "raw_cause == 'ZZZ'")["cause_id"].values[0]

        # restrictions if code system is ICD10
        if self.code_system_id == 1:
            for key in mapping_icd10.keys():
                raw_cause = mapping_icd10[key]
                code_list = raw_causes.query(
                    "raw_cause == '{}'".format(raw_cause))
                assert len(code_list) == 1,  \
                    "Expected exactly one code with value {} in code " \
                    "system {}".format(raw_cause, self.code_system_id)
                new_code_id = code_list['code_id'].iloc[0]
                new_cause_id = code_list['cause_id'].iloc[0]
                df.loc[df['raw_cause'].str.startswith(key), [
                    'restricted_cause', 'restricted_code_id',
                    'restricted_cause_id'
                ]] = [raw_cause, new_code_id, new_cause_id]
            # replace restricted_cause = "acause_diarrhea"
            # if inlist(yll_cause,"digest_ibd","digest_vascular")
            # (reset raw_cause here; it still held the last loop value)
            raw_cause = "acause_diarrhea"
            code_list = raw_causes.query('raw_cause == "acause_diarrhea"')
            assert len(code_list) == 1,  \
                "Expected exactly one code with value {} in code " \
                "system {}".format(raw_cause, self.code_system_id)
            new_code_id = code_list['code_id'].iloc[0]
            new_cause_id = code_list['cause_id'].iloc[0]
            # changes for digest_ibd
            df.loc[df['cause_id'] == 532, ['restricted_cause']] = raw_cause
            df.loc[df['cause_id'] == 532, ['restricted_code_id']] = new_code_id
            df.loc[df['cause_id'] == 532,
                   ['restricted_cause_id']] = new_cause_id
            # changes for digest_vascular
            df.loc[df['cause_id'] == 533, ['restricted_cause']] = raw_cause
            df.loc[df['cause_id'] == 533, ['restricted_code_id']] = new_code_id
            df.loc[df['cause_id'] == 533,
                   ['restricted_cause_id']] = new_cause_id
        # restrictions if code system is ICD9
        if self.code_system_id == 6:
            df['numeric_cause'] = pd.to_numeric(df['raw_cause'],
                                                errors='coerce')

            # 0-140 to 139.8
            new_code_id = raw_causes.query(
                "raw_cause == '139.8'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '139.8'")["cause_id"].values[0]
            df.loc[(df.numeric_cause >= 1) & (df.numeric_cause < 140), [
                'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
            ]] = "139.8", new_code_id, new_cause_id

            # replace restricted_cause = "239.9" if numeric_cause >= 140
            # & numeric_cause < 240
            new_code_id = raw_causes.query(
                "raw_cause == '239.9'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '239.9'")["cause_id"].values[0]
            df.loc[(df.numeric_cause >= 140) & (df.numeric_cause < 240), [
                'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
            ]] = "239.9", new_code_id, new_cause_id

            # replace restricted_cause = "459.9" if numeric_cause >= 390
            # & numeric_cause < 460
            new_code_id = raw_causes.query(
                "raw_cause == '459.9'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '459.9'")["cause_id"].values[0]
            df.loc[(df.numeric_cause >= 390) & (df.numeric_cause < 460), [
                'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
            ]] = "459.9", new_code_id, new_cause_id

            # replace restricted_cause = "519.9" if numeric_cause >= 460
            # & numeric_cause < 520
            new_code_id = raw_causes.query(
                "raw_cause == '519.9'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '519.9'")["cause_id"].values[0]
            df.loc[(df.numeric_cause >= 460) & (df.numeric_cause < 520), [
                'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
            ]] = "519.9", new_code_id, new_cause_id

            # replace restricted_cause = "578" if numeric_cause >= 520
            # & numeric_cause < 580
            new_code_id = raw_causes.query(
                "raw_cause == '578'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '578'")["cause_id"].values[0]
            df.loc[(df.numeric_cause >= 520) & (df.numeric_cause < 580), [
                'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
            ]] = "578", new_code_id, new_cause_id

            # replace restricted_cause = "E989" if substr(cause,1,1) == "E"
            new_code_id = raw_causes.query(
                "raw_cause == 'E989'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == 'E989'")["cause_id"].values[0]
            df.loc[df['raw_cause'].str.startswith("E"), [
                'restricted_cause', 'restricted_code_id', 'restricted_cause_id'
            ]] = "E989", new_code_id, new_cause_id
        assert pd.notnull(df.restricted_code_id).all()
        assert pd.notnull(df.restricted_cause_id).all()
        return df
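
The chained ICD9 range masks could equivalently be driven by pd.cut; a sketch with the bin edges and target values copied from the code above (the uncovered 240-390 band keeps the ZZZ default, as do unparseable codes):

import pandas as pd

numeric_cause = pd.to_numeric(pd.Series(['002.1', '401', '530.0', 'E950']),
                              errors='coerce')
bins = [1, 140, 240, 390, 460, 520, 580]
labels = ['139.8', '239.9', 'ZZZ', '459.9', '519.9', '578']
restricted = pd.cut(numeric_cause, bins=bins, labels=labels, right=False)
# -> ['139.8', '459.9', '578', NaN]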
Example No. 13
    def get_computed_dataframe(self):
        """Main method to execute computations and return result.

        Notes:
        UNDECIDED HOW TO DO THIS WITHOUT ALL YEARS IN MEMORY LIKE STATA HAD

        Potential solutions:
        1. Don't do this at all, just correct ANY cause-age-sex-location-year
            that exceeds the global reference rate
              - this would potentially change results slightly, but does not
                seem unreasonable, and in fact seems more correct

        2. Prime HIV correction by assembling the list ahead of time
              - might take a long time and need to be rerun every time, which
                would essentially double the required time for this step
              - advantage is that it mimics last year's results without needing
                any additional years of data
              - could eliminate some of the problems with this method by
                running it very infrequently instead of every time
                the data changes

        3. Take a 'source' argument in the class and pull the other data that
            we pulled last year to pool years necessary to generate this list

        4. Run HIV correction with all the data for a 'source' altogether, like
            the stata code did, but still update versions based on nid-year

        FOR NOW: Follow method 1 and expect to test the similarity later
        """
        keep_cols = self.df.columns

        if not self.country_needs_correction():
            print_log_message("Country doesn't need hiv correction")
            self.diag_df = None
            return self.df

        print_log_message("Getting rates df")
        rates_df = self.get_rates_df(self.cause_meta_df)
        if self.correct_garbage:
            df = add_code_metadata(self.df,
                                   add_cols=['value'],
                                   code_system_id=self.code_system_id,
                                   force_rerun=False,
                                   block_rerun=True,
                                   cache_dir=self.cache_dir)
            df = self.identify_sepsis_gc(df, self.code_system_id)
            df = self.identify_injury_gc(df, self.code_system_id)
            df = self.identify_hivrd_gc(df, self.code_system_id)
            # do a groupby to collapse down to cause_id level for next steps
            group_cols = [
                x for x in keep_cols if x not in ['code_id', 'deaths']
            ]
            df_by_code = df.copy()
            df_by_cause = df.groupby(group_cols,
                                     as_index=False)['deaths'].sum()
        else:
            df_by_cause = self.df
        df = add_population(df_by_cause, pop_df=self.pop_df)
        print_log_message("Flagging correct dem groups for "
                          "{0} rows of data".format(len(df)))
        df = flag_correct_dem_groups(df,
                                     self.code_system_id,
                                     self.cause_meta_df,
                                     self.loc_meta_df,
                                     self.age_meta_df,
                                     rates_df,
                                     self.reference_ages,
                                     self.move_gc_age_restrictions,
                                     self.value_cols,
                                     self.pop_col,
                                     self.cause_selections_path,
                                     correct_garbage=self.correct_garbage)
        cause_to_targets_map = self.get_cause_to_targets_map(
            self.cause_meta_df)
        print_log_message("Identifying positive excess")
        df = identify_positive_excess(df, rates_df, cause_to_targets_map,
                                      self.reference_ages, self.loc_meta_df,
                                      self.cause_meta_df, self.value_cols,
                                      self.pop_col, self.correct_garbage)
        if self.correct_garbage:
            df = self.calculate_garbage_positive_excess(
                df, df_by_code, group_cols)
            print_log_message("Moving excess to target")
            df = move_excess_to_target(df, self.value_cols,
                                       cause_to_targets_map,
                                       self.correct_garbage)
            computed_df = assign_code_to_created_target_deaths(
                df, self.code_system_id, self.cause_meta_df)
        else:
            print_log_message("Moving excess to target")
            computed_df = move_excess_to_target(df, self.value_cols,
                                                cause_to_targets_map,
                                                self.correct_garbage)
        self.diag_df = computed_df
        return computed_df[keep_cols]
Example No. 14
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):

    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)

        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)

    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")

    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)

    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        wait('claude_redistributionworker_{}'.format(nid), 30)
        print_log_message("Done waiting. Appending them together")
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)

    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
Example No. 15
    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        Args:
            df (DataFrame): data with cause

        Returns:
            DataFrame: with any modifications
        """

        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }
        # Some SRS codes get redistributed differently than
        # other ICD10 datasets
        df = add_nid_metadata(df, 'source', **cache_args)

        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(df,
                                   'value',
                                   code_system_id=code_system_id,
                                   **cache_args)

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups"))
            custom_grbg = custom_grbg.query('active == 1')
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(custom_grbg,
                                            'code_id',
                                            code_system_id=code_system_id,
                                            merge_col='value',
                                            **cache_args)
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                gp_df = get_garbage_from_package(code_system_id,
                                                 package_id,
                                                 package_arg_type="package_id")
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        # hotfix for China CDC 2008 data (nid 270005, extract_type_id 2):
        # recode the five-digit code_id 13243 to code_id 13242
        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)

        five_dig_code = df['code_id'] == 13243
        df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242

        return df
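
The merge-then-override pattern used above, in isolation: left-merge the replacement codes, take new_code_id where present, and keep the original code_id otherwise (fillna is an equivalent spelling of the .loc mask; toy frames):

import pandas as pd

df = pd.DataFrame({'value': ['X59', 'A00'], 'code_id': [10, 20]})
remap = pd.DataFrame({'value': ['X59'], 'new_code_id': [99]})
df = df.merge(remap, how='left', on='value')
df['code_id'] = df['new_code_id'].fillna(df['code_id']).astype(int)
df = df.drop('new_code_id', axis=1)   # code_id is now [99, 20]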
Example No. 16
def run_phase(df,
              csvid,
              nid,
              extract_type_id,
              lsvid,
              pop_run_id,
              cmvid,
              launch_set_id,
              remove_decimal,
              write_diagnostics=True):
    """String together processes for redistribution."""

    # what to do about caching throughout the phase
    read_file_cache_options = {
        'block_rerun': True,
        'cache_dir': CACHE_DIR,
        'force_rerun': False,
        'cache_results': False
    }

    # the iso3 of this data
    iso3 = get_value_from_nid(nid,
                              'iso3',
                              extract_type_id=extract_type_id,
                              location_set_version_id=lsvid)

    # the code system id
    code_system_id = int(
        get_value_from_nid(nid,
                           'code_system_id',
                           extract_type_id=extract_type_id))

    # the data type
    data_type_id = get_value_from_nid(nid,
                                      'data_type_id',
                                      extract_type_id=extract_type_id)

    # cause map
    cause_map = get_cause_map(code_map_version_id=cmvid,
                              **read_file_cache_options)

    orig_deaths_sum = int(df['deaths'].sum())

    if remove_decimal:
        print_log_message("Removing decimal from code map")
        cause_map['value'] = cause_map['value'].apply(
            lambda x: x.replace(".", ""))

    if needs_garbage_correction(iso3, data_type_id):
        print_log_message("Correcting Garbage for {}".format(iso3))
        orig_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())

        cause_meta_df = get_current_cause_hierarchy(cause_set_version_id=csvid,
                                                    **read_file_cache_options)

        # get age group ids
        age_meta_df = get_ages(**read_file_cache_options)

        loc_meta_df = get_current_location_hierarchy(
            location_set_version_id=lsvid, **read_file_cache_options)

        pop_meta_df = get_pop(pop_run_id=pop_run_id, **read_file_cache_options)
        # Move garbage to hiv first
        hiv_corrector = HIVCorrector(df,
                                     iso3,
                                     code_system_id,
                                     pop_meta_df,
                                     cause_meta_df,
                                     loc_meta_df,
                                     age_meta_df,
                                     correct_garbage=True)
        df = hiv_corrector.get_computed_dataframe()
        after_gc_sum = int(df.query('cause_id == 743')['deaths'].sum())
        after_deaths_sum = int(df['deaths'].sum())
        print_log_message("""
            Stage [gc deaths / total deaths]
            Before GC correction [{gco} / {to}]
            After GC correction [{gca} / {ta}]
        """.format(gco=orig_gc_sum,
                   to=orig_deaths_sum,
                   gca=after_gc_sum,
                   ta=after_deaths_sum))

    df = add_code_metadata(df, ['value', 'code_system_id'],
                           code_map=cause_map,
                           **read_file_cache_options)
    # recognizing that it is weird for code_system_id to come from two places,
    # make sure they are consistent
    assert (df['code_system_id'] == code_system_id).all(), "Variable code " \
        "system id {} did not agree with all values of df code " \
        "system id: \n{}".format(
            code_system_id, df.loc[df['code_system_id'] != code_system_id])

    print_log_message("Formatting data for redistribution")
    # do we have all the packages we need?
    # verify_packages(df)
    # format age groups to match package parameters
    df = format_age_groups(df)
    # drop observations with 0 deaths
    df = drop_zero_deaths(df)
    # merge on redistribution location hierarchy
    df = add_rd_locations(df, lsvid)
    # fill in any missing stuff that may have come from rd hierarchy
    df = fill_missing_df(df, verify_all=True)
    # create split groups

    # NO SPLIT GROUP NEEDED
    df = add_split_group_id_column(df)

    # final check to make sure we have all the necessary columns
    df = format_columns_for_rd(df, code_system_id)

    split_groups = list(df.split_group.unique())
    parallel = len(split_groups) > 1

    print_log_message("Submitting/Running split groups")
    for split_group in split_groups:
        # remove intermediate files from previous run
        delete_split_group_output(nid, extract_type_id, split_group)
        # save to file
        split_df = df.loc[df['split_group'] == split_group]
        write_split_group_input(split_df, nid, extract_type_id, split_group)
        # submit jobs or just run them here
        if parallel:
            submit_split_group(nid, extract_type_id, split_group,
                               code_system_id, launch_set_id)
        else:
            worker_main(nid, extract_type_id, split_group, code_system_id)
    if parallel:
        print_log_message("Waiting for splits to complete...")
        # wait until all jobs for a given nid have completed
        # eventually need logic for files not being present
        wait('claude_redistributionworker_{}'.format(nid), 30)
        # This seems to be necessary to wait for files
        print_log_message("Done waiting. Appending them together")
    # append split groups together
    df = read_append_split_groups(split_groups, nid, extract_type_id,
                                  cause_map)

    print_log_message("Done appending files - {} rows assembled".format(
        len(df)))
    df = revert_variables(df)

    after_deaths_sum = int(df['deaths'].sum())
    before_after_text = """
        Before GC redistribution: {a}
        After GC redistribution: {b}
    """.format(a=orig_deaths_sum, b=after_deaths_sum)
    diff = abs(orig_deaths_sum - after_deaths_sum)
    # bad if totals change by 2% or 5 deaths, whichever is greater
    # (somewhat arbitrary, just trying to avoid annoying/non-issue failures)
    diff_threshold = max(.02 * orig_deaths_sum, 5)
    if not diff < diff_threshold:
        raise AssertionError("Deaths not close.\n" + before_after_text)
    else:
        print_log_message(before_after_text)

    return df
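
The closing sanity check as a standalone helper; the tolerance (2% of the original total or 5 deaths, whichever is greater) is copied from the code above:

def deaths_are_close(before, after):
    """True if redistribution moved the total by less than max(2%, 5)."""
    return abs(before - after) < max(.02 * before, 5)

assert deaths_are_close(1000, 1012)       # 12 < 20
assert not deaths_are_close(1000, 1030)   # 30 >= 20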
Example No. 17
    def set_restricted_cause(self, df):
        mapping_icd10 = {'A': 'B99.9', 'B': 'B99.9', 'C': 'D49.9',
                         'D': 'D49.9', 'I': 'I99.9', 'J': 'J98.9',
                         'K': 'K92.9', 'V': 'Y89', 'Y': 'Y89'}

        df = add_code_metadata(
            df, ['value'], self.code_system_id,
            **self.standard_cache_options
        )
        report_if_merge_fail(df, 'value', 'code_id')
        df = df.rename(columns={'value': 'raw_cause'})

        raw_causes = self.prep_code_metadata()
        assert "ZZZ" in raw_causes.raw_cause.unique(), \
            "ZZZ must be in the map"
        df['restricted_cause'] = "ZZZ"
        df['restricted_code_id'] = raw_causes.query(
            "raw_cause == 'ZZZ'")["code_id"].values[0]
        df['restricted_cause_id'] = raw_causes.query(
            "raw_cause == 'ZZZ'")["cause_id"].values[0]


        if self.code_system_id == 1:
            for key in mapping_icd10.keys():
                raw_cause = mapping_icd10[key]
                code_list = raw_causes.query(
                    "raw_cause == '{}'".format(raw_cause))
                assert len(code_list) == 1,  \
                    "Expected exactly one code with value {} in code " \
                    "system {}".format(raw_cause, self.code_system_id)
                new_code_id = code_list['code_id'].iloc[0]
                new_cause_id = code_list['cause_id'].iloc[0]
                df.loc[df['raw_cause'].str.startswith(key),
                       ['restricted_cause', 'restricted_code_id',
                        'restricted_cause_id']] = [raw_cause,
                                                   new_code_id, new_cause_id]

            # reset raw_cause; it still held the last value from the loop above
            raw_cause = "acause_diarrhea"
            code_list = raw_causes.query('raw_cause == "acause_diarrhea"')
            assert len(code_list) == 1,  \
                "Expected exactly one code with value {} in code " \
                "system {}".format(raw_cause, self.code_system_id)
            new_code_id = code_list['code_id'].iloc[0]
            new_cause_id = code_list['cause_id'].iloc[0]
            df.loc[df['cause_id'] == 532,
                   ['restricted_cause']] = raw_cause
            df.loc[df['cause_id'] == 532,
                   ['restricted_code_id']] = new_code_id
            df.loc[df['cause_id'] == 532,
                   ['restricted_cause_id']] = new_cause_id
            df.loc[df['cause_id'] == 533,
                   ['restricted_cause']] = raw_cause
            df.loc[df['cause_id'] == 533,
                   ['restricted_code_id']] = new_code_id
            df.loc[df['cause_id'] == 533,
                   ['restricted_cause_id']] = new_cause_id
        if self.code_system_id == 6:
            df['numeric_cause'] = pd.to_numeric(
                df['raw_cause'], errors='coerce')

            new_code_id = raw_causes.query(
                "raw_cause == '139.8'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '139.8'")["cause_id"].values[0]
            df.loc[
                (df.numeric_cause >= 1) & (df.numeric_cause < 140),
                ['restricted_cause', 'restricted_code_id',
                 'restricted_cause_id']
            ] = "139.8", new_code_id, new_cause_id

            new_code_id = raw_causes.query(
                "raw_cause == '239.9'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '239.9'")["cause_id"].values[0]
            df.loc[
                (df.numeric_cause >= 140) & (df.numeric_cause < 240),
                ['restricted_cause', 'restricted_code_id',
                 'restricted_cause_id']
            ] = "239.9", new_code_id, new_cause_id

            new_code_id = raw_causes.query(
                "raw_cause == '459.9'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '459.9'")["cause_id"].values[0]
            df.loc[
                (df.numeric_cause >= 390) & (df.numeric_cause < 460),
                ['restricted_cause', 'restricted_code_id',
                 'restricted_cause_id']
            ] = "459.9", new_code_id, new_cause_id

            new_code_id = raw_causes.query(
                "raw_cause == '519.9'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '519.9'")["cause_id"].values[0]
            df.loc[
                (df.numeric_cause >= 460) & (df.numeric_cause < 520),
                ['restricted_cause', 'restricted_code_id',
                 'restricted_cause_id']
            ] = "519.9", new_code_id, new_cause_id

            new_code_id = raw_causes.query(
                "raw_cause == '578'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == '578'")["cause_id"].values[0]
            df.loc[(df.numeric_cause >= 520) & (df.numeric_cause < 580),
                   ['restricted_cause', 'restricted_code_id',
                    'restricted_cause_id']] = "578", new_code_id, new_cause_id

            new_code_id = raw_causes.query(
                "raw_cause == 'E989'")["code_id"].values[0]
            new_cause_id = raw_causes.query(
                "raw_cause == 'E989'")["cause_id"].values[0]
            df.loc[df['raw_cause'].str.startswith("E"),
                   ['restricted_cause', 'restricted_code_id',
                    'restricted_cause_id']] = "E989", new_code_id, new_cause_id
        assert pd.notnull(df.restricted_code_id).all()
        assert pd.notnull(df.restricted_cause_id).all()
        return df
Example No. 18
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=False,
                        refresh_cache=True,
                        check_ages=True):
    """Finalize the formatting of the source and optionally write it out.

    Decides whether to map code_id based on whether code_id is already a
        column in the dataset.

    Needs the following information from either the df values or from the
        nid_meta_vals dict:

            data_type_id
            representative_id

        All of the above must have only one value per nid in df.

    Maps site_id to the data based on incoming 'site' column. Will upload
        any sites that are not in the cod.site table already.

    Arguments:
        df, pandas.DataFrame: The dataframe with near-formatted data
        source, str: The source this df is (should be the whole source and
            nothing but the source). Will break if there is no source in
            FILEPATH with this name, and you should pass the
            source without a leading underscore even if it is that way
            in J
        write, bool: whether to write the outputs
        extract_type, str: The manner in which the nid was extracted. If
            left as None, will be induced by the location_type_id of
            the location_id with the maximum level in the dataset. This should
            be overridden in cases like China DSP, where the same locations
            are used in two extraction types - "DSP + VR" and "DSP"; China DSP
            then gets two extraction types: "admin1" and
            "admin1: DSP sites only" (in the particular instance of DSP,
            extract type is built into this code. Feel free to add other
            source-extract type mappings here to force consistency.)
        check_ages, bool: Whether or not to enforce age group checks such as
            ensuring no overlaps or gaps. This can be turned off because sometimes
            raw data reports overlapping age groups (e.g. Palestine data has Gaza Strip and West
            Bank data with different age groupings).

    Returns:
        Every local value in the function (via locals()).
        Why? There are multiple df outputs, and formatting is an involved
        process, so it's helpful to be able to inspect everything.
    """
    # set column groups, and verify that we have everything we need
    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active', 'is_mort_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    assert len(missing_cols) == 0, \
        "Required formatting columns not found in df: \n{}".format(missing_cols)

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
        # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM engine_room.maps_code
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='ADDRESS')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        assert len(bad_codes) == 0, "Found code ids in data that don't exist in code "\
                                    "systems {}: {}".format(code_system_ids, bad_codes)
    check_vr_raw_causes(df)

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)
    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    # sorry for putting this here
    # drop these loc/years b/c env < deaths creating negative cc_code
    # maybe re run w/ another envelope?
    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    ################################################
    # keep all 0s now, messing up for NR in non-VR
    # df['val_sum_tmp'] = df[VALUE_COLS].sum(axis=1)
    # all-cause extractions want to keep zeroes
    # keep_zeroes = df['extract_type_id'] == ALL_CAUSE_EXTRACT_ID
    # otherwise, drop them
    # greater_than_zero = df['val_sum_tmp'] > 0
    # df = df[greater_than_zero | keep_zeroes]
    # df = df.drop('val_sum_tmp', axis=1)
    ################################################

    # CHECKS FOR FORMATTED PHASE OUTPUT
    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # TESTS FOR CHECKING AGE GROUP IDS
    if check_ages:
        check_age_groups(df)

    # MORE TESTS FOR DEATHS - MAYBE THAT THEY AREN'T MORE THAN 1.25 THE
    # VALUE IN THE ENVELOPE BY LOCATION AGE YEAR SEX?

    # AND THEN WRITE A TABLE OF COMPARISONS OF DEATHS / ENVELOPE BY LOCATION
    # AGE YEAR SEX FOR REVIEW

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    if is_active is True:
        warnings.warn(
            """is_active is deprecated: use the update_nid_metadata_status
                         function to change the status of finalized datasets"""
        )

    # Use existing is_active and is_mort_active values, otherwise default to 0
    nid_map = pull_nid_metadata()
    df = df.merge(nid_map,
                  on=[
                      'nid', 'parent_nid', 'extract_type_id', 'source',
                      'data_type_id', 'code_system_id'
                  ],
                  how='left')

    df_na = df[pd.isnull(df['is_active'])]
    df_na = df_na[['nid', 'extract_type_id']].drop_duplicates()

    if df_na.shape[0] > 0:
        print("New rows for the following NID/extract_type_id will be added "
              "with is_active and is_mort_active = 0:\n{}".format(df_na))

    df['is_active'] = df['is_active'].fillna(0)
    df['is_mort_active'] = df['is_mort_active'].fillna(0)

    # CHECK SUBNATIONAL LOCATIONS
    df = check_subnational_locations(df)

    # OVERRIDE REPRESENTATIVE ID FOR NON-VR
    df = adjust_representative_id(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_formatted_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_formatted_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        # write to cod.source for new sources
        insert_source_id(source)

        nid_extracts = (input_df[['nid', 'extract_type_id']]
                        .drop_duplicates()
                        .to_records(index=False))
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        if refresh_cache:
            refresh_claude_nid_cache_files()

    return locals()
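A pattern worth noting in the block above is the NID-metadata merge: existing is_active / is_mort_active flags are pulled in with a left join, and rows with no prior metadata fall back to 0. Below is a minimal sketch of that pattern using hypothetical toy frames; a simplified two-column key stands in for the full merge key used with the real pull_nid_metadata() output.

import pandas as pd

df = pd.DataFrame({
    'nid': [100, 200],
    'extract_type_id': [1, 1],
    'deaths': [10.0, 20.0],
})
# stand-in for the frame returned by pull_nid_metadata()
nid_map = pd.DataFrame({
    'nid': [100],
    'extract_type_id': [1],
    'is_active': [1],
    'is_mort_active': [1],
})

df = df.merge(nid_map, on=['nid', 'extract_type_id'], how='left')
# nid 200 has no existing metadata, so its flags are NaN after the left join
df['is_active'] = df['is_active'].fillna(0)
df['is_mort_active'] = df['is_mort_active'].fillna(0)

The left join preserves every incoming row, so new NID/extract_type pairs enter the metadata table inactive by default rather than failing the merge.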
Example No. 19
    def special_cause_reassignment(self, df, code_system_id):
        """Replace the actual data cause under certain conditions.

        There are instances where a PI has good reason to
        believe that a certain group of deaths were assigned
        to the wrong cause, and it is known what cause to re-assign
        those deaths to. Implement here.

        This essentially allows mapping based on not just the cause
        and code system but based on other information like
        the location, NID, year, etc.

        It can also be used (sparingly) for hotfixes like
        changing all codes with values 'acause_digest_gastrititis'
        to be named 'acause_digest_gastritis'.

        Args:
            df (DataFrame): data with cause
            code_system_id (int): code system id of the data

        Returns:
            DataFrame: with any modifications
        """

        cache_args = {
            'force_rerun': False,
            'block_rerun': True,
            'cache_dir': 'standard',
            'cache_results': False
        }
        # Some SRS codes get redistributed differently than codes in
        # other ICD10 datasets
        df = add_nid_metadata(
            df, 'source', **cache_args
        )

        if (df['source'] == "India_SRS_states_report").any():
            print_log_message("Changing SRS codes to custom garbage groups")
            assert (df['source'] == "India_SRS_states_report").all()

            df = add_code_metadata(
                df, 'value', code_system_id=code_system_id,
                **cache_args
            )

            custom_grbg = pd.read_csv(
                self.cg.get_resource("srs_custom_garbage_groups")
            )
            custom_grbg = custom_grbg.query('active == 1')
            custom_grbg['value'] = custom_grbg['srs_custom_garbage_group']
            custom_grbg = add_code_metadata(
                custom_grbg, 'code_id', code_system_id=code_system_id,
                merge_col='value', **cache_args
            )
            custom_grbg = custom_grbg.rename(
                columns={'code_id': 'new_code_id'})
            custom_grbg = custom_grbg[['package_id', 'new_code_id']]

            gp_dfs = []
            for package_id in custom_grbg.package_id.unique():
                # NOTE: this queries the database, but there should never be
                # many SRS jobs running at once, so the load is acceptable
                gp_df = get_garbage_from_package(
                    code_system_id, package_id, package_arg_type="package_id"
                )
                assert len(gp_df) != 0, \
                    "Found 0 codes for package {}".format(package_id)
                gp_dfs.append(gp_df)
            gp_df = pd.concat(gp_dfs, ignore_index=True)

            gp_df = gp_df.merge(custom_grbg, how='left')
            report_if_merge_fail(gp_df, 'new_code_id', 'package_id')
            gp_df = gp_df[['value', 'new_code_id']]
            gp_df['value'] = gp_df['value'].str.strip()

            df = df.merge(gp_df, how='left', on='value')
            df.loc[df['new_code_id'].notnull(), 'code_id'] = df['new_code_id']
            df['code_id'] = df['code_id'].astype(int)
            df = df.drop(['new_code_id', 'value'], axis=1)

        df = df.drop('source', axis=1)

        china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
        # J96.00 - move the five-digit code to the four-digit J96.0
        # (this should be a rule in formatting: only keep four-digit detail)
        five_dig_code = df['code_id'] == 13243
        df.loc[
            china_cdc_2008 & five_dig_code,
            'code_id'
        ] = 13242

        return df
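The method closes with a direct, mask-based hotfix: boolean masks built from identifying columns, with code_id rewritten only where every condition holds. A minimal sketch of that mechanism on a toy frame (the third row uses an invented non-matching NID, included to show the masks leave other data untouched):

import pandas as pd

df = pd.DataFrame({
    'nid': [270005, 270005, 999999],
    'extract_type_id': [2, 2, 2],
    'code_id': [13243, 500, 13243],
})

china_cdc_2008 = (df['nid'] == 270005) & (df['extract_type_id'] == 2)
five_dig_code = df['code_id'] == 13243
df.loc[china_cdc_2008 & five_dig_code, 'code_id'] = 13242
# only the first row is rewritten; the same code_id under another NID stays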
Example No. 20
def finalize_formatting(df,
                        source,
                        write=False,
                        code_system_id=None,
                        extract_type=None,
                        conn_def='ADDRESS',
                        is_active=True):

    NID_META_COLS = [
        'nid', 'parent_nid', 'extract_type_id', 'source', 'data_type_id',
        'code_system_id', 'is_active'
    ]
    NID_LOCATION_YEAR_COLS = [
        'nid', 'extract_type_id', 'location_id', 'year_id', 'representative_id'
    ]
    FORMATTED_ID_COLS = [
        'nid', 'extract_type_id', 'code_id', 'sex_id', 'site_id', 'year_id',
        'age_group_id', 'location_id'
    ]
    if 'code_id' in df.columns:
        code_col = 'code_id'
        map_code_id = False
    elif 'cause' in df.columns:
        code_col = 'cause'
        map_code_id = True
    else:
        raise AssertionError("Need either 'code_id' or 'cause' in columns")
    INCOMING_EXPECTED_ID_COLS = [
        'nid', 'location_id', 'year_id', 'age_group_id', 'sex_id', code_col,
        'site', 'data_type_id', 'representative_id', 'code_system_id'
    ]
    VALUE_COLS = ['deaths']
    FINAL_FORMATED_COLS = FORMATTED_ID_COLS + VALUE_COLS

    missing_cols = set(INCOMING_EXPECTED_ID_COLS) - set(df.columns)
    if len(missing_cols) > 0:
        raise AssertionError(
            "These columns are needed for formatting but not found "
            "in df: {}".format(missing_cols))

    # SET FORMATTING TIMESTAMP
    format_timestamp = cod_timestamp()
    print("Finalizing formatting with timestamp {}".format(format_timestamp))

    # ADD SOURCE
    df['source'] = source

    # MAP OR CHECK CODE ID
    code_system_ids = df.code_system_id.unique()
    if map_code_id:
        cs_dfs = []
        for code_system_id in code_system_ids:
            cs_df = df.loc[df['code_system_id'] == code_system_id].copy()
            # map code_id to the data
            cs_df['value'] = cs_df['cause']
            cs_df = add_code_metadata(cs_df, ['code_id'],
                                      code_system_id=code_system_id,
                                      merge_col='value',
                                      force_rerun=True,
                                      cache_dir='standard')
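            # surface any unmapped cause values before the merge check fails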
            print(cs_df.loc[cs_df['code_id'].isnull()].value.unique())
            report_if_merge_fail(cs_df, ['code_id'], ['value'])
            cs_df = cs_df.drop('value', axis=1)
            cs_dfs.append(cs_df)
        df = pd.concat(cs_dfs, ignore_index=True)
    else:
    # CHECK THAT EVERY CODE_ID IS IN THE ENGINE ROOM AND IN THE CODE SYSTEM
        all_codes_q = """
            SELECT code_id
            FROM ADDRESS
            WHERE code_system_id IN ({})
        """.format(",".join([str(c) for c in code_system_ids]))
        all_codes = ezfuncs.query(all_codes_q, conn_def='engine')
        bad_codes = set(df.code_id) - set(all_codes.code_id)
        if len(bad_codes) > 0:
            print("Found these code ids in data that can't exist in code "
                  "systems {}: {}".format(code_system_ids, bad_codes))

    # MAP SITE ID
    df = map_site_id(df, conn_def=conn_def)
    # MAP EXTRACT TYPE ID
    df = map_extract_type_id(df, source, extract_type, conn_def=conn_def)

    # CHANGE SIX MINOR TERRITORIES TO AGGREGATE UNION LOCATIONS
    df = group_six_minor_territories(df, sum_cols=VALUE_COLS)

    df = df.loc[~((df['nid'] == 279644) & (df['year_id'] == 2011))]
    df = df.loc[~(df['nid'].isin([24143, 107307]))]

    # ENSURE NO NEGATIVES
    for val_col in VALUE_COLS:
        assert (df[val_col] >= 0).all(), \
            "there are negative values in {}".format(val_col)

    input_df = df[FINAL_FORMATED_COLS].copy()
    assert not input_df.isnull().values.any(), "null values in df"
    dupped = input_df[input_df.duplicated()]
    if len(dupped) > 0:
        raise AssertionError("duplicate values in df: \n{}".format(dupped))

    # GROUP IF NECESSARY
    if input_df[FORMATTED_ID_COLS].duplicated().any():
        input_df = input_df.groupby(FORMATTED_ID_COLS,
                                    as_index=False)[VALUE_COLS].sum()

    # MAKE NID METADATA TABLE
    if 'parent_nid' not in df.columns:
        df['parent_nid'] = np.nan

    df['is_active'] = 1 * is_active

    # CHECK SUBNATIONAL LOCATIONS
    # alters is_active if needed
    df = check_subnational_locations(df)

    nid_meta_df = df[NID_META_COLS].drop_duplicates()
    nid_meta_df['last_updated_timestamp'] = format_timestamp

    # MAKE NID LOCATION YEAR TABLE
    nid_locyears = df[NID_LOCATION_YEAR_COLS].drop_duplicates()
    nid_locyears['last_updated_timestamp'] = format_timestamp
    # check one iso3 per nid
    nid_locyears = add_location_metadata(nid_locyears, 'ihme_loc_id')
    nid_locyears['iso3'] = nid_locyears['ihme_loc_id'].str.slice(0, 3)
    report_duplicates(
        nid_locyears[['nid', 'extract_type_id', 'iso3']].drop_duplicates(),
        ['nid', 'extract_type_id'])
    nid_locyears = nid_locyears.drop(['ihme_loc_id', 'iso3'], axis=1)

    if write:
        # write nid metadata
        write_to_claude_nid_table(nid_meta_df,
                                  'claude_nid_metadata',
                                  replace=True,
                                  conn_def=conn_def)

        # write nid location-year map
        write_to_claude_nid_table(nid_locyears,
                                  'claude_nid_location_year',
                                  replace=True,
                                  conn_def=conn_def)

        insert_source_id(source)

        nid_extracts = (input_df[['nid', 'extract_type_id']]
                        .drop_duplicates()
                        .to_records(index=False))
        for nid, extract_type_id in nid_extracts:
            nid = int(nid)
            extract_type_id = int(extract_type_id)
            print("Writing nid {}, extract_type_id {}".format(
                nid, extract_type_id))
            idf = input_df.loc[(input_df['nid'] == nid) & (
                input_df['extract_type_id'] == extract_type_id)].copy()
            phase = 'formatted'
            launch_set_id = format_timestamp
            print("\nTotal deaths: {}".format(idf.deaths.sum()))
            write_phase_output(idf, phase, nid, extract_type_id, launch_set_id)

        # now refresh cache files for nid
        print("\nRefreshing claude nid metadata cache files")
        force_cache_options = {
            'force_rerun': True,
            'block_rerun': False,
            'cache_dir': "standard",
            'cache_results': True,
            'verbose': True
        }
        get_nid_metadata(**force_cache_options)
        get_nidlocyear_map(**force_cache_options)

    return locals()
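Both versions of finalize_formatting share the same duplicate-collapse step: when the formatted ID columns no longer uniquely identify rows, the value columns are summed within each ID group. A minimal sketch, with illustrative stand-ins for FORMATTED_ID_COLS and VALUE_COLS:

import pandas as pd

FORMATTED_ID_COLS = ['nid', 'code_id', 'sex_id']  # illustrative subset
VALUE_COLS = ['deaths']

input_df = pd.DataFrame({
    'nid': [1, 1, 2],
    'code_id': [10, 10, 10],
    'sex_id': [1, 1, 2],
    'deaths': [5.0, 3.0, 4.0],
})

# the two nid=1 rows are duplicates on the ID columns, so collapse them
if input_df[FORMATTED_ID_COLS].duplicated().any():
    input_df = input_df.groupby(FORMATTED_ID_COLS,
                                as_index=False)[VALUE_COLS].sum()
# result: the nid=1 row now carries deaths == 8.0

Guarding the groupby with a duplicated() check keeps the common case (already-unique IDs) a no-op while still guaranteeing one row per ID combination before the phase output is written.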