def validate_date_vars(self, row, date_vars):
        """Replace invalid date components with defaults to get an approximate date.

        For every entry in ``date_vars`` the suffixes 'd' (day), 'm' (month)
        and 'y' (year) are appended to build the names of the answers to
        check. Answers holding one of the known "invalid" markers are
        overwritten in place with a default (1 for day and month, 0 for year).
        Missing answers are logged at debug level and skipped.

        Args:
            row (dict): Row of VA data. Modified in place.
            date_vars (dict): Answers which contain date data. The container
                is iterated directly, so for a dict only its keys are used.
        """
        # suffix -> (values treated as invalid, replacement value)
        date_invalid = {
            'd': (['', '99', 99], 1),
            'm': (['', '99', 99], 1),
            'y': (['', '999', 999, '9999', 9999], 0),
        }
        for var in date_vars:
            for suffix, (invalid_values, default) in date_invalid.items():
                var_name = var + suffix
                try:
                    if row[var_name] in invalid_values:
                        row[var_name] = default
                except KeyError as e:
                    # e.args[0] is the missing key. ``e.message`` does not
                    # exist on Python 3 and would raise inside this handler.
                    warning_logger.debug(
                        'SID: {} variable \'{}\' does not exist. validate_date_vars'
                        .format(row['sid'], e.args[0]))
    def process_binary_vars(row, conversion_map):
        """
        Convert multiple value answers into binary cells.

        The answer stored under each header is converted to a string,
        stripped, split on single spaces and each token parsed as an integer.
        What happens to each integer depends on the type of the mapping:

        * dict: if the integer is a key, the cell named by the mapped value
          is set to 1.
        * list: the answer cell itself is overwritten with 1 if the integer
          is in the list, otherwise 0.
        * str: the answer cell is overwritten with the integer result of
          evaluating the string through ``LdapNotationParser``.

        :param row: Row of data. Modified in place.
        :param conversion_map: Iterable of (header, mapping) pairs.
        """
        for data_header, data_map in conversion_map:
            try:
                for value in map(int,
                                 str(row[data_header]).strip().split(' ')):
                    if isinstance(data_map, dict):
                        if value in data_map:
                            row[data_map[value]] = 1
                    elif isinstance(data_map, list):
                        # NOTE: the cell is overwritten for every parsed
                        # value, so with several values the last one wins.
                        row[data_header] = int(value in data_map)
                    elif isinstance(data_map, str):
                        row[data_header] = int(
                            LdapNotationParser(data_map, get_cell(row),
                                               int).evaluate())
            except ValueError:
                # No values to process or not an integer value (invalid).
                pass
            except ConversionError as e:
                # Prefer the exception's own ``message`` attribute when it has
                # one; fall back to str(e) where it does not (Python 3).
                warning_logger.debug(getattr(e, 'message', str(e)))
                continue
            except KeyError as e:
                # Variable does not exist. The new published form does not contain all of the previous variables.
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. process_binary_vars'
                    .format(row['sid'], e.args[0]))
                continue
    def process_age_vars(self, row):
        """Calculate and store age in years, months, and days.

        For every prefix in ``AGE_VARS`` the three answers suffixed 'a'
        (years), 'b' (months) and 'c' (days) are read through
        ``value_or_default`` and then each overwritten with the total age
        expressed in that unit, using 12 months/year, 365 days/year and
        30 days/month. If any of the three answers is missing, none of them
        is modified and the problem is logged at debug level.

        Args:
            row (dict): Row of VA data. Modified in place.
        """
        for age_var in AGE_VARS:
            years_key = '{:s}a'.format(age_var)
            months_key = '{:s}b'.format(age_var)
            days_key = '{:s}c'.format(age_var)
            try:
                # Read all three components before writing any of them, since
                # each output depends on all of the original inputs.
                years = value_or_default(row[years_key], float, [999, 9999])
                months = value_or_default(row[months_key], float, 99)
                days = value_or_default(row[days_key], float, 99)
            except KeyError as e:
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. process_age_vars'.
                    format(row['sid'], e.args[0]))
                continue

            row[years_key] = years + (months / 12.0) + (days / 365.0)
            row[months_key] = (12.0 * years) + months + (days / 30.0)
            row[days_key] = (365.0 * years) + (30.0 * months) + days
    def process_progressive_value_data(row, progressive_data):
        """
        Populate progressive variables from input data.
        Format:
        {
            'read variable': [
                (upper, variable),
                (median, variable),
                (lower, variable),
                (0, variable)
            ]
        }

        For each read variable the thresholds are tested in the order given;
        the first threshold that the value (converted with ``safe_float``)
        strictly exceeds decides which variable is written, and the remaining
        thresholds are skipped. The ``variable`` entry may be a plain header
        (written with 1) or a ``(header, value)`` tuple.

        :param row: Row of data. Modified in place.
        :param progressive_data: Quartile ranges in specified format, either
            as a dict or as an iterable of (read variable, thresholds) pairs.
        """
        # Accept the documented dict form as well as pre-built item pairs.
        if isinstance(progressive_data, dict):
            progressive_data = progressive_data.items()

        for read_header, conversion_data in progressive_data:
            for threshold, target in conversion_data:
                try:
                    if safe_float(row[read_header]) > threshold:
                        if isinstance(target, tuple):
                            write_header, write_value = target
                        else:
                            write_header, write_value = target, 1
                        row[write_header] = write_value
                        break
                except KeyError as e:
                    # e.args[0] is the missing key; KeyError has no
                    # ``message`` attribute on Python 3.
                    warning_logger.debug(
                        'SID: {} variable \'{}\' does not exist. process_progressive_value_data'
                        .format(row['sid'], e.args[0]))
                    # The read variable is missing for every threshold, so
                    # stop here instead of logging the same problem again.
                    break
    def convert_rash_data(self, row, conversion_data):
        """Specialized method to convert rash data into variables based on multiple choice questions.
        Split and store values from a space delimited list of integers in intermediate variables.
        If the three locations [1 (face), 2 (trunk), 3 (extremities)] values are specified, change answer to 4 (Everywhere).

        Conversion data format:
            {
                '#read_var': {
                    'vars': [quoted list of write vars],
                    'locations': [list of location values],
                    'everywhere': 4,
                    'values': [list of all valid answer values]
                }
            }

        If any parsed value is not listed under 'values' the whole answer is
        treated as empty and nothing is written. Answers that cannot be
        parsed as integers are skipped; missing answers are logged at debug
        level and skipped.

        Args:
            row (dict): Row of VA data. Modified in place.
            conversion_data (dict): Data structure with header and rash specific variable mapping.
        """
        for variable, mapping in conversion_data.items():
            try:
                rash_values = set(map(int, row[variable].split(' ')))
            except ValueError:
                # No rash data. Continue.
                continue
            except KeyError as e:
                # Variable does not exist.
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. convert_rash_data'
                    .format(row['sid'], e.args[0]))
                continue

            locations = set(mapping['locations'])
            if rash_values.difference(mapping['values']):
                # Treat the entire field as invalid if any invalid values
                # are present
                rash_values = set()

            if mapping['everywhere'] in rash_values or locations.issubset(
                    rash_values):
                # if all locations are selected, then change the value to 'everywhere'
                rash_values = {mapping['everywhere']}

            # FIXME: brittle/hacky handling of rash location
            # This happens to work for all the wrong reasons. We actually
            # are going to ignore all the values except those in the first
            # two listed variables (rashlocation3 doesn't map through to the
            # symptoms). We only care about the values 1 through 4. If 4 is
            # present it should be the only value. If 1,2,3 were selected it
            # should be 4 by now. Sorting ensures the subset of 1,2,3 if
            # present, are captured in the first two output variables.
            # Values of 5, 8, 9 may be listed at the end (rashlocation3 to
            # rashlocation5) and will be inadvertently dropped. Yup. Hacky.
            for index, value in enumerate(sorted(rash_values)):
                row[mapping['vars'][index]] = value
    def fix_rash_location(self, row):
        """Only rashes which are located on the face are relevant. Filter out other values.

        Applies only when the age group is CHILD. Each of 'c4_31_1' and
        'c4_32' is replaced by 1 if the whitespace-separated answer contains
        the token '1', otherwise by 0. A missing variable is logged at debug
        level and does not prevent the other variable from being processed.

        Args:
            row: Row of VA data. Modified in place.
        """
        if self.AGE_GROUP != common_data.CHILD:
            return

        for var in ['c4_31_1', 'c4_32']:
            try:
                row[var] = int('1' in str(row[var]).split())
            except KeyError as e:
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. fix_rash_location'
                    .format(row['sid'], e.args[0]))
    def fix_rash_length(self, row):
        """Only consider values in days for child rash data.

        When the unit code in 'c4_33a' parses as an integer other than 4, the
        length in 'c4_33b' is reset to 0. A unit code that cannot be parsed
        as an integer leaves the row untouched; a missing 'c4_33a' is logged
        at debug level.

        Args:
            row: Row of VA data. Modified in place.
        """
        try:
            unit_code = int(row['c4_33a'])
        except ValueError:
            # Blank or non-numeric unit code: nothing to fix.
            return
        except KeyError as e:
            # e.args[0] is the missing key; KeyError has no ``message``
            # attribute on Python 3.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. fix_rash_length'.
                format(row['sid'], e.args[0]))
            return

        if unit_code != 4:
            row['c4_33b'] = 0
    def validate_weight_vars(self, row, weight_vars):
        """Replace invalid weight data with a default value.

        Each listed answer is passed through ``value_or_default`` with ``int``
        as the converter, ``[0, 9999]`` as the invalid values and ``''`` as
        the replacement, and the result is stored back in the row. Missing
        answers are logged at debug level and skipped.

        Args:
            row (dict): Row of VA data. Modified in place.
            weight_vars (list): Answers which contain weight data.
        """
        for var in weight_vars:
            try:
                raw_weight = row[var]
            except KeyError as e:
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. validate_weight_vars'
                    .format(row['sid'], e.args[0]))
                continue
            row[var] = value_or_default(raw_weight, int, [0, 9999], '')
    def calculate_age_at_death_value(self, row):
        """Write age at death value to the appropriate variable.

        For the NEONATE age group, 'c1_26' is set to 1 when 'c1_25b' holds a
        value between 1 and 28 inclusive. For the CHILD age group, 'c1_26' is
        always set to 2. Other age groups leave the row untouched.

        Args:
            row (dict): Row of VA data. Modified in place.
        """
        if self.AGE_GROUP == common_data.NEONATE:
            try:
                value = value_or_default(row['c1_25b'], int, default=None)
            except KeyError as e:
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. calculate_age_at_death_value'
                    .format(row['sid'], e.args[0]))
            else:
                # ``value`` may be the None default; comparing None with an
                # int raises TypeError on Python 3, so guard explicitly.
                if value is not None and 1 <= value <= 28:
                    row['c1_26'] = 1
        elif self.AGE_GROUP == common_data.CHILD:
            row['c1_26'] = 2
    def fill_missing_data(self, row, default_fill):
        """Fill missing data with default fill values.

        An answer counts as missing only when it is exactly the empty string;
        such answers are replaced by the configured default. Headers that are
        absent from the row are logged at debug level and skipped.

        Args:
            row (dict): Row of VA data. Modified in place.
            default_fill (dict): Dictionary of headers and default values.
        """
        for variable, fill_value in default_fill.items():
            try:
                if row[variable] == '':
                    row[variable] = fill_value
            except KeyError as e:
                # Variable does not exist.
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. fill_missing_data'
                    .format(row['sid'], e.args[0]))
    def fix_agedays(self, row):
        """Fix child agedays.  If it's blank give it a 0, if it's not, give it a 4.

        Applies to the CHILD and NEONATE age groups only. 'c1_25b' is read
        through ``value_or_default`` with a default of None; 'c1_25a' is set
        to 0 when that yields None and to 4 otherwise. A missing 'c1_25b' is
        logged at debug level and leaves the row untouched.

        Args:
            row (dict): Row of VA data. Modified in place.
        """
        if self.AGE_GROUP not in (common_data.CHILD, common_data.NEONATE):
            return

        try:
            value = value_or_default(row['c1_25b'], int, default=None)
        except KeyError as e:
            # e.args[0] is the missing key; KeyError has no ``message``
            # attribute on Python 3.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. fix_agedays'.
                format(row['sid'], e.args[0]))
            return

        row['c1_25a'] = 0 if value is None else 4
    def convert_free_text_vars(self, row, data_headers, word_map):
        """Process all free text data from a list of data headers into binary variables.

        Every non-empty answer is split on single spaces and the resulting
        word list is handed to ``convert_free_text_words`` together with the
        row and the word map. Headers that are absent from the row are logged
        at debug level and skipped.

        Args:
            row (dict): Row of VA data.
            data_headers (list): Answers which contain free text data.
            word_map (dict): Dictionary of words and variables.
        """
        for data_header in data_headers:
            try:
                if row[data_header]:
                    word_list = row[data_header].split(' ')
                    self.convert_free_text_words(row, word_list, word_map)
            except KeyError as e:
                # KeyError has no ``message`` attribute on Python 3; use the
                # first argument (the missing key) instead. The helper call
                # is inside the try as well, so guard against a KeyError that
                # carries no arguments.
                missing = e.args[0] if e.args else ''
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. convert_free_text_vars'
                    .format(row['sid'], missing))
Example #13
0
    def copy_variables(self, row, copy_variables_map):
        """Copy data from one variable to another.

        Copy Variables Map Format:
            {
                'read variable': 'write variable'
            }

        A read variable that is absent from the row is logged at debug level
        and its write variable is left unchanged.

        Args:
            row (dict): Row of VA data. Modified in place.
            copy_variables_map (dict): Read and write answer variables.
        """
        for read_header, write_header in copy_variables_map.items():
            try:
                row[write_header] = row[read_header]
            except KeyError as e:
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. copy_variables'.
                    format(row['sid'], e.args[0]))
Example #14
0
    def process_cutoff_data(self, row, cutoff_data_map):
        """Change read variable to 1/0 if value is greater/less or equal to cutoff, respectively.

        The answer is converted with ``float`` and replaced by 1 when it is
        greater than or equal to the cutoff, otherwise by 0. Answers that
        cannot be converted (``ValueError``) become 0. Missing answers are
        logged at debug level and skipped.

        Cutoff data map Format:
            {
                'read variable': cutoff value
            }

        Args:
            row (dict): Row of VA data. Modified in place.
            cutoff_data_map (dict): Cutoff data in specified format. An
                iterable of (read variable, cutoff value) pairs is accepted
                as well.
        """
        # Accept the documented dict form as well as pre-built item pairs.
        if isinstance(cutoff_data_map, dict):
            cutoff_data_map = cutoff_data_map.items()

        for read_header, cutoff_data in cutoff_data_map:
            try:
                row[read_header] = int(float(row[read_header]) >= cutoff_data)
            except ValueError:
                row[read_header] = 0
            except KeyError as e:
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. process_cutoff_data'
                    .format(row['sid'], e.args[0]))
Example #15
0
    def post_process_binary_variables(self, row, binary_variables):
        """Ensure all binary variables are actually 1 or 0.

        Each listed answer is parsed with ``int``; it becomes 1 only when the
        parsed value is exactly 1 and 0 in every other case, including when
        parsing fails with ``ValueError``. Missing answers are logged at
        debug level and skipped.

        Binary variables Format:
            [list of binary variables]

        Args:
            row (dict): Row of VA data. Modified in place.
            binary_variables (list): Binary variable list.
        """
        for read_header in binary_variables:
            try:
                value = int(row[read_header])
            except ValueError:
                value = 0
            except KeyError as e:
                # Variable does not exist.
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. post_process_binary_variables'
                    .format(row['sid'], e.args[0]))
                continue
            row[read_header] = int(value == 1)
    def calculate_duration_vars(self, row, duration_vars, special_case_vars):
        """Calculate duration variables in days.

        For each variable the unit code is read from '<var>a' and the length
        from '<var>b', both through ``value_or_default``. The result written
        to '<var>' is the length multiplied by the ``TIME_FACTORS`` entry for
        the unit code (0 when the code is unknown). Variables listed in
        ``special_case_vars`` receive their special value instead when the
        raw length answer is the empty string. If either input answer is
        missing the variable is logged at debug level and skipped.

        Args:
            row (dict): Row of VA data. Modified in place.
            duration_vars (list): Answers which contain duration variables.
            special_case_vars (dict): Dictionary of special variables and their value if duration is blank.
        """
        for var in duration_vars:
            code_var, length_var = '{}a'.format(var), '{}b'.format(var)
            try:
                code_value = value_or_default(row[code_var])
                length_value = value_or_default(row[length_var])
            except KeyError as e:
                # Variable does not exist.
                # e.args[0] is the missing key; KeyError has no ``message``
                # attribute on Python 3.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. calculate_duration_vars'
                    .format(row['sid'], e.args[0]))
                continue

            if var in special_case_vars and row[length_var] == '':
                row[var] = special_case_vars[var]
            else:
                row[var] = TIME_FACTORS.get(code_value, 0) * length_value
Example #17
0
    def run(self):
        """Run the complete analysis from raw input file to output files.

        Steps, in order: clean the input headers into the intermediate
        folder, log the analysis parameters, detect which questionnaire form
        the input uses, build every processing stage, register the stages
        for abort handling, and finally execute them. Completion is reported
        through ``self._complete`` as DONE, ABORT (on ``AbortException``) or
        FAIL (empty source file or any other exception).
        """
        status_logger.info('Preparing variable headers.')
        status_notifier.update({'progress': (0, 15), 'sub_progress': None})

        intermediate_dir = intermediate_dir_path(self.output_dir_path)
        figures_dir = os.path.join(self.output_dir_path, 'figures')
        clean_headers_path = os.path.join(intermediate_dir,
                                          CLEAN_HEADERS_FILENAME)

        self.make_dir(intermediate_dir_path(self.output_dir_path))

        try:
            self.format_headers(self.input_file_path, clean_headers_path)
        except StopIteration:
            # File doesn't contain data
            message = 'Source file "{}" does not contain data.'.format(
                self.input_file_path)
            self._complete(CompletionStatus.FAIL, message)
            warning_logger.warning(message)
            return

        # Record the parameters of this analysis in the report log.
        report_lines = (
            'Analysis parameters:',
            '- Input file: {}'.format(self.input_file_path),
            '- Output folder: {}'.format(self.output_dir_path),
            '- Country: {}'.format(self.country),
            '- HIV Region: {}'.format(self.options.get('hiv', True)),
            '- Malaria Region: {}'.format(self.options.get('malaria', True)),
            '',
        )
        for line in report_lines:
            report_logger.info(line)

        # Work out which questionnaire produced the input file.
        who_questionnaire = self.who_questionaire_test(clean_headers_path)
        if who_questionnaire:
            self.short_form = True
            form_name = 'WHO 2016 Questionnaire'
        else:
            self.short_form = self.short_form_test(clean_headers_path)
            warning_logger.debug('Detected {} form'.format(
                'short' if self.short_form else 'standard'))
            form_name = ('PHMRC Shortened Questionnaire'
                         if self.short_form else 'PHMRC Full Questionnaire')
        report_logger.info('Detected {}'.format(form_name))

        out_dir = self.output_dir_path
        who_prep = WHOPrep(out_dir)
        common_prep = CommonPrep(out_dir, self.short_form)

        # One list of stages per age group, in adult, child, neonate order;
        # within a group: pre-symptom, rules, symptom, tariff.
        age_group_inputs = (
            (adult_pre_symptom_data, common_data.ADULT, ADULT_RULES,
             adult_symptom_data, adult_tariff_data),
            (child_pre_symptom_data, common_data.CHILD, CHILD_RULES,
             child_symptom_data, child_tariff_data),
            (neonate_pre_symptom_data, common_data.NEONATE, NEONATE_RULES,
             neonate_symptom_data, neonate_tariff_data),
        )
        pipelines = []
        for pre_data, age_group, rules, symptom_data, tariff_data in age_group_inputs:
            pipelines.append([
                PreSymptomPrep(pre_data, out_dir, self.short_form),
                RulesPrep(out_dir, self.short_form, age_group, rules),
                SymptomPrep(symptom_data, out_dir, self.short_form),
                TariffPrep(tariff_data, out_dir, self.short_form,
                           self.options, self.country),
            ])

        legacy = self.options.get('legacy_format', False)
        output = OutputPrep(out_dir, reorganize=not legacy,
                            keep_orig=legacy, short_form=self.short_form,
                            free_text=self.options.get('free_text', True),
                            hce=self.options.get('hce', True))
        cause_grapher = CauseGrapher(out_dir)
        csmf_grapher = CSMFGrapher(out_dir)

        # Register the stages for abort handling (the output stage is not
        # part of this list).
        abortable = [who_prep, common_prep]
        for stages in pipelines:
            abortable.extend(stages)
        abortable.extend([cause_grapher, csmf_grapher])
        self._abort_list.extend(abortable)

        try:
            if who_questionnaire:
                who_prep.run()

            # makes adult-prepped.csv, child-prepped.csv, neonate-prepped.csv
            adult_data, child_data, neonate_data = common_prep.run()

            # For each age group that has data, run its stages in order:
            # <group>-presymptom.csv, <group>-logic-rules.csv,
            # <group>-symptom.csv, then the group's output files.
            group_data = (adult_data, child_data, neonate_data)
            for has_data, stages in zip(group_data, pipelines):
                if has_data:
                    for stage in stages:
                        stage.run()

            if self.options.get('figures') and (adult_data or child_data or neonate_data):
                self.make_dir(figures_dir)
                # generate all cause graphs
                cause_grapher.run()
                # generate all csmf graphs
                csmf_grapher.run()

            output.run()

        except AbortException:
            self._complete(CompletionStatus.ABORT)
        except Exception:
            traceback.print_exc()
            self._complete(CompletionStatus.FAIL)
        else:
            self._complete(CompletionStatus.DONE)