def validate_date_vars(self, row, date_vars):
    """Replace invalid date components with defaults to get an approximate date.

    Every entry of ``date_vars`` is a variable prefix. The suffixes 'd' (day),
    'm' (month) and 'y' (year) are appended to it to locate the answers that
    hold the individual date components.

    Args:
        row (dict): Row of VA data. Modified in place.
        date_vars (dict): Answers which contain date data. The object is
            iterated directly, so for a dict only its keys are used.
    """
    # suffix -> (values treated as invalid, value written in their place)
    date_invalid = {
        'd': (['', '99', 99], 1),
        'm': (['', '99', 99], 1),
        'y': (['', '999', 999, '9999', 9999], 0),
    }

    for var in date_vars:
        for suffix, (invalid_data, default) in date_invalid.items():
            var_name = var + suffix
            try:
                if row[var_name] in invalid_data:
                    row[var_name] = default
            except KeyError as e:
                # e.args[0] is the missing key. ``e.message`` is avoided
                # because it does not exist on Python 3 exceptions.
                warning_logger.debug(
                    'SID: {} variable \'{}\' does not exist. validate_date_vars'
                    .format(row['sid'], e.args[0]))
def process_binary_vars(row, conversion_map):
    """
    Convert multiple value answers into binary cells.

    The answer stored under each header is read as a space delimited list of
    integers. What happens to each integer depends on the type of the mapping:

    * dict: for every value that is a key of the mapping, the cell named by
      the mapped header is set to 1.
    * list: the answer cell itself is overwritten with 1/0 depending on
      whether the value is in the list (the last value processed wins).
    * str: the answer cell is overwritten with the integer result of
      evaluating the string with ``LdapNotationParser``.

    Answers that are blank or contain a non-integer token are left untouched.

    :param row: Row of data. Modified in place.
    :param conversion_map: Iterable of (header, mapping) pairs.
    """
    for data_header, data_map in conversion_map:
        try:
            # Convert every token before writing anything. A lazy ``map``
            # (Python 3) would write cells for the leading tokens and only
            # then fail on an invalid one, leaving the row half updated.
            values = [int(token) for token in str(row[data_header]).strip().split(' ')]
            for value in values:
                if isinstance(data_map, dict):
                    if value in data_map:
                        row[data_map[value]] = 1
                elif isinstance(data_map, list):
                    row[data_header] = int(value in data_map)
                elif isinstance(data_map, str):
                    row[data_header] = int(
                        LdapNotationParser(data_map, get_cell(row), int).evaluate())
        except ValueError:
            # No values to process or not an integer value (invalid).
            pass
        except ConversionError as e:
            # Prefer the exception's own ``message`` attribute when it has
            # one; fall back to its string form otherwise.
            warning_logger.debug(getattr(e, 'message', str(e)))
        except KeyError as e:
            # Variable does not exist. The new published form does not
            # contain all of the previous variables.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. process_binary_vars'
                .format(row['sid'], e.args[0]))
def process_age_vars(self, row):
    """Calculate and store age in years, months, and days.

    For every prefix in ``AGE_VARS`` the 'a' (years), 'b' (months) and 'c'
    (days) answers are read and each is then overwritten with the total age
    expressed in that unit, using 12 months/year, 365 days/year and
    30 days/month.

    Args:
        row (dict): Row of VA data. Modified in place.
    """
    for age_var in AGE_VARS:
        years_var = '{:s}a'.format(age_var)
        months_var = '{:s}b'.format(age_var)
        days_var = '{:s}c'.format(age_var)
        try:
            # NOTE(review): the third argument presumably lists the
            # "unknown" codes to be replaced - confirm in value_or_default.
            years = value_or_default(row[years_var], float, [999, 9999])
            months = value_or_default(row[months_var], float, 99)
            days = value_or_default(row[days_var], float, 99)
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. process_age_vars'
                .format(row['sid'], e.args[0]))
            continue

        # All three components were read before any cell is overwritten.
        row[years_var] = years + (months / 12.0) + (days / 365.0)
        row[months_var] = (12.0 * years) + months + (days / 30.0)
        row[days_var] = (365.0 * years) + (30.0 * months) + days
def process_progressive_value_data(row, progressive_data):
    """
    Populate progressive variables from input data.

    Format:
    {
        'read variable': [
            (upper, variable),
            (median, variable),
            (lower, variable),
            (0, variable)
        ]
    }

    For each read variable the thresholds are tried in the listed order and
    only the first one that the value strictly exceeds is applied. The target
    may be a header (the cell is set to 1) or a (header, value) tuple.

    :param row: Row of data. Modified in place.
    :param progressive_data: Quartile ranges in the format above, supplied as
        an iterable of (read variable, thresholds) pairs.
    """
    for read_header, conversion_data in progressive_data:
        # Look the answer up once per variable so that a missing variable is
        # reported a single time instead of once per threshold.
        try:
            reading = safe_float(row[read_header])
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. process_progressive_value_data'
                .format(row['sid'], e.args[0]))
            continue

        for threshold, target in conversion_data:
            if reading > threshold:
                if isinstance(target, tuple):
                    write_header, write_value = target
                else:
                    write_header, write_value = target, 1
                row[write_header] = write_value
                break
def convert_rash_data(self, row, conversion_data):
    """Specialized method to convert rash data into variables based on multiple choice questions.

    Split and store values from a space delimited list of integers in
    intermediate variables. If the three locations [1 (face), 2 (trunk),
    3 (extremities)] values are specified, change answer to 4 (Everywhere).

    Conversion data format:
    {
        '#read_var': {
            'vars': [quoted list of write vars],
            'locations': [list of location values],
            'values': [list of all valid answer values],
            'everywhere': 4
        }
    }

    Args:
        row (dict): Row of VA data. Modified in place.
        conversion_data (dict): Data structure with header and rash specific
            variable mapping.
    """
    for variable, mapping in conversion_data.items():
        try:
            rash_values = set(map(int, row[variable].split(' ')))
        except ValueError:
            # No rash data. Continue.
            continue
        except KeyError as e:
            # Variable does not exist. e.args[0] is the missing key;
            # ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. convert_rash_data'
                .format(row['sid'], e.args[0]))
            continue

        if rash_values.difference(mapping['values']):
            # Treat the entire field as invalid if any invalid values
            # are present.
            rash_values = set()

        locations = set(mapping['locations'])
        if mapping['everywhere'] in rash_values or locations.issubset(rash_values):
            # If all locations are selected, or 'everywhere' was selected
            # explicitly, 'everywhere' becomes the only value.
            rash_values = {mapping['everywhere']}

        # FIXME: brittle/hacky handling of rash location
        # This happens to work for all the wrong reasons. We actually
        # are going to ignore all the values except those in the first
        # two listed variables (rashlocation3 doesn't map through to the
        # symptoms). We only care about the values 1 through 4. If 4 is
        # present it should be the only value. If 1,2,3 were selected it
        # should be 4 by now. Sorting ensures the subset of 1,2,3 if
        # present, are captured in the first two output variables.
        # Values of 5, 8, 9 may be listed at the end (rashlocation3 to
        # rashlocation5) and will be inadvertently dropped. Yup. Hacky.
        for index, value in enumerate(sorted(rash_values)):
            row[mapping['vars'][index]] = value
def fix_rash_location(self, row):
    """Only rashes which are located on the face are relevant. Filter out other values.

    For the child age group, 'c4_31_1' and 'c4_32' are each replaced by 1
    when the whitespace separated answer contains the token '1', else by 0.
    Other age groups are left untouched.

    Args:
        row: Row of VA data. Modified in place.
    """
    if self.AGE_GROUP != common_data.CHILD:
        return

    # Both variables share one try block: when the first is missing the
    # second is not processed either.
    try:
        for var in ['c4_31_1', 'c4_32']:
            row[var] = int('1' in str(row[var]).split())
    except KeyError as e:
        # e.args[0] is the missing key; ``e.message`` is Python 2 only.
        warning_logger.debug(
            'SID: {} variable \'{}\' does not exist. fix_rash_location'
            .format(row['sid'], e.args[0]))
def fix_rash_length(self, row):
    """Only consider values in days for child rash data.

    When the unit answer 'c4_33a' is an integer other than 4, the length
    answer 'c4_33b' is reset to 0. A unit answer that cannot be parsed as an
    integer leaves the row untouched.

    Args:
        row: Row of VA data. Modified in place.
    """
    # NOTE(review): 4 is presumably the unit code for "days" - confirm
    # against the questionnaire.
    try:
        if int(row['c4_33a']) != 4:
            row['c4_33b'] = 0
    except ValueError:
        # Blank or non-numeric unit code: nothing to fix.
        pass
    except KeyError as e:
        # e.args[0] is the missing key; ``e.message`` is Python 2 only.
        warning_logger.debug(
            'SID: {} variable \'{}\' does not exist. fix_rash_length'
            .format(row['sid'], e.args[0]))
def validate_weight_vars(self, row, weight_vars):
    """Replace invalid weight data with a default value.

    Args:
        row (dict): Row of VA data. Modified in place.
        weight_vars (list): Answers which contain weight data.
    """
    for var in weight_vars:
        try:
            # NOTE(review): presumably 0 and 9999 are the invalid codes and
            # '' the replacement - confirm against value_or_default.
            row[var] = value_or_default(row[var], int, [0, 9999], '')
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. validate_weight_vars'
                .format(row['sid'], e.args[0]))
def calculate_age_at_death_value(self, row):
    """Write age at death value to the appropriate variable.

    For neonates 'c1_26' is set to 1 when 'c1_25b' holds a value from 1 to 28
    inclusive. For children 'c1_26' is always set to 2. Other age groups are
    left untouched.

    Args:
        row (dict): Row of VA data. Modified in place.
    """
    if self.AGE_GROUP == common_data.NEONATE:
        try:
            value = value_or_default(row['c1_25b'], int, default=None)
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. calculate_age_at_death_value'
                .format(row['sid'], e.args[0]))
        else:
            # The None default must be excluded explicitly: comparing None
            # with an int raises TypeError on Python 3.
            if value is not None and 1 <= value <= 28:
                row['c1_26'] = 1
    elif self.AGE_GROUP == common_data.CHILD:
        row['c1_26'] = 2
def fill_missing_data(self, row, default_fill):
    """Fill missing data with default fill values.

    Only cells whose current value is exactly the empty string are replaced.

    Args:
        row (dict): Row of VA data. Modified in place.
        default_fill (dict): Dictionary of headers and default values.
    """
    for variable, value in default_fill.items():
        try:
            if row[variable] == '':
                row[variable] = value
        except KeyError as e:
            # Variable does not exist. e.args[0] is the missing key;
            # ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. fill_missing_data'
                .format(row['sid'], e.args[0]))
def fix_agedays(self, row):
    """Fix child agedays. If it's blank give it a 0, if it's not, give it a 4.

    Applies to the child and neonate age groups only: 'c1_25a' is set to 0
    when 'c1_25b' resolves to None and to 4 otherwise.

    Args:
        row (dict): Row of VA data. Modified in place.
    """
    if self.AGE_GROUP in (common_data.CHILD, common_data.NEONATE):
        try:
            value = value_or_default(row['c1_25b'], int, default=None)
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. fix_agedays'
                .format(row['sid'], e.args[0]))
        else:
            row['c1_25a'] = 0 if value is None else 4
def convert_free_text_vars(self, row, data_headers, word_map):
    """Process all free text data from a list of data headers into binary variables.

    Every non-empty answer is split on single spaces and the resulting word
    list is handed to ``convert_free_text_words``.

    Args:
        row (dict): Row of VA data.
        data_headers (list): Answers which contain free text data.
        word_map (dict): Dictionary of words and variables.
    """
    for data_header in data_headers:
        try:
            if row[data_header]:
                word_list = row[data_header].split(' ')
                self.convert_free_text_words(row, word_list, word_map)
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            # A KeyError raised inside convert_free_text_words is reported
            # here as well, because the call sits inside the try block.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. convert_free_text_vars'
                .format(row['sid'], e.args[0]))
def copy_variables(self, row, copy_variables_map):
    """Copy data from one variable to another.

    Copy Variables Map Format:
        {
            'read variable': 'write variable'
        }

    Args:
        row (dict): Row of VA data. Modified in place.
        copy_variables_map (dict): Read and write answer variables.
    """
    for read_header, write_header in copy_variables_map.items():
        try:
            row[write_header] = row[read_header]
        except KeyError as e:
            # The read variable is missing. e.args[0] is the missing key;
            # ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. copy_variables'
                .format(row['sid'], e.args[0]))
def process_cutoff_data(self, row, cutoff_data_map):
    """Change read variable to 1 if its value is greater than or equal to the cutoff, else 0.

    Values which cannot be parsed as a number are set to 0.

    Cutoff data map Format:
        {
            'read variable': cutoff value
        }

    Args:
        row (dict): Row of VA data. Modified in place.
        cutoff_data_map (dict): Cutoff data in specified format. An iterable
            of (read variable, cutoff value) pairs, such as the result of
            ``dict.items()``, is accepted as well.
    """
    # Iterating a dict directly yields only its keys, which cannot be
    # unpacked into (header, cutoff); use its items in that case.
    if isinstance(cutoff_data_map, dict):
        cutoff_pairs = cutoff_data_map.items()
    else:
        cutoff_pairs = cutoff_data_map

    for read_header, cutoff_data in cutoff_pairs:
        try:
            row[read_header] = int(float(row[read_header]) >= cutoff_data)
        except ValueError:
            row[read_header] = 0
        except KeyError as e:
            # e.args[0] is the missing key; ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. process_cutoff_data'
                .format(row['sid'], e.args[0]))
def post_process_binary_variables(self, row, binary_variables):
    """Ensure all binary variables are actually 1 or 0.

    A cell becomes 1 only when its value parses as the integer 1. Every
    other value, including anything that is not an integer, becomes 0.

    Binary variables Format:
        [list of binary variables]

    Args:
        row (dict): Row of VA data. Modified in place.
        binary_variables (list): Binary variable list.
    """
    for read_header in binary_variables:
        try:
            value = int(row[read_header])
        except ValueError:
            value = 0
        except KeyError as e:
            # Variable does not exist. e.args[0] is the missing key;
            # ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. post_process_binary_variables'
                .format(row['sid'], e.args[0]))
            continue
        row[read_header] = int(value == 1)
def calculate_duration_vars(self, row, duration_vars, special_case_vars):
    """Calculate duration variables in days.

    For each duration variable the 'a' answer (unit code) and the 'b' answer
    (length) are read and the result is stored under the bare variable name.
    The unit code is mapped through ``TIME_FACTORS``; unknown codes get a
    factor of 0.

    Args:
        row (dict): Row of VA data. Modified in place.
        duration_vars (list): Answers which contain duration variables.
        special_case_vars (dict): Dictionary of special variables and their
            value if duration is blank.
    """
    for var in duration_vars:
        code_var, length_var = '{}a'.format(var), '{}b'.format(var)
        try:
            code_value = value_or_default(row[code_var])
            length_value = value_or_default(row[length_var])
        except KeyError as e:
            # Variable does not exist. e.args[0] is the missing key;
            # ``e.message`` is Python 2 only.
            warning_logger.debug(
                'SID: {} variable \'{}\' does not exist. calculate_duration_vars'
                .format(row['sid'], e.args[0]))
            continue

        # The blank check uses the raw answer, not the defaulted value.
        if var in special_case_vars and row[length_var] == '':
            row[var] = special_case_vars[var]
        else:
            row[var] = TIME_FACTORS.get(code_value, 0) * length_value
def run(self):
    """Run the whole analysis from header clean-up to final output.

    The steps, in order: write the cleaned-header file, log the analysis
    parameters, detect which questionnaire form the input uses, construct
    every preparation step, register the steps in ``self._abort_list``, and
    execute them. The outcome is reported through ``self._complete`` as
    FAIL (no data, or any unexpected exception), ABORT (``AbortException``)
    or DONE.
    """
    status_logger.info('Preparing variable headers.')
    status_notifier.update({'progress': (0, 15), 'sub_progress': None})

    intermediate_dir = intermediate_dir_path(self.output_dir_path)
    figures_dir = os.path.join(self.output_dir_path, 'figures')

    self.make_dir(intermediate_dir_path(self.output_dir_path))

    try:
        self.format_headers(self.input_file_path, os.path.join(intermediate_dir, CLEAN_HEADERS_FILENAME))
    except StopIteration:
        # File doesn't contain data
        message = 'Source file "{}" does not contain data.'.format(self.input_file_path)
        self._complete(CompletionStatus.FAIL, message)
        warning_logger.warning(message)
        return

    report_logger.info('Analysis parameters:')
    report_logger.info('- Input file: {}'.format(self.input_file_path))
    report_logger.info('- Output folder: {}'.format(self.output_dir_path))
    report_logger.info('- Country: {}'.format(self.country))
    report_logger.info('- HIV Region: {}'.format(self.options.get('hiv', True)))
    report_logger.info('- Malaria Region: {}'.format(self.options.get('malaria', True)))
    report_logger.info('')

    # Form detection reads the cleaned-header file written above.
    file_path = os.path.join(intermediate_dir, CLEAN_HEADERS_FILENAME)
    who_questionnaire = self.who_questionaire_test(file_path)

    if who_questionnaire:
        # WHO input is processed with the short form setting enabled.
        self.short_form = True
        form_name = 'WHO 2016 Questionnaire'
    else:
        self.short_form = self.short_form_test(file_path)
        warning_logger.debug('Detected {} form'.format(
            'short' if self.short_form else 'standard'))
        if self.short_form:
            form_name = 'PHMRC Shortened Questionnaire'
        else:
            form_name = 'PHMRC Full Questionnaire'
    report_logger.info('Detected {}'.format(form_name))

    # Every step is constructed up front, before any of them runs.
    who_prep = WHOPrep(self.output_dir_path)
    common_prep = CommonPrep(self.output_dir_path, self.short_form)
    adult_pre_symptom = PreSymptomPrep(adult_pre_symptom_data, self.output_dir_path, self.short_form)
    adult_rules = RulesPrep(self.output_dir_path, self.short_form, common_data.ADULT, ADULT_RULES)
    adult_symptom = SymptomPrep(adult_symptom_data, self.output_dir_path, self.short_form)
    adult_results = TariffPrep(adult_tariff_data, self.output_dir_path, self.short_form, self.options, self.country)
    child_pre_symptom = PreSymptomPrep(child_pre_symptom_data, self.output_dir_path, self.short_form)
    child_rules = RulesPrep(self.output_dir_path, self.short_form, common_data.CHILD, CHILD_RULES)
    child_symptom = SymptomPrep(child_symptom_data, self.output_dir_path, self.short_form)
    child_results = TariffPrep(child_tariff_data, self.output_dir_path, self.short_form, self.options, self.country)
    neonate_pre_symptom = PreSymptomPrep(neonate_pre_symptom_data, self.output_dir_path, self.short_form)
    neonate_rules = RulesPrep(self.output_dir_path, self.short_form, common_data.NEONATE, NEONATE_RULES)
    neonate_symptom = SymptomPrep(neonate_symptom_data, self.output_dir_path, self.short_form)
    neonate_results = TariffPrep(neonate_tariff_data, self.output_dir_path, self.short_form, self.options, self.country)
    legacy = self.options.get('legacy_format', False)
    output = OutputPrep(self.output_dir_path, reorganize=not legacy,
                        keep_orig=legacy, short_form=self.short_form,
                        free_text=self.options.get('free_text', True),
                        hce=self.options.get('hce', True))
    cause_grapher = CauseGrapher(self.output_dir_path)
    csmf_grapher = CSMFGrapher(self.output_dir_path)

    # NOTE(review): ``output`` is the only step not added to the abort
    # list - confirm that this is intentional.
    self._abort_list.extend([
        who_prep,
        common_prep,
        adult_pre_symptom,
        adult_rules,
        adult_symptom,
        adult_results,
        child_pre_symptom,
        child_rules,
        child_symptom,
        child_results,
        neonate_pre_symptom,
        neonate_rules,
        neonate_symptom,
        neonate_results,
        cause_grapher,
        csmf_grapher,
    ])

    try:
        if who_questionnaire:
            who_prep.run()

        # makes adult-prepped.csv, child-prepped.csv, neonate-prepped.csv
        adult_data, child_data, neonate_data = common_prep.run()

        # Each age group's steps run only when common_prep reported data
        # for that group.
        if adult_data:
            # makes adult-presymptom.csv
            adult_pre_symptom.run()
            # makes adult-logic-rules.csv
            adult_rules.run()
            # makes adult-symptom.csv
            adult_symptom.run()
            # creates adult output files
            adult_results.run()

        if child_data:
            # makes child-presymptom.csv
            child_pre_symptom.run()
            # makes child-logic-rules.csv
            child_rules.run()
            # makes child-symptom.csv
            child_symptom.run()
            # creates child output files
            child_results.run()

        if neonate_data:
            # makes neonate-presymptom.csv
            neonate_pre_symptom.run()
            # makes neonate-logic-rules.csv
            neonate_rules.run()
            # makes neonate-symptom.csv
            neonate_symptom.run()
            # creates neonate output files
            neonate_results.run()

        # Figures need the 'figures' option and data for at least one group.
        if self.options.get('figures') and (adult_data or child_data or neonate_data):
            self.make_dir(figures_dir)
            # generate all cause graphs
            cause_grapher.run()
            # generate all csmf graphs
            csmf_grapher.run()

        output.run()

    except AbortException:
        self._complete(CompletionStatus.ABORT)
    except Exception:
        # Top-level boundary: print the traceback and report failure
        # instead of letting the exception propagate.
        traceback.print_exc()
        self._complete(CompletionStatus.FAIL)
    else:
        self._complete(CompletionStatus.DONE)