def report_row(flag_where):
    """Return an English description of which rows were flagged.

    Args:
        flag_where (Pandas Series): boolean Pandas Series in which True
            marks a failed validation check. NA entries are treated as
            not flagged.

    Returns:
        str: a string reporting the index labels of the flagged rows.
            When the index is numeric, quoted integer labels are rewritten
            as row numbers (``'3'`` becomes ``#3``); otherwise the output
            of `report_list` is returned unchanged.

    Examples:
        >>> report_row(pd.Series([True, True, False, True, False]))
        '#0, #1, and #3'
    """
    flagged_labels = flag_where[flag_where.fillna(False)].index
    report = report_list(flagged_labels.tolist(), paren=False)
    # `Index.is_numeric()` is deprecated since pandas 2.0 and removed in
    # 3.0, so the dtype is inspected through the public type-checking API.
    # The regex only touches quoted integer labels, so non-integer numeric
    # indexes still come back exactly as `report_list` produced them.
    if pd.api.types.is_numeric_dtype(flagged_labels):
        return re.sub(r"'(\d+)'", r"#\1", report)
    return report
def no_extraneous(self, given, relevant, value_type):
    """Add a check that every value in `given` also appears in `relevant`.

    Values of `given` that are missing from `relevant` are counted and
    listed (at most five are passed through to the report) in the failure
    message. The failing outcome is registered as `Err`.

    Args:
        given (Pandas Series): the items representing input given
        relevant (Pandas Series): all items in `given` that will be used
        value_type (str): string describing the kind of noun that is
            listed in `given`

    Returns:
        None

    Examples:
        >>> v = Validation()
        >>> v.no_extraneous(pd.Series(["a","b"], name="example input"),
        ...     pd.Series(["a","b","c"], name="relevant value(s)"),
        ...     "example")
        >>> v.no_extraneous(pd.Series(["a","b","c"], name="example input"),
        ...     pd.Series(["a","d"], name="relevant value(s)"), "example")
        >>> v.report(verbose=4)
        Validating . . .
        <BLANKLINE>
         CHECKS PASSED
        [X] No extraneous example found in example input.
        <BLANKLINE>
         ERRORS
        [!] 2 extraneous example(s) found in example input ('b', and 'c')
        Extraneous example(s) will be ommitted.
    """
    # True marks an entry of `given` that is absent from `relevant`
    is_extraneous = ~given.isin(relevant)

    plural_type = value_type + "(s)"
    source_name = str(given.name)
    extraneous_count = str(is_extraneous.sum())
    extraneous_listing = report_list(given[is_extraneous], limit=5)

    fail_parts = [
        extraneous_count, "extraneous", plural_type,
        "found in", source_name, extraneous_listing,
        "Extraneous", plural_type, "will be ommitted.",
    ]
    pass_parts = ["No extraneous", value_type, "found in", source_name + "."]

    self._add_condition(is_extraneous,
                        Passing(" ".join(pass_parts)),
                        Err(" ".join(fail_parts)))
def must_contain(self, given, required, passing_msg="", fail=Err):
    """Add a check that `given` contains every item in `required`.

    Each item of `required` must appear at least once in `given` for the
    check to pass. Missing items are counted, expressed as a share of
    `required`, and listed in the failure message.

    Args:
        given (Pandas Series): the items representing input given
        required (Pandas Series): the items required to be in `given`
        passing_msg (str): Message to return if all items in `required`
            are listed in `given`. Defaults to "".
        fail (VCheck): the outcome if the check fails. Default is Err.

    Returns:
        None

    Examples:
        >>> v = Validation()
        >>> v.must_contain(pd.Series(["a","b","c"], name="example input"),
        ...     pd.Series(["a","b"], name="example requirement(s)"),
        ...     "all included")
        >>> v.must_contain(pd.Series(["a","b","c"], name="example input"),
        ...     pd.Series(["a","b","d"], name="example requirement(s)"))
        >>> v.report(verbose=4)
        Validating . . .
        <BLANKLINE>
         CHECKS PASSED
        [X] all included
        <BLANKLINE>
         ERRORS
        [!] 1 (33.3%) example requirement(s) ('d') were not found in
        example input. Their values will be NA.
    """
    # True marks an item of `required` that is absent from `given`
    comparison = ~required.isin(given)
    missing_count = comparison.sum()
    # An empty `required` has size 0; guard so the share is 0% rather than
    # a 0/0 division (RuntimeWarning and a "nan%" message).
    share = missing_count / comparison.size if comparison.size else 0.0
    percentage = '{0:.1%}'.format(share)
    fail_msg = " ".join([
        str(missing_count), '(' + percentage + ')',
        str(required.name),
        report_list(required[comparison]), "were not found in",
        str(given.name) + ".", "Their values will be NA."
    ])
    self._add_condition(comparison, Passing(passing_msg), fail(fail_msg))
def all_valid(self, given, valid, definition):
    """Add a check that every value in `given` is one of `valid`.

    Any value of `given` not found in `valid` is counted and listed in
    the failure message. The failing outcome is registered as `Err`
    (fails validation).

    Args:
        given (Pandas Series): the items representing input given
        valid (Pandas Series): list of all possible valid items accepted
            in `given`
        definition (str): string describing what makes an item in `given`
            be in `valid`

    Returns:
        None

    Examples:
        >>> v = Validation()
        >>> v.all_valid(pd.Series(["a","b"], name="example input"),
        ...     pd.Series(["a","b","c"], name="valid value(s)"),
        ...     "pre-defined")
        >>> v.all_valid(pd.Series(["a","b","c"], name="example input"),
        ...     pd.Series(["a","d"], name="valid value(s)"), "'a' or 'd'")
        >>> v.report(verbose=4)
        Validating . . .
        <BLANKLINE>
         CHECKS PASSED
        [X] All values in example input are valid.
        <BLANKLINE>
         ERRORS
        [!] 2 values in example input were invalid ('b', and 'c'). These
        must be 'a' or 'd' to be valid.
    """
    # True marks an entry of `given` that does not occur in `valid`
    is_invalid = ~given.isin(valid)
    column_label = str(given.name)

    pass_parts = ["All values in", column_label, "are valid."]

    invalid_count = str(is_invalid.sum())
    invalid_listing = report_list(given[is_invalid]) + "."
    fail_parts = [
        invalid_count, "values in", column_label, "were invalid",
        invalid_listing, "These must be", definition, "to be valid.",
    ]

    self._add_condition(is_invalid,
                        Passing(" ".join(pass_parts)),
                        Err(" ".join(fail_parts)))
def describe(self):
    """Print a summary of the mapping relationships to console.

    For each of five columns of `self.config_data`, one line is printed
    with the number of distinct non-NA values, a short description, and a
    listing of those values limited to five entries.

    Args:
        None

    Returns:
        None

    Examples:
        >>> MAP_PATH = "resources/mapping_configuration_files/"
        >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
        >>> Configuration(EX_MAP_1).describe()
        MAPPING STATS
        <BLANKLINE>
         - 16 new columns produced ('AB_POSIT', 'AB_SIZE', 'AC_BRL',
         'AC_CONV', 'AC_COUGH', etc)
         - 12 source columns required ('Id10403', 'Id10362', 'Id10169',
         'Id10221', 'Id10154', etc)
         - 7 relationships invoked ('eq', 'lt', 'between', 'ge',
         'contains', etc)
         - 13 conditions listed ('yes', '14', '10', '21', '15 to 49', etc)
         - 1 prerequisites checked ('FEMALE')
    """
    print("MAPPING STATS\n")
    bullet = " - \t"
    # column of the configuration -> phrase describing its distinct values
    summaries = {
        "New Column Name": "new columns produced",
        "Source Column ID": "source columns required",
        "Relationship": "relationships invoked",
        "Condition": "conditions listed",
        "Prerequisite": "prerequisites checked",
    }
    for column, phrase in summaries.items():
        distinct_count = self.config_data[column].nunique()
        distinct_values = self.config_data[column].dropna().unique()
        print(bullet, distinct_count, phrase,
              report_list(distinct_values, limit=5))
def flag_elements(self, flag_where, flag_elements, criteria):
    """Add a check on `flag_where` that reports the affected elements.

    The number of True values in `flag_where` is counted, and the entries
    of `flag_elements` selected by `flag_where` are listed in the failure
    message. The failing outcome is registered as `Warn`.

    Args:
        flag_where (Pandas Series): a boolean Pandas Series where True
            represents a failed check
        flag_elements (Pandas Series): a Pandas Series listing the
            elements that are affected by True values in `flag_where`
        criteria (String): a brief description of what elements are being
            flagged and reported on

    Returns:
        None

    Examples:
        >>> v = Validation("element test")
        >>> v.flag_elements(pd.Series([False, False]),
        ...     pd.Series(["A", "B"]), "red flag(s)")
        >>> v.flag_elements(pd.Series([False, True]),
        ...     pd.Series(["A", "B"]), "blue flag(s)")
        >>> v.report(verbose=4)
        Validating element test . . .
        <BLANKLINE>
         CHECKS PASSED
        [X] No red flag(s) in element test detected.
        <BLANKLINE>
         WARNINGS
        [?] 1 blue flag(s) in element test detected. These ('B') will be
        treated as NA.
    """
    passing_msg = "No {} in {} detected.".format(criteria, self.name)

    flagged_count = str(flag_where.sum())
    fail_parts = [flagged_count, criteria, "in", self.name, "detected."]
    fail_parts.append("These")
    fail_parts.append(report_list(flag_elements[flag_where]))
    fail_parts.append("will be treated as NA.")

    self._add_condition(flag_where,
                        Passing(passing_msg),
                        Warn(" ".join(fail_parts)))
def validate(self, verbose=None):
    """Prepares and validates the Configuration object's mapping conditions.

    Validation fails if there are any inoperable errors. Problems that can
    be fixed in place are processed and flagged as warnings. Note that this
    method reassigns `self.config_data` several times as rows and columns
    are cleaned, dropped, or reindexed.

    Args:
        verbose (int): controls print output, should be in range 0-5,
            each higher level includes the messages of each level below
            it. Where verbose = 0, nothing will be printed to console.
            Where verbose = 1, print only errors to console,
            where verbose = 2, also print warnings,
            where verbose = 3, also print suggestions and status checks,
            where verbose = 4, also print passing validation checks,
            where verbose = 5, also print description of configuration
            conditions.
            Defaults to None; if none, replace with self.verbose attribute

    Returns:
        Boolean: boolean representing whether there are any errors that
            prevent validation

    Examples:
        >>> MAP_PATH = "resources/mapping_configuration_files/"
        >>> EX_MAP_2 = pd.read_csv(MAP_PATH + "example_config_2.csv")
        >>> c = Configuration(EX_MAP_2)
        >>> c.validate(verbose=4)
        Validating Mapping Configuration . . .
        <BLANKLINE>
         CHECKS PASSED
        [X] All expected columns ('New Column Name',
        'New Column Documentation', 'Source Column ID',
        'Source Column Documentation', 'Relationship', 'Condition', and
        'Prerequisite') accounted for in configuration file.
        [X] No leading/trailing spaces column New Column Name detected.
        [X] No leading/trailing spaces column Relationship detected.
        [X] No leading/trailing spaces column Prerequisite detected.
        [X] No leading/trailing spaces column Condition detected.
        [X] No whitespace in column Condition detected.
        [X] No upper case value(s) in column Relationship detected.
        [X] No upper case value(s) in column Condition detected.
        [X] No non-alphanumeric value(s) in column Source Column ID
        detected.
        [X] No non-alphanumeric value(s) in column Relationship detected.
        [X] No non-alphanumeric value(s) in column Condition detected.
        [X] No new column(s) listed but not defined in Mapping
        Configuration detected.
        [X] No NA's in column New Column Name detected.
        [X] No NA's in column Source Column ID detected.
        <BLANKLINE>
         ERRORS
        [!] 3 values in Relationship column were invalid ('eqqqq',
        'another fake', and 'gee'). These must be a valid method of
        pd.Series, e.g. ('gt', 'ge', 'lt', 'le', 'between', 'eq', 'ne',
        and 'contains') to be valid.
        [!] 2 row(s) containing a numerical relationship with non-number
        condition detected in row(s) #8, and #9.
        [!] 2 values in Prerequisite column were invalid ('ABDOMM', and
        'Placeholder here'). These must be defined in the
        'new column name' column of the config file to be valid.
        <BLANKLINE>
         WARNINGS
        [?] 2 whitespace in column New Column Name detected in row(s) #6,
        and #8. Whitespace will be converted to '_'
        [?] 1 whitespace in column Relationship detected in row(s) #4.
        Whitespace will be converted to '_'
        [?] 1 whitespace in column Prerequisite detected in row(s) #9.
        Whitespace will be converted to '_'
        [?] 1 non-alphanumeric value(s) in column New Column Name detected
        in row(s) #6. This text should be alphanumeric. Non-alphanumeric
        characters will be removed.
        [?] 2 duplicate row(s) detected in row(s) #1, and #14. Duplicates
        will be dropped.
        [?] 1 NA's in column Relationship detected in row(s) #3.
        [?] 1 NA's in column Condition detected in row(s) #6.
        False
    """
    # Fall back to the instance-level verbosity when none is passed
    if verbose is None:
        verbose = self.verbose

    # Check that all expected columns accounted for
    col_passing_msg = " ".join([
        "All expected columns",
        report_list(self.required_columns),
        "accounted for in configuration file."
    ])
    self.validation.must_contain(self.given_columns,
                                 self.required_columns,
                                 passing_msg=col_passing_msg)

    # reindex - any missing columns become filled with NA
    # (columns not listed in `required_columns` are dropped by the reindex)
    self.config_data = self.config_data.reindex(
        columns=self.required_columns)

    # Drop any rows that are entirely blank without warnings
    self.config_data = self.config_data.dropna(how="all")

    # Processing strings
    # columns that should contain no whitespace
    ws_col = ["New Column Name", "Relationship", "Prerequisite"]
    lowercase_col = ["Relationship"]  # columns that should be lowercase
    # "Condition" is only string-processed when `process_strings` is set
    if self.process_strings:
        ws_col.append("Condition")
        lowercase_col.append("Condition")

    # Temporarily fill NAs with the placeholder string "na" so the string
    # operations below can run; the placeholder is reverted afterwards.
    self.config_data.fillna("na", inplace=True)  # fill NAs for str ops
    # Remove whitespace
    # NOTE(review): presumably fix_whitespace returns the cleaned columns
    # and records warnings on self.validation - confirm against Validation
    self.config_data.loc[:, ws_col] = self.validation.fix_whitespace(
        self.config_data.loc[:, ws_col])

    # Check for uppercase characters
    self.config_data.loc[:, lowercase_col] = self.validation.fix_upcase(
        self.config_data.loc[:, lowercase_col])

    # Check that main columns contain only alphanumeric values
    self.config_data.loc[:, self.main_columns] = self.validation.fix_alnum(
        self.config_data.loc[:, self.main_columns])

    # Turn the "na" placeholder back into NaN so missing values can be
    # checked below. Any cell whose value is exactly "na" at this point
    # becomes NaN as well.
    self.config_data = self.config_data.replace("na", np.nan)

    # Check for duplicate rows & drop them
    self.validation.flag_rows(self.config_data.duplicated(),
                              flag_criteria="duplicate row(s)",
                              flag_action="Duplicates will be dropped.")
    self.config_data = self.config_data.drop_duplicates()

    # Check and note if there are missing sources/conditions/rel
    # ie if we expect any of these sources to be absent
    # True where a new column is named but source, relationship and
    # condition are all null
    defined_no_source = (np.all(self.config_data[[
        "Source Column ID", "Relationship", "Condition"
    ]].isnull(), axis=1) & self.config_data["New Column Name"].notnull())
    self.validation.flag_elements(
        defined_no_source,
        self.config_data["New Column Name"],
        criteria="new column(s) listed but not defined")
    # Drop the rows flagged above
    self.config_data = self.config_data.loc[~defined_no_source, :]

    # Check & drop rows that contain any NAs in main columns
    self.validation.check_na(self.config_data[self.main_columns])
    self.config_data = self.config_data.loc[
        np.all(self.config_data[self.main_columns].notnull(), axis=1), :]

    # check all relationships in relationship column are valid
    self.validation.all_valid(
        self.given_relationships, self.valid_relationships,
        "a valid method of pd.Series, e.g. " +
        report_list(self.valid_relationships))

    # check for non-number conditions with numerical relationships
    # (a condition counts as non-number when pd.to_numeric cannot parse it)
    invalid_num = (self.config_data["Relationship"].isin(
        ["gt", "ge", "le", "lt"]) & (pd.to_numeric(
            self.config_data["Condition"], errors="coerce").isnull()))
    self.validation.flag_rows(
        invalid_num,
        flag_criteria="row(s) containing a numerical" +
        " relationship with non-number condition",
        flag_tier=Err)

    # check all prerequisite columns are also defined in configuration
    self.validation.all_valid(
        self.given_prereq, self.new_columns,
        "defined in the 'new column name' column " +
        "of the config file")

    # Print the collected validation results at the requested verbosity
    self.validation.report(verbose=verbose)  # report

    # The highest verbosity level additionally prints the mapping summary
    if verbose == 5:
        self.describe()

    # return true only if there are zero errors
    return self.validation.is_valid()