Exemple #1
0
def report_row(flag_where):
    """Return an English explanation of which rows failed a validation check.

    Args:
        flag_where (Pandas Series): boolean Pandas Series where True marks a
            failed validation check. Missing values are treated as False.

    Returns:
        str: a string reporting the index labels of the flagged rows. When
            the index is numeric, quoted all-digit labels produced by
            `report_list` are rewritten in the form `#<label>`.

    Examples:
        >>> report_row(pd.Series([True, True, False, True, False]))
        '#0, #1, and #3'

    """
    flagged = flag_where[flag_where.fillna(False)]
    # Build the listing once; both outcomes below start from the same text.
    report = report_list(flagged.index.tolist(), paren=False)
    # `Index.is_numeric()` is deprecated since pandas 2.0. It was defined as
    # this same `inferred_type` membership test, so the check is equivalent
    # without triggering the deprecation warning.
    if flagged.index.inferred_type in ("integer", "floating"):
        # Turn quoted integers such as '3' into row references such as #3.
        return re.sub(r"'(\d+)'", r"#\1", report)
    return report
Exemple #2
0
    def no_extraneous(self, given, relevant, value_type):
        """Register a check that every value in `given` also occurs in
        `relevant`. A failing check is recorded as `Err`.

        Args:
            given (Pandas Series): the items representing input given
            relevant (Pandas Series): all items in `given` that will be used
            value_type (str): noun describing the kind of item listed in
                `given`; used when building the report messages

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.no_extraneous(pd.Series(["a","b"], name="example input"),  pd.Series(["a","b","c"],  name="relevant value(s)"), "example")
            >>> v.no_extraneous(pd.Series(["a","b","c"], name="example input"),  pd.Series(["a","d"],  name="relevant value(s)"), "example")
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
            CHECKS PASSED
            [X]      No extraneous example found in example input.
            <BLANKLINE>
            ERRORS
            [!]      2 extraneous example(s) found in example input
            ('b', and 'c') Extraneous example(s) will be ommitted.
        """
        # True (i.e. a failure) wherever an entry of `given` is absent from
        # `relevant`.
        is_extraneous = ~given.isin(relevant)

        plural_noun = value_type + "(s)"
        source_name = str(given.name)
        extraneous_count = str(is_extraneous.sum())
        # Only the first few offenders are listed in the message.
        offenders = report_list(given[is_extraneous], limit=5)

        fail_msg = " ".join((
            extraneous_count, "extraneous", plural_noun,
            "found in", source_name,
            offenders,
            "Extraneous", plural_noun, "will be ommitted.",
        ))
        passing_msg = " ".join(
            ("No extraneous", value_type, "found in", source_name + "."))

        self._add_condition(is_extraneous, Passing(passing_msg),
                            Err(fail_msg))
Exemple #3
0
    def must_contain(self, given, required, passing_msg="", fail=Err):
        """Register a check that every item in `required` occurs at least
        once in `given`. A failing check is recorded using `fail`.

        Args:
            given (Pandas Series): the items representing input given
            required (Pandas Series): the items required to be in `given`
            passing_msg (str): Message to report if all items in `required`
                are listed in `given`. Defaults to "".
            fail (VCheck): the outcome if the check fails. Default is Err.

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.must_contain(pd.Series(["a","b","c"], name="example input"),  pd.Series(["a","b"],  name="example requirement(s)"),  "all included")
            >>> v.must_contain(pd.Series(["a","b","c"], name="example input"),  pd.Series(["a","b","d"],  name="example requirement(s)"))
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]          all included
            <BLANKLINE>
             ERRORS
             [!]          1 (33.3%) example requirement(s) ('d') were not found in example input. Their values will be NA.
        """
        # True (i.e. a failure) for each required item that never shows up
        # in `given`.
        is_missing = ~required.isin(given)
        missing_count = is_missing.sum()
        # Share of the required items that are missing, e.g. "(33.3%)".
        missing_share = f"({missing_count / is_missing.size:.1%})"

        fail_msg = " ".join((
            str(missing_count),
            missing_share,
            str(required.name),
            report_list(required[is_missing]),
            "were not found in",
            str(given.name) + ".",
            "Their values will be NA.",
        ))
        self._add_condition(is_missing, Passing(passing_msg), fail(fail_msg))
Exemple #4
0
    def all_valid(self, given, valid, definition):
        """Register a check that every value in `given` is one of the values
        in `valid`. A failing check is recorded as `Err`.

        Args:
            given (Pandas Series): the items representing input given
            valid (Pandas Series): all possible valid items accepted in
                `given`
            definition (str): phrase describing what makes an item in
                `given` valid; inserted into the failure message

        Returns:
            None

        Examples:
            >>> v = Validation()
            >>> v.all_valid(pd.Series(["a","b"], name="example input"),  pd.Series(["a","b","c"],  name="valid value(s)"), "pre-defined")
            >>> v.all_valid(pd.Series(["a","b","c"], name="example input"),  pd.Series(["a","d"],  name="valid value(s)"), "'a' or 'd'")
            >>> v.report(verbose=4)
            Validating  . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]          All values in example input are valid.
            <BLANKLINE>
             ERRORS
            [!]          2 values in example input were invalid ('b', and 'c').
            These must be 'a' or 'd' to be valid.
        """
        # True (i.e. a failure) wherever an entry of `given` is not found in
        # `valid`.
        is_invalid = ~given.isin(valid)
        source_name = str(given.name)

        passing_msg = " ".join(("All values in", source_name, "are valid."))

        invalid_count = str(is_invalid.sum())
        offenders = report_list(given[is_invalid]) + "."
        fail_msg = " ".join((
            invalid_count, "values in",
            source_name, "were invalid",
            offenders, "These must be",
            definition, "to be valid.",
        ))

        self._add_condition(is_invalid, Passing(passing_msg), Err(fail_msg))
Exemple #5
0
    def describe(self):
        """Print summary statistics about the mapping held in
        `self.config_data` to console.

        For each of a fixed set of configuration columns, one line is printed
        with the number of distinct non-missing values and a short listing of
        those values.

        Args:
            None

        Returns:
            None

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_1 = pd.read_csv(MAP_PATH + "example_config_1.csv")
            >>> Configuration(EX_MAP_1).describe()
            MAPPING STATS
            <BLANKLINE>
             -   16 new columns produced ('AB_POSIT', 'AB_SIZE', 'AC_BRL', 'AC_CONV', 'AC_COUGH', etc)
             -   12 source columns required ('Id10403', 'Id10362', 'Id10169', 'Id10221', 'Id10154', etc)
             -   7 relationships invoked ('eq', 'lt', 'between', 'ge', 'contains', etc)
             -   13 conditions listed ('yes', '14', '10', '21', '15 to 49', etc)
             -   1 prerequisites checked ('FEMALE')

        """
        print("MAPPING STATS\n")
        bullet = " - \t"

        # (configuration column, phrase describing what its values represent)
        summaries = (
            ("New Column Name", "new columns produced"),
            ("Source Column ID", "source columns required"),
            ("Relationship", "relationships invoked"),
            ("Condition", "conditions listed"),
            ("Prerequisite", "prerequisites checked"),
        )

        for column, phrase in summaries:
            values = self.config_data[column]
            distinct_count = values.nunique()
            # List at most five of the distinct non-missing values.
            examples = report_list(values.dropna().unique(), limit=5)
            print(bullet, distinct_count, phrase, examples)
Exemple #6
0
    def flag_elements(self, flag_where, flag_elements, criteria):
        """Register a check that no value in `flag_where` is True, reporting
        the corresponding items of `flag_elements` when some are. A failing
        check is recorded as `Warn`.

        Args:
            flag_where (Pandas Series): a boolean Pandas Series where True
                represents a failed check
            flag_elements (Pandas Series): a Pandas Series of the elements
                affected by True values in `flag_where`
            criteria (String): a brief description of what elements are
                being flagged and reported on

        Returns:
            None

        Examples:
            >>> v = Validation("element test")
            >>> v.flag_elements(pd.Series([False, False]),  pd.Series(["A", "B"]), "red flag(s)")
            >>> v.flag_elements(pd.Series([False, True]),  pd.Series(["A", "B"]), "blue flag(s)")
            >>> v.report(verbose=4)
            Validating element test . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]          No red flag(s) in element test detected.
            <BLANKLINE>
             WARNINGS
            [?]          1 blue flag(s) in element test detected. These ('B') will be treated as NA.
        """
        passing_msg = f"No {criteria} in {self.name} detected."

        flagged_total = str(flag_where.sum())
        # Listing of the elements sitting at the flagged positions.
        affected = report_list(flag_elements[flag_where])
        fail_msg = " ".join((
            flagged_total, criteria, "in", self.name, "detected.",
            "These", affected, "will be treated as NA.",
        ))

        self._add_condition(flag_where, Passing(passing_msg), Warn(fail_msg))
Exemple #7
0
    def validate(self, verbose=None):
        """Prepares and validates the Configuration object's mapping conditions.
        Validation fails if there are any inoperable errors. Problems that can
        be fixed in place are processed and flagged as warnings.

        Note that this method rewrites `self.config_data` as it goes: columns
        are reindexed to `self.required_columns`, string columns are cleaned,
        and blank, duplicate and incomplete rows are dropped.

        Args:
            verbose (int): controls print output, should be in range 0-5,
                each higher level includes the messages of each level below it.
                Where verbose = 0, nothing will be printed to console.
                Where verbose = 1, print only errors to console,
                where verbose = 2, also print warnings,
                where verbose = 3, also print suggestions and status checks,
                where verbose = 4, also print passing validation checks,
                where verbose = 5, also print description of configuration
                conditions.
                Defaults to None; if none, replace with self.verbose attribute
        Returns:
            Boolean: the result of `self.validation.is_valid()`; as the
                example below shows, this is False when errors were
                reported.

        Examples:
            >>> MAP_PATH = "resources/mapping_configuration_files/"
            >>> EX_MAP_2 = pd.read_csv(MAP_PATH + "example_config_2.csv")
            >>> c = Configuration(EX_MAP_2)
            >>> c.validate(verbose=4)
            Validating Mapping Configuration . . .
            <BLANKLINE>
             CHECKS PASSED
            [X]          All expected columns ('New Column Name', 'New Column Documentation', 'Source Column ID', 'Source Column Documentation', 'Relationship', 'Condition', and 'Prerequisite') accounted for in configuration file.
            [X]          No leading/trailing spaces column New Column Name detected.
            [X]          No leading/trailing spaces column Relationship detected.
            [X]          No leading/trailing spaces column Prerequisite detected.
            [X]          No leading/trailing spaces column Condition detected.
            [X]          No whitespace in column Condition detected.
            [X]          No upper case value(s) in column Relationship detected.
            [X]          No upper case value(s) in column Condition detected.
            [X]          No non-alphanumeric value(s) in column Source Column ID detected.
            [X]          No non-alphanumeric value(s) in column Relationship detected.
            [X]          No non-alphanumeric value(s) in column Condition detected.
            [X]          No new column(s) listed but not defined in Mapping Configuration detected.
            [X]          No NA's in column New Column Name detected.
            [X]          No NA's in column Source Column ID detected.
            <BLANKLINE>
             ERRORS
            [!]          3 values in Relationship column were invalid ('eqqqq', 'another fake', and 'gee'). These must be a valid method of pd.Series, e.g. ('gt', 'ge', 'lt', 'le', 'between', 'eq', 'ne', and 'contains') to be valid.
            [!]          2 row(s) containing a numerical relationship with non-number condition detected in row(s) #8, and #9.
            [!]          2 values in Prerequisite column were invalid ('ABDOMM', and 'Placeholder here'). These must be defined in the 'new column name' column of the config file to be valid.
            <BLANKLINE>
             WARNINGS
            [?]          2 whitespace in column New Column Name detected in row(s) #6, and #8. Whitespace will be converted to '_'
            [?]          1 whitespace in column Relationship detected in row(s) #4. Whitespace will be converted to '_'
            [?]          1 whitespace in column Prerequisite detected in row(s) #9. Whitespace will be converted to '_'
            [?]          1 non-alphanumeric value(s) in column New Column Name detected in row(s) #6. This text should be alphanumeric. Non-alphanumeric characters will be removed.
            [?]          2 duplicate row(s) detected in row(s) #1, and #14. Duplicates will be dropped.
            [?]          1 NA's in column Relationship detected in row(s) #3.
            [?]          1 NA's in column Condition detected in row(s) #6.
            False
        """
        # Fall back to the instance-level verbosity when none is supplied
        if verbose is None:
            verbose = self.verbose

        # Check that all expected columns accounted for
        col_passing_msg = " ".join([
            "All expected columns",
            report_list(self.required_columns),
            "accounted for in configuration file."
        ])
        self.validation.must_contain(self.given_columns,
                                     self.required_columns,
                                     passing_msg=col_passing_msg)

        # reindex - any missing columns become filled with NA, and only the
        # required columns are kept
        self.config_data = self.config_data.reindex(
            columns=self.required_columns)

        # Drop any rows that are entirely blank without warnings
        self.config_data = self.config_data.dropna(how="all")

        # Processing strings
        # columns that should contain no whitespace
        ws_col = ["New Column Name", "Relationship", "Prerequisite"]
        lowercase_col = ["Relationship"]  # columns that should be lowercase
        # "Condition" is only cleaned when string processing is enabled
        if self.process_strings:
            ws_col.append("Condition")
            lowercase_col.append("Condition")
        # Temporarily replace NaN with the placeholder "na" so the string
        # operations below work on every cell
        self.config_data.fillna("na", inplace=True)  # fill NAs for str ops

        # Remove whitespace
        # NOTE(review): fix_whitespace / fix_upcase / fix_alnum are defined
        # elsewhere; they presumably record the related checks and return
        # the cleaned columns -- confirm against the Validation class.
        self.config_data.loc[:, ws_col] = self.validation.fix_whitespace(
            self.config_data.loc[:, ws_col])

        # Check for uppercase characters
        self.config_data.loc[:, lowercase_col] = self.validation.fix_upcase(
            self.config_data.loc[:, lowercase_col])

        # Check that main columns contain only alphanumeric values
        self.config_data.loc[:, self.main_columns] = self.validation.fix_alnum(
            self.config_data.loc[:, self.main_columns])

        # Turn the "na" placeholder back into NaN now that string processing
        # is done.
        # NOTE(review): any cell whose value is exactly "na" at this point is
        # converted to NaN too, not only the cells filled in above.
        self.config_data = self.config_data.replace("na", np.nan)

        # Check for duplicate rows & drop them
        self.validation.flag_rows(self.config_data.duplicated(),
                                  flag_criteria="duplicate row(s)",
                                  flag_action="Duplicates will be dropped.")
        self.config_data = self.config_data.drop_duplicates()

        # Check and note if there are missing sources/conditions/rel
        # ie if we expect any of these sources to be absent.
        # A row is flagged when it names a new column but its source,
        # relationship and condition are all missing.
        defined_no_source = (np.all(self.config_data[[
            "Source Column ID", "Relationship", "Condition"
        ]].isnull(),
                                    axis=1)
                             & self.config_data["New Column Name"].notnull())

        self.validation.flag_elements(
            defined_no_source,
            self.config_data["New Column Name"],
            criteria="new column(s) listed but not defined")

        # Drop the flagged rows so they are not reported again as NA rows
        self.config_data = self.config_data.loc[~defined_no_source, :]

        # Check & drop rows that contain any NAs in main columns
        self.validation.check_na(self.config_data[self.main_columns])
        self.config_data = self.config_data.loc[
            np.all(self.config_data[self.main_columns].notnull(), axis=1), :]

        # check all relationships in relationship column are valid
        self.validation.all_valid(
            self.given_relationships, self.valid_relationships,
            "a valid method of pd.Series, e.g. " +
            report_list(self.valid_relationships))

        # check for non-number conditions with numerical relationships:
        # a row fails when its relationship is one of gt/ge/le/lt and its
        # condition cannot be parsed as a number (coerced to NaN)
        invalid_num = (self.config_data["Relationship"].isin(
            ["gt", "ge", "le", "lt"]) & (pd.to_numeric(
                self.config_data["Condition"], errors="coerce").isnull()))
        self.validation.flag_rows(
            invalid_num,
            flag_criteria="row(s) containing a numerical" +
            " relationship with non-number condition",
            flag_tier=Err)

        # check all prerequisite columns are also defined in configuration
        self.validation.all_valid(
            self.given_prereq, self.new_columns,
            "defined in the 'new column name' column " + "of the config file")

        self.validation.report(verbose=verbose)  # report
        # The mapping description is printed only at exactly verbose == 5
        if verbose == 5:
            self.describe()

        # return true only if there are zero errors
        return self.validation.is_valid()