def test_text_regex(name: str,
                    regex_text: str,
                    test_expected_list: List[Tuple[str, List[str]]],
                    verbose: bool = False) -> None:
    """
    Check that a regex finds exactly the expected hits in sample strings.

    Args:
        name:
            label for the regex; used only in log and failure messages
        regex_text:
            regex source text; compiled here via ``compile_regex``
        test_expected_list:
            list of ``(teststring, expected_results)`` tuples, where
            ``expected_results`` is the list of hits that the regex should
            produce when applied to ``teststring``
        verbose:
            if true, also log the regex text at debug level

    Raises:
        AssertionError: if, for any test string, the hits found differ from
            the expected list
    """
    # Only formatted if an assertion actually fails.
    failure_template = (
        "Regex {name}: Expected {expected_values}, got {actual_values}, "
        "when parsing {test_string}. Regex text:\n{regex_text}]"
    )

    log.info(f"Testing regex named {name}")
    pattern = compile_regex(regex_text)
    if verbose:
        log.debug(f"... regex text:\n{regex_text}")

    for sample, wanted in test_expected_list:
        found = get_compiled_regex_results(pattern, sample)
        assert found == wanted, failure_template.format(
            name=name,
            expected_values=wanted,
            actual_values=found,
            test_string=repr(sample),
            regex_text=regex_text,
        )

    log.info("... OK")
def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             regex_str: str,
             variable: str,
             target_unit: str,
             units_to_factor: Dict[typing.Pattern, float],
             take_absolute: bool = False,
             commit: bool = False,
             debug: bool = False) -> None:
    """
    This class operates with compiled regexes having this group format:

    - variable
    - tense_indicator
    - relation
    - value
    - units

    Args:
        nlpdef:
            passed through to the superclass constructor
        cfgsection:
            passed through to the superclass constructor
        regex_str:
            regex text; compiled here via ``compile_regex``, and also
            passed to the superclass as ``regex_str_for_debugging``
        variable:
            passed through to the superclass constructor
        target_unit:
            passed through to the superclass constructor
        units_to_factor:
            dictionary, mapping FROM (compiled regex for units) TO EITHER

            - float [multiple] to multiply those units by, to get the
              preferred unit, OR
            - function taking a text parameter and returning a float value
              in the preferred unit.

            Any units present in the regex but absent from
            ``units_to_factor`` will lead the result to be ignored -- for
            example, allowing you to ignore a relative neutrophil count
            ("neutrophils 2.2%") while detecting absolute neutrophil counts
            ("neutrophils 2.2"), or ignoring "docusate sodium 100mg" but
            detecting "sodium 140 mM".

            The dictionary is passed through ``compile_regex_dict`` before
            being stored.
        take_absolute:
            converts negative values to positive ones. Typical text
            requiring this might look like::

                CRP-4
                CRP-106
                CRP -97
                Blood results for today as follows: Na- 142, K-4.1, ...

            ... occurring in 23 / 8054 for CRP of one test set in our data.
            For many quantities, we know that they cannot be negative, so
            this is just a notation rather than a minus sign. We have to
            account for it, or it'll distort our values. Preferable to
            account for it here rather than later; see manual.
        commit:
            passed through to the superclass constructor
        debug:
            if true, print the class name and regex text to stdout
    """
    # NOTE: the annotation uses typing.Pattern rather than typing.re.Pattern;
    # they are the same object, but the "typing.re" pseudo-submodule was
    # removed in Python 3.13, where evaluating it raises AttributeError.
    super().__init__(nlpdef=nlpdef,
                     cfgsection=cfgsection,
                     variable=variable,
                     target_unit=target_unit,
                     regex_str_for_debugging=regex_str,
                     commit=commit)
    if debug:
        print("Regex for {}: {}".format(type(self).__name__, regex_str))
    self.compiled_regex = compile_regex(regex_str)
    self.units_to_factor = compile_regex_dict(units_to_factor)
    self.take_absolute = take_absolute
def learning_alternative_regex_groups():
    """
    Exploratory demo: print the match object and captured groups obtained
    when a regex with repeated alternative groups (a/b/c/d, then an optional
    "fish") is matched against a handful of sample strings.

    Output goes to stdout; nothing is returned.
    """
    regex_str = r"""
        (
            (?:
                \s*
                (?: (a) | (b) | (c) | (d) )
                \s*
            )*
            ( fish )?
        )
    """
    compiled_regex = compile_regex(regex_str)
    for test_str in ("a", "b", "a c", "d", "e", "a fish", "c c c"):
        # NOTE(review): every part of the pattern is optional, so a match is
        # expected even for "e" -- presumably compile_regex compiles in
        # verbose mode (whitespace in the pattern ignored); confirm.
        m = compiled_regex.match(test_str)
        print("Match: {}; groups: {}".format(m, m.groups()))
def test_text_regex(name: str,
                    regex_text: str,
                    test_expected_list: List[Tuple[str, List[str]]],
                    verbose: bool = False) -> None:
    """
    Check that a regex finds exactly the expected hits in sample strings,
    reporting progress on stdout.

    Args:
        name:
            label for the regex; used only in printed and failure messages
        regex_text:
            regex source text; compiled here via ``compile_regex``
        test_expected_list:
            list of ``(test_string, expected_values)`` tuples; the results
            obtained for each ``test_string`` must equal ``expected_values``
        verbose:
            if true, also print the regex text

    Raises:
        AssertionError: if, for any test string, the results obtained differ
            from the expected list
    """
    # Only formatted if an assertion actually fails.
    mismatch_template = (
        "Regex {name}: Expected {expected_values}, got {actual_values}, "
        "when parsing {test_string}. Regex text:\n{regex_text}]"
    )

    print("Testing regex named {}".format(name))
    pattern = compile_regex(regex_text)
    if verbose:
        print("... regex text:\n{}".format(regex_text))

    for sample, wanted in test_expected_list:
        found = get_compiled_regex_results(pattern, sample)
        assert found == wanted, mismatch_template.format(
            name=name,
            expected_values=wanted,
            actual_values=found,
            test_string=repr(sample),
            regex_text=regex_text,
        )

    print("... OK")
def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             regex_str_list: List[str],
             validated_variable: str,
             commit: bool = False) -> None:
    """
    Set up a validator from a list of regex strings.

    This class operates with compiled regexes having this group format:

    - variable

    Args:
        nlpdef:
            passed to the superclass constructor; may be ``None`` (for
            debugging only), in which case no destination table name is
            read from the config
        cfgsection:
            config section name; passed to the superclass and used to look
            up the ``desttable`` option
        regex_str_list:
            regex strings; each is compiled via ``compile_regex``
        validated_variable:
            name of the variable being validated; this object's own
            variable name (and ``NAME``) becomes ``<name>_validator``
        commit:
            passed to the superclass constructor
    """
    super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)

    # The raw strings are retained for debugging only.
    self.regex_str_list = regex_str_list
    self.compiled_regex_list = list(map(compile_regex, regex_str_list))

    self.variable = "{}_validator".format(validated_variable)
    self.NAME = self.variable

    # nlpdef is only None for debugging!
    self.tablename = (
        nlpdef.opt_str(cfgsection, 'desttable', required=True)
        if nlpdef is not None
        else ''
    )
def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             variable_name: str,  # e.g. "MMSE"
             variable_regex_str: str,  # e.g. regex for MMSE
             expected_denominator: int,
             numerator_text_fieldname: str = "numerator_text",
             numerator_fieldname: str = "numerator",
             denominator_text_fieldname: str = "denominator_text",
             denominator_fieldname: str = "denominator",
             correct_numerator_fieldname: str = None,  # default below
             take_absolute: bool = True,
             commit: bool = False,
             debug: bool = False) -> None:
    """
    Build the "X out of Y" regex for a named variable and store the output
    field names.

    The regex built here has this group format:

    1. variable (thing being measured; from ``variable_regex_str``)
    2. tense indicator (optional)
    3. relation (optional)
    4. numerator
    5. denominator (optional)

    Args:
        nlpdef:
            passed to the superclass constructor; may be ``None`` (for
            debugging only), in which case no destination table name is
            read from the config
        cfgsection:
            config section name; passed to the superclass and used to look
            up the ``desttable`` option
        variable_name:
            name of the variable, e.g. "MMSE"
        variable_regex_str:
            regex text used to find the variable, e.g. a regex for "MMSE";
            becomes group 1 of the regex
        expected_denominator:
            the denominator expected for this variable; asserted to be
            greater than zero
        numerator_text_fieldname:
            stored on the instance under the same name
        numerator_fieldname:
            stored on the instance under the same name
        denominator_text_fieldname:
            stored on the instance under the same name
        denominator_fieldname:
            stored on the instance under the same name
        correct_numerator_fieldname:
            stored on the instance under the same name; if not supplied (or
            empty), defaults to ``out_of_<expected_denominator>``
        take_absolute:
            stored on the instance; presumably means "convert negative
            values to positive ones" -- confirm against the code using it
        commit:
            passed to the superclass constructor
        debug:
            if true, print the class name and the final regex text
    """
    # These attributes are deliberately set BEFORE the superclass
    # constructor is called; the original ordering is preserved.
    self.variable_name = variable_name
    assert expected_denominator > 0
    self.expected_denominator = expected_denominator
    self.numerator_text_fieldname = numerator_text_fieldname
    self.numerator_fieldname = numerator_fieldname
    self.denominator_text_fieldname = denominator_text_fieldname
    self.denominator_fieldname = denominator_fieldname
    self.correct_numerator_fieldname = (
        correct_numerator_fieldname or
        "out_of_{}".format(expected_denominator))
    self.take_absolute = take_absolute

    super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit)
    if nlpdef is None:  # only None for debugging!
        self.tablename = ''
    else:
        self.tablename = nlpdef.opt_str(
            cfgsection, 'desttable', required=True)

    # Each "#" comment inside the regex must end at a line break, or it
    # would swallow the rest of the pattern -- presumably compile_regex
    # compiles in verbose mode; confirm.
    regex_str = r"""
        ( {variable} )                     # 1. group for variable (thing being measured)
        {OPTIONAL_RESULTS_IGNORABLES}
        {SCORE}?                           # optional "score" or similar
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {TENSE_INDICATOR} )?             # 2. optional group for tense indicator
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {RELATION} )?                    # 3. optional group for relation
        {OPTIONAL_RESULTS_IGNORABLES}
        ( {SIGNED_FLOAT} )                 # 4. group for numerator
        (?:                                # optional "/ denominator"
            \s* {OUT_OF_SEPARATOR} \s*
            ( {UNSIGNED_INTEGER} )         # 5. group for denominator
        )?
    """.format(  # noqa
        variable=variable_regex_str,
        OPTIONAL_RESULTS_IGNORABLES=OPTIONAL_RESULTS_IGNORABLES,
        SCORE=SCORE,
        TENSE_INDICATOR=TENSE_INDICATOR,
        RELATION=RELATION,
        SIGNED_FLOAT=SIGNED_FLOAT,
        OUT_OF_SEPARATOR=OUT_OF_SEPARATOR,
        UNSIGNED_INTEGER=UNSIGNED_INTEGER,
    )
    if debug:
        print("Regex for {}: {}".format(type(self).__name__, regex_str))
    self.regex_str = regex_str
    self.compiled_regex = compile_regex(regex_str)