def html_similarity(html_arg: html_arg_dict) -> BadResult:
    """Compare the prettified NG and legacy HTML documents.

    Both ``html_arg['ng_html']`` and ``html_arg['legacy_html']`` are
    rendered with ``prettify()`` and scored with ``lev_similarity``.

    Returns a ``BadResult`` when the score is below 0.69, otherwise ``None``.
    """
    ng_pretty = html_arg['ng_html'].prettify()
    legacy_pretty = html_arg['legacy_html'].prettify()
    sim = lev_similarity(ng_pretty, legacy_pretty)

    # Keep the strict "<" comparison so a score exactly at the
    # threshold still counts as acceptable.
    too_different = sim < 0.69
    if not too_different:
        return None

    paper = html_arg['id']
    return BadResult(paper, "html_similarity",
                     f"html_pretty_sim for {paper} = {sim}")
def text_similarity(text_arg: text_arg_dict) -> BadResult:
    """Score the NG text against the legacy text with ``lev_similarity``.

    Returns a ``BadResult`` named ``'text_similarity'`` when the score is
    below 0.74, otherwise ``None``.
    """
    score = lev_similarity(text_arg['ng_text'], text_arg['legacy_text'])

    if score < 0.74:
        paper = text_arg['paper_id']
        return BadResult(paper, 'text_similarity',
                         f"text_sim for {paper} = {score}")

    # Similar enough: nothing to report.
    return None
def compare_status(res_arg: res_arg_dict) -> BadResult:
    """Check that both the NG and the legacy response have status 200.

    Returns ``None`` when both status codes are 200; otherwise returns a
    ``BadResult`` named ``'compare_status'`` whose message lists each URL
    together with the status code it produced.
    """
    if (res_arg['ng_res'].status_code != 200
            or res_arg['legacy_res'].status_code != 200):
        # Build the report in two halves, NG first and then legacy.
        ng_part = (f'HTTP status for {res_arg["ng_url"]} '
                   f'was {res_arg["ng_res"].status_code} ')
        legacy_part = (f'and for {res_arg["legacy_url"]} '
                       f'was {res_arg["legacy_res"].status_code}')
        return BadResult(res_arg['paper_id'], 'compare_status',
                         ng_part + legacy_part)

    return None
def metadata_fields_similarity(html_arg: html_arg_dict) -> BadResult:
    """
    Determine similarity of metadata fields.

    There should be a div.metatable. It may have others and if it does
    they must be on both NG and legacy pages.

    The label of a row is the stripped text of its ``td.label`` cell; rows
    without such a cell are ignored.

    Returns ``None`` when the set of labels on the NG page equals the set
    on the legacy page. Returns a ``BadResult`` when the sets differ or
    when either page has no ``div.metatable``.
    """
    comparison = 'metadata_fields_similarity'

    def labels_of(soup):
        """Return the set of row labels in div.metatable, or None if absent."""
        table = soup.find('div', 'metatable')
        if table is None:
            return None
        labels = set()
        # find_all() gives a plain list-like result, so iterate directly.
        for row in table.find_all('tr'):
            cell = row.find('td', 'label')
            if cell is not None:
                # Text is hashable; the raw ``.contents`` list is not.
                labels.add(cell.get_text(strip=True))
        return labels

    ng_labels = labels_of(html_arg['ng_html'])
    legacy_labels = labels_of(html_arg['legacy_html'])

    if ng_labels is None or legacy_labels is None:
        missing = [side for side, found in (('NG', ng_labels),
                                            ('legacy', legacy_labels))
                   if found is None]
        return BadResult(
            html_arg['id'], comparison,
            f"Missing div.metatable for {html_arg['id']} from "
            + ' and '.join(missing))

    if ng_labels == legacy_labels:
        return None

    # Sort so the message is stable from run to run.
    return BadResult(
        html_arg['id'], comparison,
        "Metadata field included on NG do not match those from legacy. "
        f"NG: {sorted(ng_labels)} Legacy: {sorted(legacy_labels)}")
def protected(res_args: Dict) -> BadResult:
    """Run ``fn`` on ``res_args`` and turn any exception into a ``BadResult``.

    ``fn`` is not a parameter; it is taken from the surrounding scope.
    On success the value returned by ``fn`` is passed through unchanged.
    On any ``Exception`` a ``BadResult`` is returned for
    ``res_args['id']`` with the comparison name ``"name unknown"`` and the
    formatted traceback as its message.
    """
    # noinspection PyBroadException
    try:
        outcome = fn(res_args)
    except Exception:
        trace = traceback.format_exc()
        return BadResult(res_args['id'], "name unknown", trace)
    else:
        return outcome
def _element_similarity(name: str,
                        get_element: Callable[[BeautifulSoup], BeautifulSoup],
                        min_sim: float,
                        required: bool,
                        check_counts: bool,
                        text_trans: Callable[[str], str],
                        html_arg: html_arg_dict) -> BadResult:
    """
    Perform element similarity.

    Uses get_element to select elements of the BS doc on both NG and
    Legacy, then scores the first element of each against the other.

    name: comparison name recorded in every returned BadResult.
    get_element: selector applied to ``html_arg['legacy_html']`` and
        ``html_arg['ng_html']``; its result must support ``len()`` and
        indexing with ``[0]``.
    min_sim: minimum acceptable ``lev_similarity`` score.
    required: element must be in both NG and Legacy.
    check_counts: counts of elements must be the same in both NG and
        Legacy, could be 0.
    text_trans: applied to each prettified first element before scoring.

    Returns ``None`` only when the element is not required and is absent
    from at least one side. In every other case a ``BadResult`` is
    returned; a passing comparison is reported with a message starting
    with "GOOD:".
    """
    legacy = get_element(html_arg['legacy_html'])
    ng = get_element(html_arg['ng_html'])
    ng_count = len(ng)
    legacy_count = len(legacy)
    both_present = ng_count > 0 and legacy_count > 0

    if required and not both_present:
        if ng_count == 0 and legacy_count == 0:
            absent_from = 'NG and Legacy'
        elif ng_count == 0:
            absent_from = 'NG'
        else:
            absent_from = 'legacy'
        return BadResult(
            html_arg['id'], name,
            f"Missing field {name} for {html_arg['id']} from {absent_from}")

    if check_counts and legacy_count != ng_count:
        # Report the raw first element from each side for context.
        ng_ele_txt = ng[0].prettify() if ng_count else 'MISSING'
        legacy_ele_txt = legacy[0].prettify() if legacy_count else 'MISSING'
        return BadResult(
            html_arg['id'], name,
            f"bad counts for {name} for {html_arg['id']} "
            f"ng: {ng_count} legacy: {legacy_count}",
            legacy_ele_txt, ng_ele_txt)

    if not both_present:
        # Only reachable when the element is optional: the ``required``
        # guard above has already returned for every missing-element case.
        return None

    ng_ele_txt = text_trans(ng[0].prettify())
    legacy_ele_txt = text_trans(legacy[0].prettify())
    sim = lev_similarity(ng_ele_txt, legacy_ele_txt)

    if sim < min_sim:
        msg = f"Elements did not meet min similarity of {min_sim}"
        return BadResult(html_arg['id'], name, msg,
                         legacy_ele_txt, ng_ele_txt, sim)

    msg = f"GOOD: Elements did meet min similarity of {min_sim}"
    return BadResult(html_arg['id'], name, msg, '', '', sim)
def call_it(fn: Callable[[html_arg_dict], BadResult]) -> BadResult:
    """Apply one comparison function to ``html_args``.

    ``html_args`` is not a parameter; it is taken from the surrounding
    scope. The value returned by ``fn`` is passed through unchanged. If
    ``fn`` raises any ``Exception``, a ``BadResult`` named
    ``'run_compare_html'`` is returned for ``html_args['paper_id']`` with
    the formatted traceback as its message.
    """
    # noinspection PyBroadException
    try:
        outcome = fn(html_args)
    except Exception:
        trace = traceback.format_exc()
        return BadResult(html_args['paper_id'], 'run_compare_html', trace)
    else:
        return outcome
def call_it(fn: Callable[[text_arg_dict], BadResult]) -> BadResult:
    """Apply one comparison function to ``text_dict``.

    ``text_dict`` and ``res_args`` are not parameters; both are taken
    from the surrounding scope. The value returned by ``fn`` is passed
    through unchanged. If ``fn`` raises any ``Exception``, a ``BadResult``
    named ``'run_compare_response'`` is returned for
    ``res_args['paper_id']`` with the formatted traceback as its message.
    """
    # noinspection PyBroadException
    try:
        outcome = fn(text_dict)
    except Exception:
        trace = traceback.format_exc()
        return BadResult(res_args['paper_id'], 'run_compare_response', trace)
    else:
        return outcome