def __eq__(self, o: object) -> bool:
    """Return True when ``o`` is an HOCRNode with functionally equal markup.

    Comparing serialized HTML byte-for-byte is too strict: small markup
    differences can still describe the same tree with the same elements.
    lxml ships an output checker, written to make HTML doctests readable,
    that compares functional equivalency instead. See
    https://lxml.de/lxmlhtml.html#running-html-doctests

    This is not a doctest, but that comparison is what is needed here. The
    checker is ``lxml.doctestcompare.LHTMLOutputChecker`` and it is invoked
    with the ``PARSE_HTML`` option flag.

    The checker treats the following as equivalent, so they compare equal:

    - Different order of attributes
    - Repeated spaces inside a tag
    - Whitespace between tags

    Any object that is not an HOCRNode compares unequal.
    """
    if isinstance(o, HOCRNode):
        # Both nodes are serialized first; the checker re-parses them as HTML.
        return LHTMLOutputChecker().check_output(
            want=lxml.etree.tostring(self),
            got=lxml.etree.tostring(o),
            optionflags=PARSE_HTML,
        )
    return False
def output_difference(self, example, got, optionflags):
    """Describe how ``got`` differs from ``example.want`` after normalizing.

    Every callable in ``self.transformers`` is applied, in order, to both
    the wanted and the actual text before the comparison is delegated to
    ``LHTMLOutputChecker.output_difference``. When the wanted text is blank
    the call is delegated unchanged.

    Lines of the base-class report containing ``'(got:'`` are repeated at
    the end under a "Lines with differences:" heading so they are easy to
    spot in a long report.

    ``example.want`` is replaced with the normalized text only for the
    duration of the base-class call and is always restored afterwards, even
    if that call raises.

    Returns the difference report as a string.
    """
    want = example.want
    if not want.strip():
        return LHTMLOutputChecker.output_difference(
            self, example, got, optionflags)

    original = want
    for transformer in self.transformers:
        want = transformer(want)
        got = transformer(got)

    # The base class reads the wanted text from the example object, so the
    # normalized value has to be swapped in temporarily.
    example.want = want
    try:
        result = LHTMLOutputChecker.output_difference(
            self, example, got, optionflags)
    finally:
        example.want = original

    # Repeat the lines carrying a diff; otherwise they drown in the report.
    difflines = [line for line in result.splitlines() if '(got:' in line]
    if difflines:
        result += '\nLines with differences:\n' + '\n'.join(difflines)
    return result
class HTML(object):
    """
    A class wrapping HTML for better comparison and nicer error reporting.

    Equality is decided by an ``LHTMLOutputChecker`` run with the doctest
    ``NORMALIZE_WHITESPACE`` and ``ELLIPSIS`` flags. The first time a
    comparison gives an unexpected result, the checker's difference report
    is printed; later surprises on the same object stay silent.
    """

    def __init__(self, text):
        """Wrap ``text`` and prepare the checker used for comparisons."""
        self.text = text
        self.example = doctest.Example('', self.text)
        self.checker = LHTMLOutputChecker()
        self.flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
        self.print_diff = True

    def compare(self, other, expect_eq):
        """Return whether ``other`` matches this HTML.

        ``other`` may be another ``HTML`` wrapper or raw text.
        ``expect_eq`` is the outcome the caller hopes for; if the real
        outcome differs, the difference report is printed (once only).
        """
        text = other.text if isinstance(other, HTML) else other
        eq = self.checker.check_output(self.text, text, self.flags)
        if self.print_diff and eq != expect_eq:
            # A parenthesised single argument prints the same thing whether
            # ``print`` is a Python 2 statement or a Python 3 function.
            print(self.checker.output_difference(
                self.example, text, self.flags))
            # Only output diff once per HTML object.
            self.print_diff = False
        return eq

    def __eq__(self, other):
        return self.compare(other, True)

    def __ne__(self, other):
        # ``compare`` reports equality, so it must be negated here;
        # returning it unchanged made ``a != b`` true for matching HTML.
        return not self.compare(other, False)

    def __str__(self):
        return str(self.text)

    def __unicode__(self):
        # Python 2 hook only; Python 3 never calls ``__unicode__``.
        return unicode(self.text)
def assertHTML(self, want, got):
    """Assert the want and the got are equal HTML strings.

    Uses lxml's LHTMLOutputChecker class, which handles minor differences
    in HTML documents, like differences in whitespace that don't affect
    the equality of the HTML.

    ``got`` is converted to text first when it is not already a string.
    On a mismatch both documents are printed and the original
    ``AssertionError`` is re-raised, keeping its message and traceback.
    """
    try:
        # Python 2: ``basestring`` covers both ``str`` and ``unicode``.
        text_types = basestring
        to_text = unicode
    except NameError:
        # Python 3 has neither name; ``str`` and ``bytes`` cover the same
        # two kinds of string.
        text_types = (str, bytes)
        to_text = str

    if not isinstance(got, text_types):
        got = to_text(got)

    checker = LHTMLOutputChecker()
    try:
        self.assertTrue(checker.check_output(want, got, PARSE_HTML))
    except AssertionError:
        # Parenthesised single-argument print works on Python 2 and 3.
        print("Wanted: %s" % want)
        print("Got: %s" % got)
        raise
def get(self, url):
    """Fetch ``url`` with every session and report where the responses differ.

    The request URL is built from ``self.base_url``, ``self.domain`` and
    ``url``. The status codes from all sessions are written via
    ``self.stdout``; if they are not all the same, ``self.print_diff`` is
    called with them.

    The bodies of the first two sessions are then compared. When they
    differ, JSON responses are diffed with ``json_delta`` and
    pretty-printed, and anything else is compared as HTML with any
    assertion message sent to ``self.stderr``. The content type is taken
    from the last session's response.

    Assumes ``self.sessions`` holds at least two sessions; with fewer, the
    body comparison raises ``IndexError``.
    """
    full_url = '{}/a/{}/{}'.format(self.base_url, self.domain, url)
    status_codes = []
    content = []
    content_type = None
    for session in self.sessions:
        resp = session.get(full_url)
        status_codes.append(resp.status_code)
        content.append(resp.text)
        content_type = resp.headers.get('content-type')

    self.stdout('\n{}\n{}'.format(full_url, status_codes))
    if len(set(status_codes)) != 1:
        self.print_diff(url, 'status_code', status_codes)

    if content[0] != content[1]:
        # The header may carry parameters ("application/json; charset=utf-8")
        # or be missing entirely; compare only the bare media type.
        media_type = (content_type or '').split(';', 1)[0].strip().lower()
        if media_type == 'application/json':
            diff = json_delta.diff(
                json.loads(content[0]), json.loads(content[1]), verbose=False)
            pprint(diff, indent=8)
        else:
            try:
                _check_shared(
                    content[0], content[1], LHTMLOutputChecker(), "html")
            except AssertionError as e:
                self.stderr(str(e))
def check_output(self, want, got, optionflags):
    """Return whether ``got`` matches ``want`` once both are normalized.

    Identical inputs match immediately. Otherwise every callable in
    ``self.transformers`` is applied, in order, to both texts, and the
    result is handed to ``LHTMLOutputChecker.check_output``.
    """
    identical = got == want
    if not identical:
        for normalize in self.transformers:
            want, got = normalize(want), normalize(got)
        return LHTMLOutputChecker.check_output(self, want, got, optionflags)
    return True
def output_difference(self, example, got, optionflags):
    """Describe how ``got`` differs from ``example.want`` after normalizing.

    Every callable in ``self.transformers`` is applied, in order, to both
    the wanted and the actual text before the comparison is delegated to
    ``LHTMLOutputChecker.output_difference``. When the wanted text is blank
    the call is delegated unchanged.

    ``example.want`` is replaced with the normalized text only for the
    duration of the base-class call and is always restored afterwards, even
    if that call raises.

    Returns the difference report as a string.
    """
    want = example.want
    if not want.strip():
        return LHTMLOutputChecker.output_difference(
            self, example, got, optionflags)

    original = want
    for transformer in self.transformers:
        want = transformer(want)
        got = transformer(got)

    # The base class reads the wanted text from the example object, so the
    # normalized value has to be swapped in temporarily.
    example.want = want
    try:
        return LHTMLOutputChecker.output_difference(
            self, example, got, optionflags)
    finally:
        example.want = original
def output_difference(self, example, got, optionflags):
    """Return the base checker's diff report for transformer-normalized text.

    With a blank ``example.want`` the call goes straight to
    ``LHTMLOutputChecker.output_difference``. Otherwise both the wanted and
    the actual text are passed through each callable in
    ``self.transformers`` (in order) before the base-class report is
    produced.

    The example object is mutated only while the base class runs: its
    ``want`` attribute holds the normalized text for that call and is put
    back in a ``finally`` clause, so an exception cannot leave the example
    altered.
    """
    want = example.want
    if not want.strip():
        return LHTMLOutputChecker.output_difference(
            self, example, got, optionflags)

    original = want
    for transformer in self.transformers:
        want = transformer(want)
        got = transformer(got)

    # Temporarily give the example the normalized ``want``, because the
    # base implementation takes the wanted text from the example itself.
    example.want = want
    try:
        result = LHTMLOutputChecker.output_difference(
            self, example, got, optionflags)
    finally:
        example.want = original
    return result
class HTML(object):
    """
    A class wrapping HTML for better comparison and nicer error reporting.

    Equality is decided by an ``LHTMLOutputChecker`` run with the doctest
    ``NORMALIZE_WHITESPACE`` and ``ELLIPSIS`` flags. The first time a
    comparison gives an unexpected result, the checker's difference report
    is printed; later surprises on the same object stay silent.
    """

    def __init__(self, text):
        """Wrap ``text`` and prepare the checker used for comparisons."""
        self.text = text
        self.example = doctest.Example('', self.text)
        self.checker = LHTMLOutputChecker()
        self.flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
        self.print_diff = True

    def compare(self, other, expect_eq):
        """Return whether ``other`` matches this HTML.

        ``other`` may be another ``HTML`` wrapper or raw text.
        ``expect_eq`` is the outcome the caller hopes for; if the real
        outcome differs, the difference report is printed (once only).
        """
        text = other.text if isinstance(other, HTML) else other
        eq = self.checker.check_output(self.text, text, self.flags)
        if self.print_diff and eq != expect_eq:
            print(
                self.checker.output_difference(self.example, text, self.flags))
            # Only output diff once per HTML object.
            self.print_diff = False
        return eq

    def __eq__(self, other):
        return self.compare(other, True)

    def __ne__(self, other):
        # ``compare`` reports equality, so it must be negated here;
        # returning it unchanged made ``a != b`` true for matching HTML.
        return not self.compare(other, False)

    def __str__(self):
        return str(self.text)

    def __unicode__(self):
        return str(self.text)
def runTest(self):
    """Clean ``self.input`` and compare it with ``self.expect``.

    After ``self.parse()``, a test flagged ``self.ignore`` is skipped.
    Each entry in ``self.options`` becomes a keyword argument: a leading
    ``-`` turns the named option off, anything else turns it on. Unless
    the ``clean`` option is turned off, the input goes through
    ``Cleaner(**kw).clean_html``.

    Raises ``Exception`` carrying the checker's difference report when the
    output does not match the expectation.
    """
    self.parse()
    if self.ignore:
        # We've marked this test to be ignored.
        return

    kw = {
        (name[1:] if name.startswith('-') else name): not name.startswith('-')
        for name in self.options
    }

    transformed = (
        Cleaner(**kw).clean_html(self.input)
        if kw.get('clean', True)
        else self.input
    )

    assert self.expect is not None, (
        "No expected output in %s" % self.filename)

    checker = LHTMLOutputChecker()
    if checker.check_output(self.expect, transformed, 0):
        return
    report = checker.output_difference(
        DummyInput(want=self.expect), transformed, 0)
    raise Exception("\n" + report)
def runTest(self):
    """Run one cleaner test case and fail with a diff on mismatch.

    Steps: parse the test data, stop early if the case is marked ignored,
    turn ``self.options`` into keyword flags (a ``-`` prefix means False),
    run ``Cleaner(**kw).clean_html`` on the input unless ``clean`` was
    switched off, and check the result against ``self.expect`` with an
    ``LHTMLOutputChecker``.

    Raises ``Exception`` whose message is the checker's difference report.
    """
    self.parse()
    if self.ignore:
        # This case has been marked as one to skip.
        return

    kw = {}
    for option in self.options:
        enabled = not option.startswith('-')
        kw[option if enabled else option[1:]] = enabled

    if kw.get('clean', True):
        cleaner = Cleaner(**kw)
        transformed = cleaner.clean_html(self.input)
    else:
        transformed = self.input

    assert self.expect is not None, (
        "No expected output in %s" % self.filename)

    checker = LHTMLOutputChecker()
    matched = checker.check_output(self.expect, transformed, 0)
    if not matched:
        expected = DummyInput(want=self.expect)
        result = checker.output_difference(expected, transformed, 0)
        raise Exception("\n" + result)
def __init__(self, text):
    """Store ``text`` and set up the pieces used to compare it later.

    Sets up a ``doctest.Example`` whose wanted output is the text, an
    ``LHTMLOutputChecker``, the doctest flags ``NORMALIZE_WHITESPACE`` and
    ``ELLIPSIS``, and a ``print_diff`` switch that starts out True.
    """
    self.text = text
    # Empty source: only the example's wanted output is of interest.
    wanted = self.text
    self.example = doctest.Example('', wanted)
    self.checker = LHTMLOutputChecker()
    self.flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    self.print_diff = True
def assertHtmlEqual(self, expected, actual, normalize=True):
    """Check that ``expected`` and ``actual`` are equivalent HTML.

    When ``normalize`` is true, both documents first go through
    ``parse_normalize(..., is_html=True)``. The comparison itself is done
    by ``_check_shared`` with an ``LHTMLOutputChecker`` in ``"html"`` mode.
    """
    if normalize:
        expected, actual = (
            parse_normalize(document, is_html=True)
            for document in (expected, actual)
        )
    checker = LHTMLOutputChecker()
    _check_shared(expected, actual, checker, "html")