def __eq__(self, o: object) -> bool:
    """Return whether ``o`` is an HOCRNode with functionally equivalent markup.

    Two HTML serialisations can differ textually while still describing the
    same tree with the same elements, so a plain string comparison is too
    strict. lxml ships a helper intended for making HTML doctest output
    easier to check, which compares functional equivalency instead; see
    https://lxml.de/lxmlhtml.html#running-html-doctests

    This is not a doctest, but that helper is essentially what a node
    comparison needs. It lives in lxml.doctestcompare.LHTMLOutputChecker and
    is used here together with the PARSE_HTML option flag.

    Differences that the output checker treats as equivalent, and which
    therefore still compare as equal:

    - Different order of attributes
    - Repeated spaces inside a tag
    - Whitespace between tags

    Anything that is not an HOCRNode compares as unequal.
    """
    if isinstance(o, HOCRNode):
        comparator = LHTMLOutputChecker()
        own_markup = lxml.etree.tostring(self)
        other_markup = lxml.etree.tostring(o)
        return comparator.check_output(
            want=own_markup,
            got=other_markup,
            optionflags=PARSE_HTML,
        )
    return False
class HTML(object):
    """Wrap HTML text for lenient comparison and nicer error reporting.

    Comparisons are delegated to an ``LHTMLOutputChecker`` using the doctest
    flags ``NORMALIZE_WHITESPACE`` and ``ELLIPSIS``. When the outcome of a
    comparison contradicts the operator that was used (``==`` on text that
    does not match, ``!=`` on text that does), the checker's difference
    report is printed, at most once per instance.
    """

    def __init__(self, text):
        """Store ``text`` and set up the checker used for comparisons."""
        self.text = text
        # Example with an empty source and ``text`` as the expected output;
        # handed to the checker's output_difference() when a diff is printed.
        self.example = doctest.Example('', self.text)
        self.checker = LHTMLOutputChecker()
        self.flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
        self.print_diff = True

    def compare(self, other, expect_eq):
        """Return True when ``other`` matches this object's text.

        ``other`` may be another ``HTML`` instance (its ``text`` is used) or
        any value the checker accepts directly. ``expect_eq`` is the outcome
        the caller is hoping for; it only controls whether a diff is printed
        and never changes the return value.
        """
        text = other.text if isinstance(other, HTML) else other
        eq = self.checker.check_output(self.text, text, self.flags)
        if self.print_diff and eq != expect_eq:
            # A parenthesised single argument behaves the same as a Python 2
            # print statement and as the Python 3 print function.
            print(self.checker.output_difference(
                self.example, text, self.flags))
            # Only output diff once per HTML object.
            self.print_diff = False
        return eq

    def __eq__(self, other):
        return self.compare(other, True)

    def __ne__(self, other):
        # compare() reports *equality*; it has to be negated here, otherwise
        # ``a != b`` would be True exactly when the two match.
        return not self.compare(other, False)

    def __str__(self):
        return str(self.text)

    def __unicode__(self):
        # Python 2 text-conversion hook; ``unicode`` does not exist on
        # Python 3, where this method is never called implicitly.
        return unicode(self.text)
def check_output(self, want, got, optionflags):
    """Compare ``want`` and ``got`` after running them through the transformers.

    Inputs that already compare equal are accepted immediately. Otherwise
    every callable in ``self.transformers`` is applied, in order, to both
    values, and the transformed pair is handed to
    ``LHTMLOutputChecker.check_output`` together with ``optionflags``.
    """
    # Fast path: equal inputs need neither transformation nor HTML parsing.
    if got == want:
        return True

    for normalise in self.transformers:
        want, got = normalise(want), normalise(got)

    return LHTMLOutputChecker.check_output(self, want, got, optionflags)
def check_output(self, want, got, optionflags):
    """Return whether ``got`` is acceptable output for ``want``.

    When the two values are not already equal, each callable in
    ``self.transformers`` is applied to ``want`` and then to ``got`` (in
    that order), and the final decision is delegated to
    ``LHTMLOutputChecker.check_output`` with the given ``optionflags``.
    Equal values short-circuit to True without any transformation.
    """
    if not (got == want):
        for apply_transform in self.transformers:
            want = apply_transform(want)
            got = apply_transform(got)
        return LHTMLOutputChecker.check_output(
            self, want, got, optionflags)
    return True
def assertHTML(self, want, got):
    """Assert the want and the got are equal HTML strings.

    Uses lxml's LHTMLOutputChecker class with the PARSE_HTML flag, which
    handles minor differences in HTML documents, like differences in
    whitespace that don't affect the equality of the HTML.

    Args:
        want: The expected HTML.
        got: The actual value. Anything that is not already a string is
            converted to text before the comparison.

    Raises:
        AssertionError: Through ``self.fail`` when the checker reports a
            mismatch. The message carries both values, so the detail is
            part of the failure itself instead of only going to stdout.
    """
    # ``basestring``/``unicode`` only exist on Python 2; fall back to the
    # Python 3 equivalents so the helper works on either interpreter.
    try:
        string_types, to_text = basestring, unicode
    except NameError:
        string_types, to_text = (str, bytes), str

    if not isinstance(got, string_types):
        got = to_text(got)

    checker = LHTMLOutputChecker()
    if not checker.check_output(want, got, PARSE_HTML):
        self.fail("HTML does not match.\nWanted: %s\nGot: %s" % (want, got))
class HTML(object):
    """Wrap HTML text for better comparison and nicer error reporting.

    Comparisons go through an ``LHTMLOutputChecker`` with the doctest flags
    ``NORMALIZE_WHITESPACE`` and ``ELLIPSIS``. If a comparison turns out the
    opposite of what the operator implies (``==`` on non-matching text,
    ``!=`` on matching text), the checker's difference report is printed,
    but only once per instance.
    """

    def __init__(self, text):
        """Remember ``text`` and prepare the checker used for comparisons."""
        self.text = text
        # Example with an empty source and ``text`` as the expected output;
        # passed to the checker's output_difference() when printing a diff.
        self.example = doctest.Example('', self.text)
        self.checker = LHTMLOutputChecker()
        self.flags = doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS
        self.print_diff = True

    def compare(self, other, expect_eq):
        """Return True when ``other`` matches this object's text.

        ``other`` is either another ``HTML`` instance (its ``text`` is
        compared) or a value passed to the checker unchanged. ``expect_eq``
        states which result the caller expects; it decides whether a diff is
        printed and has no influence on the returned value.
        """
        if isinstance(other, HTML):
            text = other.text
        else:
            text = other
        eq = self.checker.check_output(self.text, text, self.flags)
        if self.print_diff and eq != expect_eq:
            print(self.checker.output_difference(
                self.example, text, self.flags))
            # Only output diff once per HTML object.
            self.print_diff = False
        return eq

    def __eq__(self, other):
        return self.compare(other, True)

    def __ne__(self, other):
        # compare() answers "are they equal?", so the result must be
        # inverted; returning it unchanged made ``a != b`` True for matches.
        return not self.compare(other, False)

    def __str__(self):
        return str(self.text)

    def __unicode__(self):
        return str(self.text)
def runTest(self):
    """Run one case: optionally clean the input, then compare with the expectation.

    After ``self.parse()``, a case flagged via ``self.ignore`` is skipped.
    Each entry of ``self.options`` becomes a keyword argument for
    ``Cleaner``: a leading ``-`` switches the named option off, any other
    name switches it on. Unless ``clean`` was switched off, the input is run
    through ``Cleaner(...).clean_html``. The result is then checked against
    ``self.expect`` with ``LHTMLOutputChecker``; a mismatch raises
    ``Exception`` carrying the checker's difference report.
    """
    self.parse()
    if self.ignore:
        # This case has been marked as one to ignore.
        return

    cleaner_kwargs = {}
    for option in self.options:
        enabled = not option.startswith('-')
        cleaner_kwargs[option if enabled else option[1:]] = enabled

    if cleaner_kwargs.get('clean', True):
        actual = Cleaner(**cleaner_kwargs).clean_html(self.input)
    else:
        actual = self.input

    assert self.expect is not None, (
        "No expected output in %s" % self.filename)

    output_checker = LHTMLOutputChecker()
    if output_checker.check_output(self.expect, actual, 0):
        return

    report = output_checker.output_difference(
        DummyInput(want=self.expect), actual, 0)
    raise Exception("\n" + report)
def runTest(self):
    """Execute a single parsed case and fail loudly on unexpected output.

    Steps, in order: call ``self.parse()``; stop if ``self.ignore`` is set;
    turn ``self.options`` into ``Cleaner`` keyword arguments (``-name``
    means ``name=False``, ``name`` means ``name=True``); clean
    ``self.input`` unless the ``clean`` option was disabled; require that
    ``self.expect`` is present; compare with ``LHTMLOutputChecker`` and
    raise ``Exception`` with the difference report when the comparison
    fails.
    """
    self.parse()
    if self.ignore:
        # Marked as ignored: nothing to check.
        return

    kw = {
        (flag[1:] if flag.startswith('-') else flag): not flag.startswith('-')
        for flag in self.options
    }

    if not kw.get('clean', True):
        transformed = self.input
    else:
        transformed = Cleaner(**kw).clean_html(self.input)

    assert self.expect is not None, (
        "No expected output in %s" % self.filename)

    checker = LHTMLOutputChecker()
    matches = checker.check_output(self.expect, transformed, 0)
    if not matches:
        expected_example = DummyInput(want=self.expect)
        diff = checker.output_difference(expected_example, transformed, 0)
        raise Exception("\n" + diff)