Ejemplo n.º 1
0
def htmldiff(old_html, new_html):
    """Return a marked-up diff between two HTML strings.

    Both inputs are tokenized with ``include_hrefs=False``, the token
    streams are diffed, and the joined, whitespace-stripped result is
    passed through ``fixup_ins_del_tags`` before being returned.

    If tokenizing either input raises ``KeyError`` or ``ParserError``,
    no diff is attempted and ``new_html`` is returned unchanged.
    """
    try:
        tokens_before = tokenize(old_html, include_hrefs=False)
        tokens_after = tokenize(new_html, include_hrefs=False)
    except (KeyError, ParserError):
        # Best-effort fallback: hand back the new markup as-is.
        return new_html

    merged = ''.join(htmldiff_tokens(tokens_before, tokens_after))
    return fixup_ins_del_tags(merged.strip())
Ejemplo n.º 2
0
def htmldiff(old_html, new_html):
    """Diff ``old_html`` against ``new_html`` and return the result as a string.

    The inputs are tokenized (old first, then new) with
    ``include_hrefs=False``. When tokenizing raises ``KeyError`` or
    ``ParserError`` the function gives up and returns ``new_html`` untouched.
    Otherwise the diffed tokens are concatenated, stripped of surrounding
    whitespace and run through ``fixup_ins_del_tags``.
    """
    try:
        # Old markup is tokenized before the new one, as a tuple is
        # evaluated left to right.
        old_tokens, new_tokens = (
            tokenize(old_html, include_hrefs=False),
            tokenize(new_html, include_hrefs=False),
        )
    except (KeyError, ParserError):
        return new_html

    diff_pieces = htmldiff_tokens(old_tokens, new_tokens)
    diff_text = ''.join(diff_pieces)
    diff_text = diff_text.strip()
    return fixup_ins_del_tags(diff_text)
Ejemplo n.º 3
0
def htmldiff(old_html, new_html):
    """Return the diff of two HTML strings as a single stripped string.

    This is a modified take on ``lxml.html.diff.htmldiff``:

    * Tokenizing is done with ``include_hrefs=False``; per the original
      author, the " Link: href " output is hard to fix up and is not
      believed to be needed here.
    * ``fixup_ins_del_tags`` is deliberately not applied; per the original
      author it re-parses everything and is not needed for this use.

    Unlike a guarded variant, tokenizer errors are not caught here and
    propagate to the caller.
    """
    tokens_before = lxml_diff.tokenize(old_html, include_hrefs=False)
    tokens_after = lxml_diff.tokenize(new_html, include_hrefs=False)
    diff_pieces = lxml_diff.htmldiff_tokens(tokens_before, tokens_after)
    return ''.join(diff_pieces).strip()
Ejemplo n.º 4
0
    def compare_html(self, original_text, output_text):
        """Assert that two HTML strings match, first by tokens, then exactly.

        Stage one tokenizes both strings and diffs the token sequences with
        ``InsensitiveSequenceMatcher``; the diff must consist of exactly one
        opcode, tagged ``'equal'``. The original author notes that this
        token-level pass only looks at the text, ``<img>`` tags and
        ``<a href=...>`` attributes.

        Stage two asserts that the two raw strings are strictly equal.
        """
        # Token-level comparison first: it verifies that the content matches
        # (whether or not non-visible structural elements were added,
        # removed or modified) before the strict comparison below.
        output_tokens = tokenize(output_text)
        original_tokens = tokenize(original_text)
        opcodes = InsensitiveSequenceMatcher(
            a=output_tokens, b=original_tokens
        ).get_opcodes()

        # Identical content yields a single opcode declaring the two
        # sequences equal.
        self.assertEqual(len(opcodes), 1)
        first_tag = opcodes[0][0]
        self.assertEqual('equal', first_tag)

        # The real test: the original and the de-encapsulated copy must be
        # exactly the same string.
        self.assertEqual(original_text, output_text)