Esempio n. 1
0
    def assertTreeDiff(self, html1, html2, expected):
        """
        Check that diffing two HTML fragments yields the expected markup.

        ``html1``, ``html2`` and ``expected`` are each interpolated into an
        ``<html><body>...</body></html>`` wrapper, so test strings are written
        without those tags. The two wrapped inputs are parsed, passed through
        ``preprocess`` and handed to ``tree_diff`` together with
        ``self.algorithm``; the serialized result must equal the wrapped
        ``expected`` string.
        """
        shell = '<html><body>%s</body></html>'
        parsed_old = document_fromstring(shell % html1)
        parsed_new = document_fromstring(shell % html2)
        wanted = shell % expected

        clean_old = preprocess(parsed_old)
        clean_new = preprocess(parsed_new)
        serialized = etree.tostring(tree_diff(clean_old, clean_new, self.algorithm))
        self.assertEqual(serialized, wanted)
Esempio n. 2
0
    def assertTreeDiff(self, html1, html2, expected):
        """
        Assert that tree_diff() of two HTML snippets serializes to `expected`.

        All three strings are given without <html> and <body> tags; this
        helper wraps them before parsing / comparing. Both parsed inputs go
        through preprocess() before being diffed with self.algorithm.
        """
        template = "<html><body>%s</body></html>"
        before = document_fromstring(template % html1)
        after = document_fromstring(template % html2)
        wrapped_expected = template % expected

        diffed = tree_diff(preprocess(before), preprocess(after), self.algorithm)
        self.assertEqual(etree.tostring(diffed), wrapped_expected)
Esempio n. 3
0
    def assertPreprocess(self, html, expected, **kwargs):
        """
        Assert that preprocess() turns `html` into `expected`.

        Both strings are written without <html> and <body> tags and are
        wrapped here. Any extra keyword arguments are forwarded unchanged to
        preprocess(). The comparison is made against the etree.tostring()
        serialization of the processed tree.
        """
        envelope = '<html><body>%s</body></html>'
        document = envelope % html
        wanted = envelope % expected

        processed = preprocess(document_fromstring(document), **kwargs)
        self.assertEqual(etree.tostring(processed), wanted)
Esempio n. 4
0
def mine_page(html, other_pages):
    """
    Return the cleaned-up values of the "holes" that extract() reports.

    Each hole yielded by ``extract(html, other_pages)`` is a mapping with at
    least ``'type'`` and ``'value'`` keys. Holes are skipped when they are of
    type ``'attrib'``, when their value is empty or whitespace-only, or when
    their text contains no ASCII letter or digit. ``'multitag'`` values are
    re-parsed and stripped of a fixed set of tags, subtrees and attributes
    before being serialized back to HTML. Newlines, tabs and non-breaking
    space entities in the surviving strings are replaced by plain spaces.
    """
    # Tags, subtrees and attributes that can muck up the display.
    unwanted_tags = ('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'map', 'small', 'sub', 'sup', 'topic')
    unwanted_trees = ('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea')
    unwanted_attrs = ('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target')

    mined = []
    for hole in extract(html, other_pages):
        kind = hole['type']

        # Differences in attribute values aren't relevant.
        if kind == 'attrib':
            continue
        raw = hole['value']
        if not (raw and raw.strip()):
            continue

        # (A former rule that skipped 'text' holes inside <a> tags, on the
        # theory that they are navigation, is intentionally not applied.)

        if kind != 'multitag':
            text = raw

            # Skip bits that don't have at least one letter or number.
            # Note: this must go if the code is ever internationalized.
            if not re.search('[A-Za-z0-9]', text):
                continue
        else:
            # Multitag values are HTML: clean them up a bit.
            tree = make_tree_and_preprocess(raw)
            tree = preprocess(tree,
                              drop_tags=unwanted_tags,
                              drop_trees=unwanted_trees,
                              drop_attrs=unwanted_attrs)

            remove_empty_tags(tree, ('br',))
            tree = brs_to_paragraphs(tree)

            try:
                body = tree.body
            except IndexError:
                # lxml raises an IndexError if there's no <body>.
                continue

            # Skip bits that don't have at least one letter or number.
            # Note: this must go if the code is ever internationalized.
            if not re.search('[A-Za-z0-9]', body.text_content()):
                continue

            # The [6:-7] cuts off the '<body>' and '</body>'.
            text = etree.tostring(body, method='html')[6:-7]

        # Clean up newlines, tabs and &nbsp;.
        text = re.sub('[\n\t]', ' ', text.strip())
        text = text.replace('&nbsp;', ' ').replace('&#160;', ' ')

        mined.append(text)
    return mined
Esempio n. 5
0
def mine_page(html, other_pages):
    """
    Build a list of display-ready strings from the holes found by extract().

    ``extract(html, other_pages)`` is iterated; each item is read through its
    ``'type'`` and ``'value'`` keys. Items of type ``'attrib'``, items whose
    value is empty or only whitespace, and items without any ASCII letter or
    digit in their text are dropped. Values of type ``'multitag'`` are parsed,
    have a fixed list of tags / subtrees / attributes removed, and are
    serialized back to HTML without the surrounding ``<body>`` element. All
    kept strings have newlines, tabs and ``&nbsp;`` / ``&#160;`` entities
    replaced with a single space each.
    """
    collected = []
    for hole in extract(html, other_pages):
        hole_type = hole['type']
        is_multitag = hole_type == 'multitag'

        # Attribute-value differences aren't relevant.
        if hole_type == 'attrib':
            continue

        value = hole['value']
        # Nothing to mine from an empty or all-whitespace value.
        if not value or not value.strip():
            continue

        # (Skipping 'text' holes found inside <a> tags, as likely navigation,
        # was considered but is disabled.)

        if is_multitag:
            # It's a chunk of HTML, so tidy it before serializing.
            tree = preprocess(
                make_tree_and_preprocess(value),
                # Drop a bunch of tags that can muck up the display.
                drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'map', 'small', 'sub', 'sup', 'topic'),
                drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea'),
                drop_attrs=('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target'),
            )
            remove_empty_tags(tree, ('br',))
            tree = brs_to_paragraphs(tree)

            try:
                body = tree.body
            except IndexError:
                continue  # lxml raises an IndexError if there's no <body>.

            searchable = body.text_content()
        else:
            searchable = value

        # Skip bits that don't have at least one letter or number.
        # Note: If this code is ever internationalized, this will have to be
        # removed.
        if not re.search('[A-Za-z0-9]', searchable):
            continue

        if is_multitag:
            # The [6:-7] cuts off the '<body>' and '</body>'.
            piece = etree.tostring(body, method='html')[6:-7]
        else:
            piece = value

        # Clean up newlines, tabs and &nbsp;.
        piece = re.sub('[\n\t]', ' ', piece.strip())
        for entity in ('&nbsp;', '&#160;'):
            piece = piece.replace(entity, ' ')

        collected.append(piece)
    return collected
Esempio n. 6
0
    def extract_data(self, blob):
        """
        Return the cleaned HTML of the ``<div id="contents">`` in ``blob.html``.

        The first element matching ``//div[@id='contents']`` is selected
        (indexing ``[0]`` fails with IndexError when nothing matches), passed
        through ``preprocess`` with fixed lists of tags, subtrees and
        attributes to drop, and serialized with ``method='html'``.
        Non-breaking-space entities are then replaced by plain spaces.
        """
        from lxml.etree import tostring
        from lxml.html import document_fromstring
        from ebdata.textmining.treeutils import preprocess

        document = document_fromstring(blob.html)
        contents = document.xpath("//div[@id='contents']")[0]

        unwanted_tags = ('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input', 'map', 'small', 'span', 'sub', 'sup', 'topic', 'u')
        unwanted_trees = ('applet', 'button', 'embed', 'iframe', 'object', 'select', 'textarea')
        unwanted_attrs = ('background', 'border', 'cellpadding', 'cellspacing', 'class', 'clear', 'id', 'rel', 'style', 'target')
        cleaned = preprocess(contents,
                             drop_tags=unwanted_tags,
                             drop_trees=unwanted_trees,
                             drop_attrs=unwanted_attrs)

        markup = tostring(cleaned, method='html')
        # Remove non breaking spaces (&nbsp; and &#160;) so tagging regexes
        # can be less complicated.
        for entity in ('&nbsp;', '&#160;'):
            markup = markup.replace(entity, ' ')
        return markup
Esempio n. 7
0
    def extract_data(self, blob):
        """
        Extract and tidy the main content markup from ``blob.html``.

        Parses ``blob.html``, takes the first ``div`` whose ``id`` is
        ``contents`` (an IndexError propagates if there is none), runs it
        through ``preprocess`` with the drop lists below, and returns the
        HTML serialization with ``&nbsp;`` and ``&#160;`` turned into spaces.
        """
        from lxml.etree import tostring
        from lxml.html import document_fromstring
        from ebdata.textmining.treeutils import preprocess

        matches = document_fromstring(blob.html).xpath("//div[@id='contents']")
        content_div = matches[0]

        drop_options = dict(
            drop_tags=('a', 'area', 'b', 'center', 'font',
                       'form', 'img', 'input', 'map', 'small',
                       'span', 'sub', 'sup', 'topic', 'u'),
            drop_trees=('applet', 'button', 'embed', 'iframe',
                        'object', 'select', 'textarea'),
            drop_attrs=('background', 'border', 'cellpadding',
                        'cellspacing', 'class', 'clear', 'id',
                        'rel', 'style', 'target'),
        )
        serialized = tostring(preprocess(content_div, **drop_options),
                              method='html')

        # Non-breaking spaces (&nbsp; and &#160;) become ordinary spaces so
        # that tagging regexes can be less complicated.
        without_named = serialized.replace('&nbsp;', ' ')
        return without_named.replace('&#160;', ' ')
Esempio n. 8
0
 def assertPreprocesses(self, html, expected, **kwargs):
     """
     Assert that preprocess() of `html` serializes (as HTML) to `expected`.

     `html` is parsed with make_tree(); extra keyword arguments are passed
     straight through to preprocess().
     """
     processed = preprocess(make_tree(html), **kwargs)
     serialized = etree.tostring(processed, method='html')
     self.assertEqual(serialized, expected)
Esempio n. 9
0
 def assertPreprocesses(self, html, expected, **kwargs):
     """
     Assert that preprocess() of `html` serializes (as HTML) to `expected`.

     `html` is parsed with make_tree() and extra keyword arguments are
     forwarded to preprocess(). Everything runs inside
     warnings.catch_warnings(), so the warnings filter state is restored
     when the block exits.
     """
     import warnings
     with warnings.catch_warnings():
         parsed = make_tree(html)
         processed = preprocess(parsed, **kwargs)
         serialized = etree.tostring(processed, method='html')
         self.assertEqual(serialized, expected)
Esempio n. 10
0
 def assertPreprocesses(self, html, expected, **kwargs):
     """
     Run preprocess() on the tree built from `html` and compare the result.

     The processed tree is serialized with etree.tostring(method='html') and
     must equal `expected`. Keyword arguments go to preprocess() unchanged.
     """
     source_tree = make_tree(html)
     result_tree = preprocess(source_tree, **kwargs)
     self.assertEqual(etree.tostring(result_tree, method='html'), expected)
Esempio n. 11
0
 def assertPreprocesses(self, html, expected, **kwargs):
     """
     Run preprocess() on the tree built from `html` and compare the result.

     The HTML serialization of the processed tree must equal `expected`;
     keyword arguments go to preprocess() unchanged. The work is done under
     warnings.catch_warnings(), which restores the warnings filter state
     on exit.
     """
     import warnings
     with warnings.catch_warnings():
         result_tree = preprocess(make_tree(html), **kwargs)
         actual = etree.tostring(result_tree, method='html')
         self.assertEqual(actual, expected)