def assertTreeDiff(self, html1, html2, expected):
    """
    Check that diffing two HTML fragments produces the expected markup.

    ``html1``, ``html2`` and ``expected`` are bare fragments: each one is
    wrapped in ``<html><body>...</body></html>`` here so that the test
    cases themselves can stay short. Both inputs are parsed and
    preprocessed, handed to ``tree_diff`` together with ``self.algorithm``,
    and the serialized result is compared with the wrapped ``expected``.
    """
    wrap = '<html><body>%s</body></html>'
    old_tree, new_tree = [
        document_fromstring(wrap % fragment) for fragment in (html1, html2)
    ]
    wanted = wrap % expected
    diffed = tree_diff(preprocess(old_tree), preprocess(new_tree), self.algorithm)
    self.assertEqual(etree.tostring(diffed), wanted)
def assertTreeDiff(self, html1, html2, expected):
    """
    Assert that ``tree_diff`` of the two given HTML fragments serializes to
    the expected HTML fragment.

    None of the three strings should carry <html> or <body> tags; they are
    added here to keep the test data brief. The diff is computed on the
    preprocessed trees using ``self.algorithm``.
    """
    template = "<html><body>%s</body></html>"
    first = document_fromstring(template % html1)
    second = document_fromstring(template % html2)
    wanted = template % expected
    diff_tree = tree_diff(preprocess(first), preprocess(second), self.algorithm)
    serialized = etree.tostring(diff_tree)
    self.assertEqual(serialized, wanted)
def assertPreprocess(self, html, expected, **kwargs):
    """
    Assert that preprocessing the ``html`` fragment yields ``expected``.

    Both strings are given without <html> and <body> tags, for the sake of
    brevity; the wrapper is added here. Any extra keyword arguments are
    passed straight through to ``preprocess``.
    """
    envelope = '<html><body>%s</body></html>'
    source, wanted = envelope % html, envelope % expected
    processed = preprocess(document_fromstring(source), **kwargs)
    self.assertEqual(etree.tostring(processed), wanted)
def mine_page(html, other_pages):
    """
    Return a list of cleaned-up strings for the interesting "holes" that
    ``extract(html, other_pages)`` reports (presumably the parts of ``html``
    that differ from the other pages -- confirm against ``extract``).

    A hole is skipped when it is an attribute hole, when its value is empty
    or whitespace-only, or when it contains no ASCII letter or digit.
    'multitag' holes are run through the tree clean-up helpers and
    serialized back to HTML without the enclosing <body> tags; all other
    holes are used verbatim. In every kept string, newlines, tabs and
    non-breaking-space entities are turned into plain spaces.

    The strings are returned in the order the holes were yielded.
    """
    # Skip bits that don't have at least one letter or number.
    # Note: If this code is ever internationalized, this will have to be
    # removed.
    has_alnum = re.compile('[A-Za-z0-9]').search

    # Tags, subtrees and attributes that can muck up the display.
    drop_tags = ('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input',
                 'map', 'small', 'sub', 'sup', 'topic')
    drop_trees = ('applet', 'button', 'embed', 'iframe', 'object', 'select',
                  'textarea')
    drop_attrs = ('background', 'border', 'cellpadding', 'cellspacing',
                  'class', 'clear', 'id', 'rel', 'style', 'target')

    result = []
    for hole in extract(html, other_pages):
        # Differences in attribute values aren't relevant, and neither are
        # empty or whitespace-only values.
        if hole['type'] == 'attrib' or not hole['value'] or not hole['value'].strip():
            continue

        # NOTE: text holes inside <a> tags (likely navigation) are
        # deliberately not filtered out here.

        if hole['type'] == 'multitag':
            # It's a multitag value, so clean its HTML a bit.
            tree = make_tree_and_preprocess(hole['value'])
            tree = preprocess(tree, drop_tags=drop_tags, drop_trees=drop_trees,
                              drop_attrs=drop_attrs)
            remove_empty_tags(tree, ('br',))
            tree = brs_to_paragraphs(tree)

            try:
                body = tree.body
            except IndexError:
                # lxml raises an IndexError if there's no <body>.
                continue

            if not has_alnum(body.text_content()):
                continue

            # The [6:-7] cuts off the '<body>' and '</body>'.
            string = etree.tostring(body, method='html')[6:-7]
        else:
            string = hole['value']
            if not has_alnum(string):
                continue

        # Clean up newlines, tabs and non-breaking spaces. The latter show
        # up in the markup as the entities &nbsp; and &#160;.
        string = re.sub('[\n\t]', ' ', string.strip())
        string = string.replace('&nbsp;', ' ').replace('&#160;', ' ')
        result.append(string)
    return result
def extract_data(self, blob):
    """
    Return the cleaned-up HTML of the ``<div id="contents">`` element found
    in ``blob.html``.

    Presentational tags are dropped (keeping their content), embedded or
    interactive subtrees and layout attributes are removed via
    ``preprocess``, and non-breaking-space entities are replaced by plain
    spaces.

    Raises IndexError if the XPath query finds no matching <div>, because
    the first result is taken unconditionally.
    """
    from lxml.etree import tostring
    from lxml.html import document_fromstring
    from ebdata.textmining.treeutils import preprocess

    tree = document_fromstring(blob.html).xpath("//div[@id='contents']")[0]
    tree = preprocess(
        tree,
        drop_tags=('a', 'area', 'b', 'center', 'font', 'form', 'img', 'input',
                   'map', 'small', 'span', 'sub', 'sup', 'topic', 'u'),
        drop_trees=('applet', 'button', 'embed', 'iframe', 'object', 'select',
                    'textarea'),
        drop_attrs=('background', 'border', 'cellpadding', 'cellspacing',
                    'class', 'clear', 'id', 'rel', 'style', 'target'))
    html = tostring(tree, method='html')

    # Remove non breaking spaces (the entities &nbsp; and &#160;) so tagging
    # regexes can be less complicated.
    return html.replace('&nbsp;', ' ').replace('&#160;', ' ')
def assertPreprocesses(self, html, expected, **kwargs):
    """
    Assert that ``preprocess`` turns ``html`` into ``expected``.

    ``html`` is parsed with ``make_tree``; the preprocessed tree is
    serialized with lxml's HTML method before being compared. Extra keyword
    arguments are forwarded to ``preprocess``.
    """
    processed = preprocess(make_tree(html), **kwargs)
    serialized = etree.tostring(processed, method='html')
    self.assertEqual(serialized, expected)
def assertPreprocesses(self, html, expected, **kwargs):
    """
    Assert that ``preprocess`` turns ``html`` into ``expected``.

    ``html`` is parsed with ``make_tree``; the preprocessed tree is
    serialized with lxml's HTML method before being compared. Extra keyword
    arguments are forwarded to ``preprocess``.
    """
    import warnings

    # catch_warnings() saves the warnings filter state and restores it on
    # exit, so nothing changed while parsing/preprocessing leaks out of
    # this assertion.
    with warnings.catch_warnings():
        processed = preprocess(make_tree(html), **kwargs)
        serialized = etree.tostring(processed, method='html')
        self.assertEqual(serialized, expected)