def test_detokenize_single(self):
    # Round trip: tags taken from the annotated tree are re-applied to the
    # tokens of the cleaned-up tree, and the result is compared with the
    # annotated markup.
    src_tree = self._load()
    pristine_copy = deepcopy(src_tree)
    tokenizer = HtmlTokenizer()

    # only the tags are kept from this pass; the tokens are recomputed below
    _, tags = tokenizer.tokenize_single(src_tree)
    cleaned_tree = tokenizer.cleanup_tree(src_tree)

    # the source tree still carries the marker, the cleaned tree does not
    marker = b'__START_ORG__'
    self.assertIn(marker, tostring(src_tree))
    self.assertNotIn(marker, tostring(cleaned_tree))
    self.assertHtmlTreeEqual(
        cleaned_tree,
        html_document_fromstring(UNANNOTATED_HTML),
    )

    # tokens of the cleaned tree + the earlier tags bring the marker back
    clean_tokens, _ = tokenizer.tokenize_single(cleaned_tree)
    restored_tree = tokenizer.detokenize_single(clean_tokens, tags)
    self.assertIn(marker, tostring(restored_tree))

    # the restored tree matches the annotated fixture, the untouched copy
    # and the source tree itself
    self.assertHtmlTreeEqual(
        restored_tree,
        html_document_fromstring(ANNOTATED_HTML),
    )
    self.assertHtmlTreeEqual(restored_tree, pristine_copy)
    self.assertHtmlTreeEqual(restored_tree, src_tree)
def test_tokenize_scripts_and_styles(self):
    # A document with <script> and <style> in its head and a single word
    # in its body.
    html = b""" <html> <head> <script>function foo(){}</script> <style> body { color: "red" } </style> </head> <body>hello</body> </html> """

    loaded_tree = HtmlLoader().loadbytes(html)
    reference_tree = html_document_fromstring(html)

    # <script> and <style> contents yield no tokens: only the body text does
    tokenizer = HtmlTokenizer()
    html_tokens, tags = tokenizer.tokenize_single(loaded_tree)
    self.assertEqual(len(html_tokens), 1)
    only_token = html_tokens[0]
    self.assertEqual(only_token.tokens, ['hello'])
    self.assertEqual(only_token.elem.tag, 'body')

    # the <script> and <style> elements themselves are still in the tree
    self.assertHtmlTreeEqual(loaded_tree, reference_tree)

    # detokenizing gives back an equal tree
    restored_tree = tokenizer.detokenize_single(html_tokens, tags)
    self.assertHtmlTreeEqual(loaded_tree, restored_tree)
def test_dont_tokenize_nontext_nodes(self):
    # The body contains only a processing instruction; tokenizing it is
    # expected to produce an empty token list.
    html = b""" <body> <?xml version="1.0" encoding="UTF-8" standalone="no"?> </body> """

    parsed_tree = html_document_fromstring(html)
    tokens, _ = HtmlTokenizer().tokenize_single(parsed_tree)
    self.assertEqual(len(tokens), 0)
def test_detokenize_preserve_commas(self):
    # Annotated markup whose plain text includes "a, b"; the round trip
    # below must give back a tree equal to the annotated one.
    annotated_html = b""" <html> <body> __START_ORG__ hello __END_ORG__ a, b <a>world</a></body> </html> """

    annotated_tree = HtmlLoader().loadbytes(annotated_html)
    tokenizer = HtmlTokenizer()

    # tags come from the annotated tree, tokens from its cleaned-up version
    _, tags = tokenizer.tokenize_single(annotated_tree)
    cleaned_tree = tokenizer.cleanup_tree(annotated_tree)
    clean_tokens, _ = tokenizer.tokenize_single(cleaned_tree)

    restored_tree = tokenizer.detokenize_single(clean_tokens, tags)
    self.assertHtmlTreeEqual(annotated_tree, restored_tree)
def test_detokenize_handle_unicode(self):
    # Same round trip as the comma test, but the body starts with a
    # non-ASCII character and the markup is encoded to UTF-8 bytes first.
    markup = u""" <html> <body>Δ __START_ORG__ hello __END_ORG__ a, b <a>world</a></body> </html> """
    annotated_html = bytes(markup.encode('utf-8'))

    annotated_tree = HtmlLoader().loadbytes(annotated_html)
    tokenizer = HtmlTokenizer()

    # tags come from the annotated tree, tokens from its cleaned-up version
    _, tags = tokenizer.tokenize_single(annotated_tree)
    cleaned_tree = tokenizer.cleanup_tree(annotated_tree)
    clean_tokens, _ = tokenizer.tokenize_single(cleaned_tree)

    restored_tree = tokenizer.detokenize_single(clean_tokens, tags)
    self.assertHtmlTreeEqual(annotated_tree, restored_tree)
def test_tokenization_doesnt_alter_tree(self):
    # Snapshot the tree, tokenize it, then compare it against the snapshot.
    src_tree = self._load()
    snapshot = deepcopy(src_tree)

    tokenizer = HtmlTokenizer()
    tokenizer.tokenize_single(src_tree)

    # the tokenized tree must still equal the copy taken beforehand
    self.assertHtmlTreeEqual(src_tree, snapshot)
def test_detokenize_single(self):
    # Tokenize the annotated tree, check that the tree reachable through
    # the first token has no markers, then detokenize and compare with the
    # annotated markup.
    src_tree = self._load()
    pristine_copy = deepcopy(src_tree)
    tokenizer = HtmlTokenizer()

    html_tokens, tags = tokenizer.tokenize_single(src_tree)
    token_root = html_tokens[0].root

    # the source tree keeps the marker; the first token's root tree lacks it
    marker = b'__START_ORG__'
    self.assertIn(marker, tostring(src_tree))
    self.assertNotIn(marker, tostring(token_root))
    self.assertHtmlTreeEqual(
        token_root,
        html_document_fromstring(UNANNOTATED_HTML),
    )

    # detokenizing with the original tags brings the marker back
    restored_tree = tokenizer.detokenize_single(html_tokens, tags)
    self.assertIn(marker, tostring(restored_tree))

    # the restored tree matches the annotated fixture, the untouched copy
    # and the source tree itself
    self.assertHtmlTreeEqual(
        restored_tree,
        html_document_fromstring(ANNOTATED_HTML),
    )
    self.assertHtmlTreeEqual(restored_tree, pristine_copy)
    self.assertHtmlTreeEqual(restored_tree, src_tree)
def __init__(self, model, loader=None, html_tokenizer=None,
             entity_colors=None):
    """Store the model and its collaborators, creating defaults as needed.

    ``loader`` and ``html_tokenizer`` are replaced by a new ``HtmlLoader``
    / ``HtmlTokenizer`` whenever the given value is falsy.
    ``entity_colors`` is replaced by a new ``EntityColors`` only when it
    is ``None``, so any other value passed in is kept as is.
    """
    self.model = model

    # truthiness-based fallbacks
    if not loader:
        loader = HtmlLoader()
    self.loader = loader

    if not html_tokenizer:
        html_tokenizer = HtmlTokenizer()
    self.html_tokenizer = html_tokenizer

    # identity-based fallback: only None triggers the default
    self.entity_colors = (
        EntityColors() if entity_colors is None else entity_colors
    )
def assertTokenizationWorks(self, tree):
    """Assert that tokenizing ``tree`` gives the expected tokens and tags.

    The tree is tokenized with a fresh ``HtmlTokenizer``; the ``token``
    attribute of every produced item and the returned tag list are both
    compared against fixed expected lists.
    """
    tokenizer = HtmlTokenizer()
    html_tokens, tags = tokenizer.tokenize_single(tree)

    # token texts, in document order
    actual_words = [item.token for item in html_tokens]
    expected_words = [
        u'Scrapinghub', u'Inc', u'has', u'an', u'office', u'in',
        u'Montevideo',
    ]
    self.assertListEqual(actual_words, expected_words)

    # one tag per token
    expected_tags = [u'B-ORG', u'I-ORG', 'O', 'O', 'O', 'O', u'B-CITY']
    self.assertListEqual(tags, expected_tags)
def test_detokenize_single_empty(self):
    # Detokenizing empty token and tag lists is expected to return None.
    tokenizer = HtmlTokenizer()
    result = tokenizer.detokenize_single([], [])
    self.assertIs(result, None)
'noscript', 'ol', 'output', 'p', 'pre', 'section', 'table', 'tfoot', 'ul', 'video', # not really block, but makes sense to include 'li', 'body', } html_tokenizer = HtmlTokenizer() def get_text_blocks(tree): tokens, _ = html_tokenizer.tokenize_single(tree) text_blocks = [] prev_parent = None current = [] for token in tokens: parent = token.parent while parent.tag not in BLOCK_TAGS: parent = parent.getparent() if prev_parent is not None and prev_parent != parent: text_blocks.append((prev_parent.tag, ' '.join(current))) current = [] current.append(token.token)