コード例 #1
0
    def test_tokenize_scripts_and_styles(self):
        """Tokenizing skips <script>/<style> text but keeps the elements.

        The document is loaded twice (via ``HtmlLoader`` and via
        ``html_document_fromstring``) so the tree handed to the tokenizer
        can be compared against an independently parsed reference.
        """
        markup = b"""
        <html>
          <head>
            <script>function foo(){}</script>
            <style>
              body {
                color: "red"
              }
            </style>
          </head>
          <body>hello</body>
        </html>
        """

        loaded_tree = HtmlLoader().loadbytes(markup)
        reference_tree = html_document_fromstring(markup)

        # Only the <body> text is expected to become a token; the contents
        # of <script> and <style> must not show up.
        html_tokenizer = HtmlTokenizer()
        token_list, tag_list = html_tokenizer.tokenize_single(loaded_tree)
        self.assertEqual(len(token_list), 1)
        first_token = token_list[0]
        self.assertEqual(first_token.tokens, ['hello'])
        self.assertEqual(first_token.elem.tag, 'body')

        # The <script> and <style> elements themselves are still in the tree.
        self.assertHtmlTreeEqual(loaded_tree, reference_tree)

        # Detokenizing yields a tree equal to the one that was tokenized.
        restored_tree = html_tokenizer.detokenize_single(token_list, tag_list)
        self.assertHtmlTreeEqual(loaded_tree, restored_tree)
コード例 #2
0
    def test_tokenize_scripts_and_styles(self):
        """Verify <script> and <style> bodies produce no tokens.

        Also checks that those elements remain in the tree and that
        ``detokenize_single`` gives back a tree equal to the tokenized one.
        """
        page_bytes = b"""
        <html>
          <head>
            <script>function foo(){}</script>
            <style>
              body {
                color: "red"
              }
            </style>
          </head>
          <body>hello</body>
        </html>
        """

        parsed = HtmlLoader().loadbytes(page_bytes)
        expected = html_document_fromstring(page_bytes)

        tok = HtmlTokenizer()
        tokens, labels = tok.tokenize_single(parsed)

        # A single token is expected: the "hello" text attached to <body>.
        self.assertEqual(len(tokens), 1)
        body_token = tokens[0]
        self.assertEqual(body_token.tokens, ['hello'])
        self.assertEqual(body_token.elem.tag, 'body')

        # Script and style elements are preserved in the parsed tree.
        self.assertHtmlTreeEqual(parsed, expected)

        # Round trip: detokenizing gives a tree equal to the parsed one.
        round_tripped = tok.detokenize_single(tokens, labels)
        self.assertHtmlTreeEqual(parsed, round_tripped)
コード例 #3
0
    def test_detokenize_single(self):
        """Round-trip an annotated tree through tokenize/detokenize.

        After ``tokenize_single`` the source tree must still contain the
        ``__START_ORG__`` marker while the tree referenced by the tokens
        must not; ``detokenize_single`` must bring the marker back and
        produce a tree equal to the annotated original.
        """
        src_tree = self._load()
        # Snapshot taken before tokenizing so the result can be compared
        # against the untouched input as well as the live ``src_tree``.
        orig_src_tree = deepcopy(src_tree)

        tokenizer = HtmlTokenizer()
        html_tokens, tags = tokenizer.tokenize_single(src_tree)
        new_tree = html_tokens[0].root

        # NOTE(review): ``tostring`` is called with default arguments and is
        # presumably lxml's, which returns bytes by default. The marker is a
        # bytes literal so the membership test works on Python 3 (a str
        # needle against bytes raises TypeError) and also on Python 2.
        self.assertIn(b'__START_ORG__', tostring(src_tree))
        self.assertNotIn(b'__START_ORG__', tostring(new_tree))

        self.assertHtmlTreeEqual(new_tree,
                                 html_document_fromstring(UNANNOTATED_HTML))

        detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
        self.assertIn(b'__START_ORG__', tostring(detokenized_tree))

        self.assertHtmlTreeEqual(detokenized_tree,
                                 html_document_fromstring(ANNOTATED_HTML))
        self.assertHtmlTreeEqual(detokenized_tree, orig_src_tree)
        self.assertHtmlTreeEqual(detokenized_tree, src_tree)
コード例 #4
0
    def test_detokenize_single(self):
        """Check that detokenizing restores the annotations of the input.

        The tree referenced by the tokens must equal the unannotated
        document, and the detokenized tree must equal the annotated one,
        the pre-tokenization snapshot and the live source tree.
        """
        marker = b'__START_ORG__'

        src_tree = self._load()
        # Copy made up front for comparison with the pre-tokenization state.
        snapshot = deepcopy(src_tree)

        html_tokenizer = HtmlTokenizer()
        token_list, tag_list = html_tokenizer.tokenize_single(src_tree)
        stripped_tree = token_list[0].root

        # The source keeps its markers; the tokens' root tree has none.
        self.assertIn(marker, tostring(src_tree))
        self.assertNotIn(marker, tostring(stripped_tree))

        self.assertHtmlTreeEqual(
            stripped_tree, html_document_fromstring(UNANNOTATED_HTML))

        restored_tree = html_tokenizer.detokenize_single(token_list, tag_list)
        self.assertIn(marker, tostring(restored_tree))

        self.assertHtmlTreeEqual(
            restored_tree, html_document_fromstring(ANNOTATED_HTML))
        self.assertHtmlTreeEqual(restored_tree, snapshot)
        self.assertHtmlTreeEqual(restored_tree, src_tree)