def test_detokenize_single(self):
        """Round-trip an annotated tree: clean it, then detokenize it back."""
        annotated = self._load()
        pristine = deepcopy(annotated)

        tok = HtmlTokenizer()
        # Only the tags are kept from this pass; the tokens used for
        # detokenization are recomputed from the cleaned tree further down.
        _, tags = tok.tokenize_single(annotated)
        cleaned = tok.cleanup_tree(annotated)

        # The source tree still carries the start marker, while the cleaned
        # tree must not contain it.
        marker = b'__START_ORG__'
        self.assertIn(marker, tostring(annotated))
        self.assertNotIn(marker, tostring(cleaned))

        expected_clean = html_document_fromstring(UNANNOTATED_HTML)
        self.assertHtmlTreeEqual(cleaned, expected_clean)

        # Tokens of the cleaned tree combined with the original tags must
        # bring the marker back.
        clean_tokens, _ = tok.tokenize_single(cleaned)
        restored = tok.detokenize_single(clean_tokens, tags)
        self.assertIn(marker, tostring(restored))

        expected_annotated = html_document_fromstring(ANNOTATED_HTML)
        self.assertHtmlTreeEqual(restored, expected_annotated)
        self.assertHtmlTreeEqual(restored, pristine)
        self.assertHtmlTreeEqual(restored, annotated)
Example #2
0
    def test_wa_convert_ignore_comments(self):
        """An annotation spanning an HTML comment is wrapped on both sides."""
        annotated_html = b"""
        <html>
            <body>
                __START_ORG__ a
                <!--comment-->
                b __END_ORG__ cool
            </body>
        </html>
        """
        # Expected: two spans sharing wa-id="0", with the comment left in
        # place between them.
        expected_html = r"""
        <html>
            <body>
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">
                    a
                </span>
                <!--comment-->
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">
                    b
                </span>
                cool
            </body>
            <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color>
        </html>
        """

        doc = html_document_fromstring(annotated_html)
        converted = webannotator.to_webannotator(doc)
        self.assertHtmlEqual(tostring(converted), expected_html)
    def test_tokenize_scripts_and_styles(self):
        """<script>/<style> contents yield no tokens but stay in the tree."""
        markup = b"""
        <html>
          <head>
            <script>function foo(){}</script>
            <style>
              body {
                color: "red"
              }
            </style>
          </head>
          <body>hello</body>
        </html>
        """

        loaded = HtmlLoader().loadbytes(markup)
        reference = html_document_fromstring(markup)

        # Only the <body> text is expected as a token; nothing should be
        # emitted for the <script> and <style> contents.
        tok = HtmlTokenizer()
        tokens, tags = tok.tokenize_single(loaded)
        self.assertEqual(len(tokens), 1)
        only_token = tokens[0]
        self.assertEqual(only_token.tokens, ['hello'])
        self.assertEqual(only_token.elem.tag, 'body')

        # The loaded tree must still match a plain parse of the same bytes,
        # i.e. the <script> and <style> elements are kept.
        self.assertHtmlTreeEqual(loaded, reference)

        # Detokenizing must give back an equal tree.
        restored = tok.detokenize_single(tokens, tags)
        self.assertHtmlTreeEqual(loaded, restored)
    def test_wa_title(self):
        """apply_wa_title moves <wa-title> contents into <title>."""
        source_html = b"""
        <html>
            <head><title>Foo</title></head>
            <body>contents</body>
            <wa-title><b>hello</b>, world</wa-title>
        </html>
        """
        # Expected: <title> holds the former <wa-title> markup and the
        # <wa-title> element itself is gone.
        expected_html = b"""
        <html>
            <head><title><b>hello</b>, world</title></head>
            <body>contents</body>
        </html>
        """

        doc = html_document_fromstring(source_html)
        webannotator.apply_wa_title(doc)

        expected_doc = html_document_fromstring(expected_html)
        self.assertHtmlTreeEqual(doc, expected_doc)
    def test_wa_convert_ignore_comments(self):
        """Check WebAnnotator conversion of an entity split by a comment."""
        source = b"""
        <html>
            <body>
                __START_ORG__ a
                <!--comment-->
                b __END_ORG__ cool
            </body>
        </html>
        """
        # The comment stays where it was; the text on each side of it gets
        # its own span, both carrying wa-id="0".
        wanted = r"""
        <html>
            <body>
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">
                    a
                </span>
                <!--comment-->
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">
                    b
                </span>
                cool
            </body>
            <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color>
        </html>
        """

        parsed = html_document_fromstring(source)
        serialized = tostring(webannotator.to_webannotator(parsed))

        self.assertHtmlEqual(serialized, wanted)
    def test_tokenize_scripts_and_styles(self):
        """Tokenizing skips <script>/<style> text without dropping the tags."""
        page = b"""
        <html>
          <head>
            <script>function foo(){}</script>
            <style>
              body {
                color: "red"
              }
            </style>
          </head>
          <body>hello</body>
        </html>
        """

        via_loader = HtmlLoader().loadbytes(page)
        via_parser = html_document_fromstring(page)

        tokenizer = HtmlTokenizer()
        tokens, tags = tokenizer.tokenize_single(via_loader)

        # A single token is expected: the "hello" text owned by <body>.
        self.assertEqual(len(tokens), 1)
        first = tokens[0]
        self.assertEqual(first.tokens, ['hello'])
        self.assertEqual(first.elem.tag, 'body')

        # The loader's tree must equal a direct parse of the same bytes,
        # so <script> and <style> are still present.
        self.assertHtmlTreeEqual(via_loader, via_parser)

        # Detokenization has to reproduce the loaded tree.
        rebuilt = tokenizer.detokenize_single(tokens, tags)
        self.assertHtmlTreeEqual(via_loader, rebuilt)
Example #7
0
 def loadbytes(self, data):
     """Parse ``data`` with ``self.encoding`` and return the processed tree.

     The title is fixed and the entities found in the tree are processed
     before ``self._cleanup_tree`` runs; its result is returned.
     """
     # Cleaning is intentionally the last step: custom cleaners must not
     # get a chance to strip WebAnnotator markup before it is processed.
     doc = html_document_fromstring(data, encoding=self.encoding)
     self._fix_title(doc)
     self._process_entities(self._get_entities(doc))
     cleaned = self._cleanup_tree(doc)
     return cleaned
Example #8
0
 def loadbytes(self, data):
     """Parse ``data`` with ``self.encoding`` and return the processed tree.

     Applies the WebAnnotator title, prunes tags when ``self.known_entities``
     is truthy, processes the entities found in the tree and finally returns
     the result of ``self._cleanup_tree``.
     """
     # Cleaning is intentionally the last step: custom cleaners must not
     # get a chance to strip WebAnnotator markup before it is processed.
     doc = html_document_fromstring(data, encoding=self.encoding)
     webannotator.apply_wa_title(doc)

     if self.known_entities:
         self._prune_tags(doc)

     self._process_entities(self._get_entities(doc))
     cleaned = self._cleanup_tree(doc)
     return cleaned
Example #9
0
 def loadbytes(self, data):
     """Build a tree from ``data`` and run the WebAnnotator processing steps.

     Returns whatever ``self._cleanup_tree`` produces for the processed tree.
     """
     root = html_document_fromstring(data, encoding=self.encoding)

     # All WebAnnotator-specific work happens before the cleanup call below,
     # so that custom cleaners cannot remove the markup it relies on.
     webannotator.apply_wa_title(root)
     if self.known_entities:
         self._prune_tags(root)
     found = self._get_entities(root)
     self._process_entities(found)

     return self._cleanup_tree(root)
Example #10
0
    def test_wa_convert_inner(self):
        """An entity crossing inline tags is wrapped piece by piece."""
        annotated_html = b"""
        <html>
          <head>
            <title> __START_PER__ Hello! __END_PER__  world!</title>
          </head>
          <body>
            <p>
              __START_ORG__ Scrapinghub
                <b>Inc has</b>an
                <b>office</b>in Montevideo __END_ORG__  cool
            </p>
          </body>
        </html>
        """
        # Expected: every text fragment of the ORG entity gets its own span
        # with wa-id="1"; the PER annotation from <title> shows up in a
        # separate <wa-title> element while <title> keeps plain text.
        expected_html = r"""
        <html>
          <head>
            <title>Hello! world!</title>
          </head>
          <body>
            <p>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                  Scrapinghub
              </span>
              <b>
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                    Inc has
                </span>
              </b>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                  an
              </span>
              <b>
                  <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                      office
                  </span>
              </b>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                  in Montevideo
              </span>
               cool
            </p>
          </body>
          <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_PER" type="PER"></wa-color>
          <wa-color id="WA-color-1" bg="#FF0000" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color>
          <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;">
            <span class="WebAnnotator_PER" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="PER">Hello!</span> world!
          </wa-title>
        </html>
        """

        doc = html_document_fromstring(annotated_html)
        converted = webannotator.to_webannotator(doc)
        self.assertHtmlEqual(tostring(converted), expected_html)
 def test_baseurl_exists(self):
     """A document that already has <base> is unchanged when url is given."""
     markup = b"""
     <html>
         <head><base href="http://example.com/foo"/></head>
         <body><p>hello</p></body>
     </html>
     """
     doc = html_document_fromstring(markup)
     converted = webannotator.to_webannotator(
         doc, url='http://example.com/bar'
     )
     # The serialized result must equal the input markup.
     serialized = tostring(converted)
     self.assertHtmlEqual(serialized, markup)
    def test_wa_convert_inner(self):
        """Check conversion of an ORG entity that spans nested <b> tags."""
        source = b"""
        <html>
          <head>
            <title> __START_PER__ Hello! __END_PER__  world!</title>
          </head>
          <body>
            <p>
              __START_ORG__ Scrapinghub
                <b>Inc has</b>an
                <b>office</b>in Montevideo __END_ORG__  cool
            </p>
          </body>
        </html>
        """
        # Each text piece of the ORG entity is expected in a span of its own
        # (all with wa-id="1"), and the annotated title is expected inside a
        # <wa-title> element next to the <wa-color> definitions.
        wanted = r"""
        <html>
          <head>
            <title>Hello! world!</title>
          </head>
          <body>
            <p>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                  Scrapinghub
              </span>
              <b>
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                    Inc has
                </span>
              </b>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                  an
              </span>
              <b>
                  <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                      office
                  </span>
              </b>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG">
                  in Montevideo
              </span>
               cool
            </p>
          </body>
          <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_PER" type="PER"></wa-color>
          <wa-color id="WA-color-1" bg="#FF0000" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color>
          <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;">
            <span class="WebAnnotator_PER" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="PER">Hello!</span> world!
          </wa-title>
        </html>
        """

        parsed = html_document_fromstring(source)
        serialized = tostring(webannotator.to_webannotator(parsed))

        self.assertHtmlEqual(serialized, wanted)
Example #13
0
 def from_htmlbytes(cls, html_bytes, encoding=None):
     """Build an instance of ``cls`` from the <wa-color> elements of a page.

     ``html_bytes`` is parsed with the given ``encoding``. For every
     ``<wa-color>`` element the ``type`` attribute becomes the key and
     ``(fg, bg, index)`` the value, where the index is the integer that
     follows the ``WA-color-`` prefix of the element's ``id``.
     """
     prefix_len = len("WA-color-")
     result = cls()
     doc = html_document_fromstring(html_bytes, encoding=encoding)
     for node in doc.xpath('//wa-color'):
         color_id = node.get('id')
         # the prefix is checked case-insensitively; only its length is
         # used to cut the numeric suffix out of the id
         assert color_id.lower().startswith('wa-color-')
         index = int(color_id[prefix_len:])
         result[node.get('type')] = (node.get('fg'), node.get('bg'), index)
     return result
Example #14
0
    def test_dont_tokenize_nontext_nodes(self):
        """A body holding only an <?xml ...?> instruction yields no tokens."""
        markup = b"""
          <body>
              <?xml version="1.0" encoding="UTF-8" standalone="no"?>
          </body>
        """

        doc = html_document_fromstring(markup)
        tokens, _tags = HtmlTokenizer().tokenize_single(doc)
        self.assertEqual(len(tokens), 0)
Example #15
0
 def test_baseurl_exists(self):
     """Passing a url must not alter a page that already declares <base>."""
     page = b"""
     <html>
         <head><base href="http://example.com/foo"/></head>
         <body><p>hello</p></body>
     </html>
     """
     parsed = html_document_fromstring(page)
     wa_doc = webannotator.to_webannotator(parsed,
                                           url='http://example.com/bar')
     # Output is compared against the very same bytes that were parsed.
     self.assertHtmlEqual(tostring(wa_doc), page)
Example #16
0
    def test_detokenize_single(self):
        """Tokenize an annotated tree and detokenize it back unchanged."""
        annotated = self._load()
        pristine = deepcopy(annotated)

        tok = HtmlTokenizer()
        tokens, tags = tok.tokenize_single(annotated)
        # The tree reachable from the first token is the one expected to
        # have the start marker removed; the source tree keeps it.
        stripped = tokens[0].root
        marker = b'__START_ORG__'
        self.assertIn(marker, tostring(annotated))
        self.assertNotIn(marker, tostring(stripped))

        expected_clean = html_document_fromstring(UNANNOTATED_HTML)
        self.assertHtmlTreeEqual(stripped, expected_clean)

        # Detokenizing with the collected tags must bring the marker back.
        restored = tok.detokenize_single(tokens, tags)
        self.assertIn(marker, tostring(restored))

        expected_annotated = html_document_fromstring(ANNOTATED_HTML)
        self.assertHtmlTreeEqual(restored, expected_annotated)
        self.assertHtmlTreeEqual(restored, pristine)
        self.assertHtmlTreeEqual(restored, annotated)
    def test_dont_tokenize_nontext_nodes(self):
        """Expect zero tokens when <body> has only an <?xml ...?> node."""
        page = b"""
          <body>
              <?xml version="1.0" encoding="UTF-8" standalone="no"?>
          </body>
        """

        parsed = html_document_fromstring(page)
        tok = HtmlTokenizer()
        produced, _ = tok.tokenize_single(parsed)
        token_count = len(produced)
        self.assertEqual(token_count, 0)
Example #18
0
 def from_htmlbytes(cls, html_bytes, encoding=None):
     """Collect the <wa-color> definitions of an HTML page into ``cls()``.

     The page is parsed from ``html_bytes`` using ``encoding``. Each
     ``<wa-color>`` element is stored under its ``type`` attribute as a
     ``(fg, bg, idx)`` tuple; ``idx`` is parsed from the part of the ``id``
     attribute that follows ``WA-color-``.
     """
     colors = cls()
     page = html_document_fromstring(html_bytes, encoding=encoding)
     for element in page.xpath('//wa-color'):
         element_id = element.get('id')
         assert element_id.lower().startswith('wa-color-')
         # everything after the fixed-length prefix is the numeric index
         suffix = element_id[len("WA-color-"):]
         entry = (element.get('fg'), element.get('bg'), int(suffix))
         colors[element.get('type')] = entry
     return colors
 def test_handle_nonxml_attributes(self):
     """An attribute name containing ':' survives the conversion."""
     markup = b"""
     <html>
       <body>
         <a class="addthis_button_facebook_like" like:layout="button_count">
       </body>
     </html>
     """
     doc = html_document_fromstring(markup)
     converted = webannotator.to_webannotator(doc)
     # The serialized output is compared with the input markup itself.
     self.assertHtmlEqual(tostring(converted), markup)
Example #20
0
 def test_handle_nonxml_attributes(self):
     """Conversion must cope with the ``like:layout`` attribute on <a>."""
     page = b"""
     <html>
       <body>
         <a class="addthis_button_facebook_like" like:layout="button_count">
       </body>
     </html>
     """
     parsed = html_document_fromstring(page)
     serialized = tostring(webannotator.to_webannotator(parsed))

     # Nothing is annotated here, so the output should equal the input.
     self.assertHtmlEqual(serialized, page)
    def test_wa_convert_no_title(self):
        """Conversion works for a document that has no <title> element."""
        annotated_html = b"""
        <html><body><p> __START_ORG__ Scrapinghub __END_ORG__ </p></body></html>
        """
        # Expected: the entity wrapped in a span plus one <wa-color>
        # definition; no <wa-title> element appears.
        expected_html = br"""
        <html>
          <body>
            <p>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">Scrapinghub</span>
            </p>
          </body>
          <wa-color bg="#33CCFF" class="WebAnnotator_ORG" fg="#000000" id="WA-color-0" type="ORG"></wa-color>
        </html>
        """

        doc = html_document_fromstring(annotated_html)
        converted = webannotator.to_webannotator(doc)
        self.assertHtmlEqual(tostring(converted), expected_html)
Example #22
0
    def test_wa_convert_no_title(self):
        """Check the converted output for a page without a <title>."""
        source = b"""
        <html><body><p> __START_ORG__ Scrapinghub __END_ORG__ </p></body></html>
        """
        # Only a span around the entity and a single <wa-color> element are
        # expected to be added.
        wanted = r"""
        <html>
          <body>
            <p>
              <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">Scrapinghub</span>
            </p>
          </body>
          <wa-color bg="#33CCFF" class="WebAnnotator_ORG" fg="#000000" id="WA-color-0" type="ORG"></wa-color>
        </html>
        """

        parsed = html_document_fromstring(source)
        serialized = tostring(webannotator.to_webannotator(parsed))

        self.assertHtmlEqual(serialized, wanted)
Example #23
0
    def test_wa_convert_crosstitle(self):
        """An entity opened in <title> and closed in <body> is converted."""
        annotated_html = b"""
        <html>
            <head>
                <title>
                     __START_ORG__ a __END_ORG__  b  __START_ORG__ a
                </title>
            </head>
            <body>
                a __END_ORG__ a
            </body>
        </html>
        """
        # Expected: <title> keeps plain text, the body part of the second
        # entity is wrapped with wa-id="1", and <wa-title> carries spans for
        # both entities (wa-id="0" and wa-id="1").
        expected_html = r"""
        <html>
            <head>
                <title>
                    a b a
                </title>
            </head>
            <body>
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG">
                    a
                </span>
                a
            </body>
            <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color>
            <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;">
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">
                    a
                </span>
                b
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG">
                    a
                </span>
          </wa-title>
        </html>
        """

        doc = html_document_fromstring(annotated_html)
        converted = webannotator.to_webannotator(doc)
        self.assertHtmlEqual(tostring(converted), expected_html)
    def test_wa_convert_crosstitle(self):
        """Check conversion when an ORG entity spans <title> and <body>."""
        source = b"""
        <html>
            <head>
                <title>
                     __START_ORG__ a __END_ORG__  b  __START_ORG__ a
                </title>
            </head>
            <body>
                a __END_ORG__ a
            </body>
        </html>
        """
        # The second entity (wa-id="1") is expected both in the body and in
        # <wa-title>; the first one (wa-id="0") only in <wa-title>.
        wanted = r"""
        <html>
            <head>
                <title>
                    a b a
                </title>
            </head>
            <body>
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG">
                    a
                </span>
                a
            </body>
            <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color>
            <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;">
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">
                    a
                </span>
                b
                <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG">
                    a
                </span>
          </wa-title>
        </html>
        """

        parsed = html_document_fromstring(source)
        serialized = tostring(webannotator.to_webannotator(parsed))

        self.assertHtmlEqual(serialized, wanted)
Example #25
0
 def loadbytes(self, data):
     """Parse ``data`` and return what ``self.cleaner.clean_html`` gives.

     ``self.encoding`` is passed to the parser as its second argument.
     """
     parsed = html_document_fromstring(data, self.encoding)
     cleaned = self.cleaner.clean_html(parsed)
     return cleaned
Example #26
0
 def loadbytes(self, data):
     """Return the cleaned HTML tree built from the raw ``data``.

     Parsing receives ``self.encoding``; the parsed document is then handed
     to ``self.cleaner.clean_html`` and that call's result is returned.
     """
     document = html_document_fromstring(data, self.encoding)
     result = self.cleaner.clean_html(document)
     return result
 def assertApplyWaTitle(self, source, result):
     """Assert that apply_wa_title turns ``source`` into ``result``.

     Both arguments are HTML inputs for ``html_document_fromstring``; the
     tree parsed from ``source`` is modified in place and then compared
     with the tree parsed from ``result``.
     """
     actual = html_document_fromstring(source)
     webannotator.apply_wa_title(actual)
     expected = html_document_fromstring(result)
     self.assertHtmlTreeEqual(actual, expected)
Example #28
0
 def assertApplyWaTitle(self, source, result):
     """Check the tree produced by apply_wa_title against ``result``.

     ``source`` is parsed and passed to ``webannotator.apply_wa_title``;
     the resulting tree must equal the tree parsed from ``result``.
     """
     doc = html_document_fromstring(source)
     webannotator.apply_wa_title(doc)

     wanted = html_document_fromstring(result)
     self.assertHtmlTreeEqual(doc, wanted)