def test_wa_convert_ignore_comments(self):
    # An ORG entity whose text is interrupted by an HTML comment: the
    # expected markup wraps the text on each side of the comment in its own
    # span (both with wa-id="0") and leaves the comment itself unwrapped.
    expected = r""" <html> <body> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> <!--comment--> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG"> b </span> cool </body> <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color> </html> """
    source_doc = html_document_fromstring(b""" <html> <body> __START_ORG__ a <!--comment--> b __END_ORG__ cool </body> </html> """)
    annotated_doc = webannotator.to_webannotator(source_doc)
    actual = tostring(annotated_doc)
    self.assertHtmlEqual(actual, expected)
def test_wa_convert_ignore_comments(self):
    # Input: a single ORG entity with an HTML comment in the middle of it.
    marked_up = b""" <html> <body> __START_ORG__ a <!--comment--> b __END_ORG__ cool </body> </html> """
    # Expected: two spans sharing wa-id="0" around "a" and "b", with the
    # comment left between them, plus one wa-color element for ORG.
    wanted = r""" <html> <body> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> <!--comment--> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG"> b </span> cool </body> <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color> </html> """
    doc = html_document_fromstring(marked_up)
    converted = webannotator.to_webannotator(doc)
    rendered = tostring(converted)
    self.assertHtmlEqual(rendered, wanted)
def annotate(self, bytes_data, pretty_print=False):
    """
    Annotate ``bytes_data`` and return it serialized in WebAnnotator format.

    The input is run through ``self.extract_raw``, the resulting tokens and
    tags are turned back into a tree by
    ``self.html_tokenizer.detokenize_single``, and that tree is converted
    with ``to_webannotator`` using ``self.entity_colors``.

    ``pretty_print`` is forwarded unchanged to ``tostring``.
    """
    tokens, labels = self.extract_raw(bytes_data)
    plain_tree = self.html_tokenizer.detokenize_single(tokens, labels)
    annotated_tree = to_webannotator(plain_tree, self.entity_colors)
    return tostring(annotated_tree, pretty_print=pretty_print)
def annotate(self, bytes_data, url=None, pretty_print=False):
    """
    Annotate ``bytes_data`` and return it serialized in WebAnnotator format.

    The input is run through ``self.extract_raw``, the resulting tokens and
    tags are turned back into a tree by
    ``self.html_tokenizer.detokenize_single``, and that tree is converted
    with ``to_webannotator``.

    ``url`` and ``self.entity_colors`` are passed to ``to_webannotator`` as
    keyword arguments; ``pretty_print`` is forwarded unchanged to
    ``tostring``.
    """
    tokens, labels = self.extract_raw(bytes_data)
    plain_tree = self.html_tokenizer.detokenize_single(tokens, labels)
    annotated_tree = to_webannotator(
        plain_tree,
        entity_colors=self.entity_colors,
        url=url,
    )
    return tostring(annotated_tree, pretty_print=pretty_print)
def test_wa_convert_inner(self):
    # A PER entity inside <title> and an ORG entity that runs across nested
    # <b> elements. Expected: the <title> text stays plain and the PER span
    # appears in a separate <wa-title> element; the ORG entity is split
    # into several spans that all carry wa-id="1".
    expected = r""" <html> <head> <title>Hello! world!</title> </head> <body> <p> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> Scrapinghub </span> <b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> Inc has </span> </b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> an </span> <b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> office </span> </b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> in Montevideo </span> cool </p> </body> <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_PER" type="PER"></wa-color> <wa-color id="WA-color-1" bg="#FF0000" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color> <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;"> <span class="WebAnnotator_PER" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="PER">Hello!</span> world! </wa-title> </html> """
    source_doc = html_document_fromstring(b""" <html> <head> <title> __START_PER__ Hello! __END_PER__ world!</title> </head> <body> <p> __START_ORG__ Scrapinghub <b>Inc has</b>an <b>office</b>in Montevideo __END_ORG__ cool </p> </body> </html> """)
    annotated_doc = webannotator.to_webannotator(source_doc)
    actual = tostring(annotated_doc)
    self.assertHtmlEqual(actual, expected)
def test_baseurl_exists(self):
    # The document already declares <base href=...>; converting it with a
    # different url= argument is expected to give back markup equal to the
    # input.
    markup = b""" <html> <head><base href="http://example.com/foo"/></head> <body><p>hello</p></body> </html> """
    source_doc = html_document_fromstring(markup)
    annotated_doc = webannotator.to_webannotator(
        source_doc,
        url='http://example.com/bar',
    )
    self.assertHtmlEqual(tostring(annotated_doc), markup)
def test_wa_convert_inner(self):
    # Input: PER entity in the <title>, ORG entity spanning text and <b>
    # children inside a <p>.
    marked_up = b""" <html> <head> <title> __START_PER__ Hello! __END_PER__ world!</title> </head> <body> <p> __START_ORG__ Scrapinghub <b>Inc has</b>an <b>office</b>in Montevideo __END_ORG__ cool </p> </body> </html> """
    # Expected: plain <title>, one ORG span per text fragment (all with
    # wa-id="1"), two wa-color elements and a <wa-title> holding the PER
    # span.
    wanted = r""" <html> <head> <title>Hello! world!</title> </head> <body> <p> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> Scrapinghub </span> <b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> Inc has </span> </b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> an </span> <b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> office </span> </b> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#FF0000;" wa-id="1" wa-subtypes="" wa-type="ORG"> in Montevideo </span> cool </p> </body> <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_PER" type="PER"></wa-color> <wa-color id="WA-color-1" bg="#FF0000" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color> <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;"> <span class="WebAnnotator_PER" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="PER">Hello!</span> world! </wa-title> </html> """
    doc = html_document_fromstring(marked_up)
    converted = webannotator.to_webannotator(doc)
    rendered = tostring(converted)
    self.assertHtmlEqual(rendered, wanted)
def test_handle_nonxml_attributes(self):
    # The <a> element carries an attribute name containing a colon
    # (like:layout). With no entity markers in the input, the converted
    # output is expected to equal the input markup.
    markup = b""" <html> <body> <a class="addthis_button_facebook_like" like:layout="button_count"> </body> </html> """
    source_doc = html_document_fromstring(markup)
    annotated_doc = webannotator.to_webannotator(source_doc)
    actual = tostring(annotated_doc)
    self.assertHtmlEqual(actual, markup)
def test_wa_convert_no_title(self):
    # The input has no <head>/<title>. The expected markup contains the ORG
    # span and its wa-color element, and no wa-title element.
    expected = br""" <html> <body> <p> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">Scrapinghub</span> </p> </body> <wa-color bg="#33CCFF" class="WebAnnotator_ORG" fg="#000000" id="WA-color-0" type="ORG"></wa-color> </html> """
    source_doc = html_document_fromstring(b""" <html><body><p> __START_ORG__ Scrapinghub __END_ORG__ </p></body></html> """)
    annotated_doc = webannotator.to_webannotator(source_doc)
    actual = tostring(annotated_doc)
    self.assertHtmlEqual(actual, expected)
def test_wa_convert_no_title(self):
    # Input: a body-only document with one ORG entity and no <title>.
    marked_up = b""" <html><body><p> __START_ORG__ Scrapinghub __END_ORG__ </p></body></html> """
    # Expected: the entity wrapped in a span, one wa-color element, and no
    # wa-title element.
    wanted = r""" <html> <body> <p> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG">Scrapinghub</span> </p> </body> <wa-color bg="#33CCFF" class="WebAnnotator_ORG" fg="#000000" id="WA-color-0" type="ORG"></wa-color> </html> """
    doc = html_document_fromstring(marked_up)
    converted = webannotator.to_webannotator(doc)
    rendered = tostring(converted)
    self.assertHtmlEqual(rendered, wanted)
def test_wa_convert_crosstitle(self):
    # The second ORG entity opens inside <title> and closes inside <body>.
    # Expected: <title> text stays plain; the body part of the entity gets
    # a span with wa-id="1"; <wa-title> holds spans for both title
    # fragments (wa-id="0" and wa-id="1").
    expected = r""" <html> <head> <title> a b a </title> </head> <body> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> a </body> <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color> <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;"> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> b <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> </wa-title> </html> """
    source_doc = html_document_fromstring(b""" <html> <head> <title> __START_ORG__ a __END_ORG__ b __START_ORG__ a </title> </head> <body> a __END_ORG__ a </body> </html> """)
    annotated_doc = webannotator.to_webannotator(source_doc)
    actual = tostring(annotated_doc)
    self.assertHtmlEqual(actual, expected)
def test_wa_convert_crosstitle(self):
    # Input: one ORG entity fully inside <title>, and a second one that
    # starts in <title> and ends in <body>.
    marked_up = b""" <html> <head> <title> __START_ORG__ a __END_ORG__ b __START_ORG__ a </title> </head> <body> a __END_ORG__ a </body> </html> """
    # Expected: plain <title>, a wa-id="1" span in the body, a single
    # wa-color element, and a <wa-title> with spans for wa-id 0 and 1.
    wanted = r""" <html> <head> <title> a b a </title> </head> <body> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> a </body> <wa-color id="WA-color-0" bg="#33CCFF" fg="#000000" class="WebAnnotator_ORG" type="ORG"></wa-color> <wa-title style="box-shadow:0 0 1em black;border:2px solid blue;padding:0.5em;"> <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="0" wa-subtypes="" wa-type="ORG"> a </span> b <span class="WebAnnotator_ORG" style="color:#000000; background-color:#33CCFF;" wa-id="1" wa-subtypes="" wa-type="ORG"> a </span> </wa-title> </html> """
    doc = html_document_fromstring(marked_up)
    converted = webannotator.to_webannotator(doc)
    rendered = tostring(converted)
    self.assertHtmlEqual(rendered, wanted)