Ejemplo n.º 1
0
 def test_extraction_tag_caption_td_th(self):
     """Table markup yields one unit per caption, header cell and data cell.

     The fixture is the caption example from
     http://www.w3schools.com/tags/tag_caption.asp.
     """
     markup = """
         <table>
             <caption>Monthly savings</caption>
             <tr>
                 <th>Month</th>
                 <th>Savings</th>
             </tr>
             <tr>
                 <td>January</td>
                 <td>$100</td>
             </tr>
         </table>"""
     expected_sources = [
         "Monthly savings",
         "Month",
         "Savings",
         "January",
         "$100",
     ]
     parsed = html.htmlfile().parsestring(markup)
     # Echo the first unit so it shows up in the captured test output.
     print(parsed.units[0].source)
     assert len(parsed.units) == 5
     assert [unit.source for unit in parsed.units] == expected_sources
Ejemplo n.º 2
0
    def test_extraction_attr_title(self):
        """Check that ``title`` attribute values are extracted as units.

        Covers a title on an inline <abbr>, a title on a <p>, and <table> /
        <form> start tags whose title is their only extracted content.
        """
        parser = html.htmlfile()

        # Example from http://www.w3schools.com/tags/att_global_title.asp
        paragraphs = """
            <p><abbr title="World Health Organization">WHO</abbr> was founded in 1948.</p>
            <p title="Free Web tutorials">W3Schools.com</p>"""
        parsed = parser.parsestring(paragraphs)
        print(parsed.units[0].source)
        assert len(parsed.units) == 4
        # FIXME: not ideal - the <abbr> title is extracted on its own (first
        # entry) and is still present inside the paragraph unit (second
        # entry). Either drop title= there or do not extract it earlier.
        assert [unit.source for unit in parsed.units] == [
            "World Health Organization",
            '<abbr title="World Health Organization">WHO</abbr> was founded in 1948.',
            "Free Web tutorials",
            "W3Schools.com",
        ]

        # Example from http://www.netmechanic.com/news/vol6/html_no1.htm
        table_markup = """
            <table width="100" border="2" title="Henry Jacobs Camp summer 2003 schedule">
        """
        # FIXME this doesn't extract as I'd have expected
        #store = h.parsestring("""
        #    <a href="page1.html" title="HS Jacobs - a UAHC camp in Utica, MS">Henry S. Jacobs Camp</a>
        #""")
        #assert len(store.units) == 2
        #assert store.units[0].source == "HS Jacobs - a UAHC camp in Utica, MS"
        #assert store.units[1].source == "Henry S. Jacobs Camp"
        form_markup = """
            <form name="application" title="Henry Jacobs camper application" method="  " action="  ">
        """
        title_only_cases = (
            (table_markup, "Henry Jacobs Camp summer 2003 schedule"),
            (form_markup, "Henry Jacobs camper application"),
        )
        for markup, expected_title in title_only_cases:
            parsed = parser.parsestring(markup)
            assert len(parsed.units) == 1
            assert parsed.units[0].source == expected_title
Ejemplo n.º 3
0
 def mergestore(self, inputstore, templatetext, includefuzzy):
     """Parse the template with ``self.lookup`` as callback and return the result.

     ``inputstore`` is stored on the instance and indexed via ``makeindex()``;
     ``includefuzzy`` is stored as well (presumably both are read by
     ``self.lookup``, which is not visible here). The return value is the
     ``filesrc`` attribute of the ``htmlfile`` built from ``templatetext``.

     NOTE(review): the previous docstring read "converts a file to .po
     format", which does not obviously match returning ``filesrc`` - confirm
     the intended wording.
     """
     self.inputstore = inputstore
     self.inputstore.makeindex()
     self.includefuzzy = includefuzzy
     return html.htmlfile(inputfile=templatetext, callback=self.lookup).filesrc
Ejemplo n.º 4
0
 def mergestore(self, inputstore, templatetext, includefuzzy):
     """Return ``filesrc`` of the template parsed with ``self.lookup`` as callback.

     Before parsing, the instance keeps ``inputstore`` (after calling its
     ``makeindex()``) and the ``includefuzzy`` flag.

     NOTE(review): the earlier docstring said "converts a file to .po
     format"; that wording looks inherited from another converter - verify.
     """
     self.inputstore = inputstore
     self.inputstore.makeindex()
     self.includefuzzy = includefuzzy
     # The htmlfile is only needed for its rendered source.
     merged = html.htmlfile(inputfile=templatetext, callback=self.lookup)
     return merged.filesrc
Ejemplo n.º 5
0
 def test_escaping_script_and_pre(self):
     """<script> and <pre> can contain < and > and these should not be
     interpreted as tags.

     Only the <script> case is exercised here: the markup-like text inside
     the script element must not add units beyond the leading paragraph.
     """
     h = html.htmlfile()
     store = h.parsestring("<p>We are here</p><script>Some </tag>like data<script></p>")
     # Call form of print: the bare ``print x`` statement is a SyntaxError on
     # Python 3, and with a single argument this also works on Python 2.
     print(store.units[0].source)
     assert len(store.units) == 1
Ejemplo n.º 6
0
    def test_extraction_attr_title(self):
        """Check that ``title`` attribute values are extracted as units.

        Covers a paragraph containing an inline <abbr title=...>, a <p> with
        a title, <table> and <form> start tags with a title, and an <a> with
        both a title and link text.
        """
        parser = html.htmlfile()

        # Example from http://www.w3schools.com/tags/att_global_title.asp
        paragraphs = """
            <p><abbr title="World Health Organization">WHO</abbr> was founded in 1948.</p>
            <p title="Free Web tutorials">W3Schools.com</p>"""
        parsed = parser.parsestring(paragraphs)
        print(parsed.units[0].source)
        assert len(parsed.units) == 3
        assert [unit.source for unit in parsed.units] == [
            '<abbr title="World Health Organization">WHO</abbr> was founded in 1948.',
            "Free Web tutorials",
            "W3Schools.com",
        ]

        # Example from http://www.netmechanic.com/news/vol6/html_no1.htm
        table_markup = """
            <table width="100" border="2" title="Henry Jacobs Camp summer 2003 schedule">
        """
        parsed = parser.parsestring(table_markup)
        assert len(parsed.units) == 1
        assert parsed.units[0].source == "Henry Jacobs Camp summer 2003 schedule"

        # Anchor inside a <div>: the title comes first, then the link text.
        anchor_markup = """
           <div><a href="page1.html" title="HS Jacobs - a UAHC camp in Utica, MS">Henry S. Jacobs Camp</a></div>
        """
        parsed = parser.parsestring(anchor_markup)
        assert len(parsed.units) == 2
        assert [unit.source for unit in parsed.units] == [
            "HS Jacobs - a UAHC camp in Utica, MS",
            "Henry S. Jacobs Camp",
        ]

        form_markup = """
            <form name="application" title="Henry Jacobs camper application" method="  " action="  ">
        """
        parsed = parser.parsestring(form_markup)
        assert len(parsed.units) == 1
        assert parsed.units[0].source == "Henry Jacobs camper application"
Ejemplo n.º 7
0
def test_strip_html_with_pi():
    """strip_html() reduces an anchor to its text when the href holds ``<? ... ?>``.

    Each snippet is first passed through ``pi_escape`` and then stripped; only
    the link text "Something" may remain.
    """
    parser = html.htmlfile()
    snippets = (
        '<a href="<?$var?>">Something</a>',
        '<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">Something</a>',
    )
    for snippet in snippets:
        assert html.strip_html(parser.pi_escape(snippet)) == "Something"
Ejemplo n.º 8
0
 def test_escaping_script_and_pre(self):
     """Tag-like text inside <script> must not be treated as markup.

     The original description also mentions <pre>, but only the <script>
     case is exercised: the whole input has to come out as a single unit.
     """
     markup = "<p>We are here</p><script>Some </tag>like data<script></p>"
     parsed = html.htmlfile().parsestring(markup)
     print(parsed.units[0].source)
     assert len(parsed.units) == 1
Ejemplo n.º 9
0
 def convertfile_inner(inputfile, outputstore, keepcomments):
     """Copy every unit parsed from an HTML file into ``outputstore``.

     For each unit of ``html.htmlfile(inputfile=inputfile)`` a source unit is
     added to ``outputstore`` and given the HTML unit's locations. The HTML
     unit's notes are attached as "developer" notes only when
     ``keepcomments`` is truthy. Nothing is returned.
     """
     parsed_html = html.htmlfile(inputfile=inputfile)
     for unit in parsed_html.units:
         po_unit = outputstore.addsourceunit(unit.source)
         po_unit.addlocations(unit.getlocations())
         if not keepcomments:
             continue
         po_unit.addnote(unit.getnotes(), "developer")
Ejemplo n.º 10
0
 def test_extraction_attr_alt(self):
     """Check that we can extract the ``alt`` attribute of an <img>.

     The alt text has to be the only unit produced.
     """
     # Example from http://www.netmechanic.com/news/vol6/html_no1.htm
     image_markup = """
         <img src="cafeteria.jpg" height="200" width="200" alt="UAHC campers enjoy a meal in the camp cafeteria">
     """
     parsed = html.htmlfile().parsestring(image_markup)
     assert len(parsed.units) == 1
     assert parsed.units[0].source == "UAHC campers enjoy a meal in the camp cafeteria"
Ejemplo n.º 11
0
def test_guess_encoding():
    """guess_encoding() returns the charset named in a Content-Type <meta>.

    Two byte-string inputs are checked: a lone upper-case META tag, and a
    longer document head that declares the charset twice.
    """
    cases = (
        (
            b"""<META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-8">""",
            "UTF-8",
        ),
        (
            b"""<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><!-- base href="http://home.online.no/~rut-aane/linux.html" --><link rel="shortcut icon" href="http://home.online.no/~rut-aane/peng16x16a.gif"><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"><meta name="Description" content="Linux newbie stuff and a little about Watching TV under Linux"><meta name="MSSmartTagsPreventParsing" content="TRUE"><meta name="GENERATOR" content="Mozilla/4.7 [en] (X11; I; Linux 2.2.5-15 i586) [Netscape]"><title>Some Linux for beginners</title><style type="text/css">""",
            "iso-8859-1",
        ),
    )
    parser = html.htmlfile()
    for document, expected_encoding in cases:
        assert parser.guess_encoding(document) == expected_encoding
Ejemplo n.º 12
0
 def test_extraction_attr_alt(self):
     """An <img> contributes exactly one unit: the text of its ``alt`` attribute."""
     parser = html.htmlfile()
     # Example from http://www.netmechanic.com/news/vol6/html_no1.htm
     units = parser.parsestring("""
         <img src="cafeteria.jpg" height="200" width="200" alt="UAHC campers enjoy a meal in the camp cafeteria">
     """).units
     assert len(units) == 1
     alt_text = units[0].source
     assert alt_text == "UAHC campers enjoy a meal in the camp cafeteria"
Ejemplo n.º 13
0
 def convertfile(storefile):
     """Build a ``pofile`` from the HTML content read from ``storefile``.

     The content is parsed with ``includeuntaggeddata=False``. Every parsed
     unit becomes a source unit carrying the HTML unit's locations and its
     notes (as "developer" notes). ``removeduplicates("msgctxt")`` is applied
     before the store is returned.
     """
     po_store = pofile()
     # Fake input file with a blank filename
     fake_input = BytesIOMode("", storefile.read())
     parsed_html = htmlfile(includeuntaggeddata=False, inputfile=fake_input)
     for unit in parsed_html.units:
         po_unit = po_store.addsourceunit(unit.source)
         po_unit.addlocations(unit.getlocations())
         po_unit.addnote(unit.getnotes(), "developer")
     po_store.removeduplicates("msgctxt")
     return po_store
Ejemplo n.º 14
0
    def test_extraction_pre_code():
        """Check that we can preserve lines in the <pre> tag"""
        h = html.htmlfile()
        store = h.parsestring("""
<pre><code>
this is
a multiline
pre tag
</code></pre>
        """)
        assert len(store.units) == 1
        assert store.units[0].source == "this is\na multiline\npre tag"
Ejemplo n.º 15
0
 def test_extraction_tag_figcaption(self):
     """A <figure> yields the image alt text followed by the <figcaption> text.

     The fixture is the example from
     http://www.w3schools.com/tags/tag_figcaption.asp.
     """
     figure_markup = """
            <figure>
                <img src="img_pulpit.jpg" alt="The Pulpit Rock" width="304" height="228">
                <figcaption>Fig1. - A view of the pulpit rock in Norway.</figcaption>
            </figure>"""
     parsed = html.htmlfile().parsestring(figure_markup)
     print(parsed.units[0].source)
     assert len(parsed.units) == 2
     assert [unit.source for unit in parsed.units] == [
         "The Pulpit Rock",
         "Fig1. - A view of the pulpit rock in Norway.",
     ]
Ejemplo n.º 16
0
 def convertfile(
     self,
     inputfile,
     filename,
     includeuntagged=False,
     duplicatestyle="msgctxt",
     keepcomments=False,
 ):
     """Convert an HTML file into a ``po.pofile`` and return it.

     ``inputfile`` is parsed by ``html.htmlfile`` with
     ``includeuntaggeddata=includeuntagged``. Each parsed unit is added as a
     source unit with the HTML unit's locations; its notes are attached as
     "developer" notes only when ``keepcomments`` is truthy. Finally
     ``removeduplicates(duplicatestyle)`` is applied to the store.

     ``filename`` is accepted but not used in this body.
     """
     po_store = po.pofile()
     parsed_html = html.htmlfile(
         includeuntaggeddata=includeuntagged, inputfile=inputfile
     )
     for unit in parsed_html.units:
         po_unit = po_store.addsourceunit(unit.source)
         po_unit.addlocations(unit.getlocations())
         if not keepcomments:
             continue
         po_unit.addnote(unit.getnotes(), "developer")
     po_store.removeduplicates(duplicatestyle)
     return po_store
Ejemplo n.º 17
0
 def test_extraction_tag_figcaption(self):
     """Check that the <figcaption> text is extracted after the image's alt text."""
     parser = html.htmlfile()
     # Example from http://www.w3schools.com/tags/tag_figcaption.asp
     units = parser.parsestring("""
            <figure>
                <img src="img_pulpit.jpg" alt="The Pulpit Rock" width="304" height="228">
                <figcaption>Fig1. - A view of the pulpit rock in Norway.</figcaption>
            </figure>""").units
     print(units[0].source)
     assert len(units) == 2
     expected_sources = (
         "The Pulpit Rock",
         "Fig1. - A view of the pulpit rock in Norway.",
     )
     for unit, expected in zip(units, expected_sources):
         assert unit.source == expected
Ejemplo n.º 18
0
 def convertfile(storefile, template_store):
     """Build a ``pofile`` from the HTML content read from ``storefile``.

     When ``template_store`` is truthy, each parsed unit is matched to a
     template unit via ``template_store.find_unit_mono`` using the unit's
     joined location strings. The template's source becomes the new unit's
     source, and units with no match are skipped. Otherwise the parsed
     unit's own text is used as the source. In both cases the parsed unit's
     text becomes the target, and its locations and notes (as "developer"
     notes) are copied over.

     ``removeduplicates("msgctxt")`` is applied before the store is returned.
     """
     store = pofile()
     # Fake input file with a blank filename
     htmlparser = htmlfile(inputfile=BytesIOMode("", storefile.read()))
     for htmlunit in htmlparser.units:
         locations = htmlunit.getlocations()
         if template_store:
             # Translation: look up the matching template unit by location
             template = template_store.find_unit_mono("".join(locations))
             if template is None:
                 # Skip locations not present in the source HTML file
                 continue
             # Create unit with matching source
             source = template.source
         else:
             # Source file
             source = htmlunit.source
         thepo = store.addsourceunit(source)
         thepo.target = htmlunit.source
         # Reuse the locations fetched above instead of asking the unit again.
         thepo.addlocations(locations)
         thepo.addnote(htmlunit.getnotes(), "developer")
     store.removeduplicates("msgctxt")
     return store
Ejemplo n.º 19
0
 def test_extraction_tag_caption_td_th(self):
     """Check that <caption>, <th> and <td> contents are each extracted, in document order."""
     parser = html.htmlfile()
     # Example from http://www.w3schools.com/tags/tag_caption.asp
     table_markup = """
         <table>
             <caption>Monthly savings</caption>
             <tr>
                 <th>Month</th>
                 <th>Savings</th>
             </tr>
             <tr>
                 <td>January</td>
                 <td>$100</td>
             </tr>
         </table>"""
     units = parser.parsestring(table_markup).units
     print(units[0].source)
     assert len(units) == 5
     expected_sources = ("Monthly savings", "Month", "Savings", "January", "$100")
     for unit, expected in zip(units, expected_sources):
         assert unit.source == expected
Ejemplo n.º 20
0
    def test_extraction_attr_title(self):
        """Verify extraction of ``title`` attribute text.

        The first fixture mixes an inline <abbr title=...> with a titled <p>;
        the remaining fixtures are start tags (<table>, <form>) for which the
        title is the single extracted unit.
        """
        parser = html.htmlfile()

        # Example from http://www.w3schools.com/tags/att_global_title.asp
        units = parser.parsestring("""
            <p><abbr title="World Health Organization">WHO</abbr> was founded in 1948.</p>
            <p title="Free Web tutorials">W3Schools.com</p>""").units
        print(units[0].source)
        assert len(units) == 4
        expected_sources = (
            "World Health Organization",
            # FIXME this is not ideal we need to either drop title= as we've
            # extracted it already or not extract it earlier
            '<abbr title="World Health Organization">WHO</abbr> was founded in 1948.',
            "Free Web tutorials",
            "W3Schools.com",
        )
        for unit, expected in zip(units, expected_sources):
            assert unit.source == expected

        # Example from http://www.netmechanic.com/news/vol6/html_no1.htm
        units = parser.parsestring("""
            <table width="100" border="2" title="Henry Jacobs Camp summer 2003 schedule">
        """).units
        assert len(units) == 1
        assert units[0].source == "Henry Jacobs Camp summer 2003 schedule"

        # FIXME this doesn't extract as I'd have expected
        #store = h.parsestring("""
        #    <a href="page1.html" title="HS Jacobs - a UAHC camp in Utica, MS">Henry S. Jacobs Camp</a>
        #""")
        #assert len(store.units) == 2
        #assert store.units[0].source == "HS Jacobs - a UAHC camp in Utica, MS"
        #assert store.units[1].source == "Henry S. Jacobs Camp"

        units = parser.parsestring("""
            <form name="application" title="Henry Jacobs camper application" method="  " action="  ">
        """).units
        assert len(units) == 1
        assert units[0].source == "Henry Jacobs camper application"
Ejemplo n.º 21
0
 def strip_html(self, str):
     """Return the sources of all units parsed from ``str``, joined by newlines.

     The text is parsed with a fresh ``html.htmlfile``; ``self`` is not used.
     """
     # NOTE: the parameter name shadows the built-in ``str``; it is kept
     # because it is part of the signature callers may rely on.
     parsed = html.htmlfile().parsestring(str)
     return "\n".join(unit.source for unit in parsed.units)
Ejemplo n.º 22
0
 def test_self_closing_tags(self):
     """Unclosed <img> and <br> tags inside a heading still give a single unit."""
     heading = "<h3>Some text <img><br><img></h3>"
     parsed = html.htmlfile().parsestring(heading)
     assert len(parsed.units) == 1
Ejemplo n.º 23
0
def test_pi_escaping():
    """pi_escape() rewrites ``<`` and ``>`` inside ``<? ... ?>`` as ``%lt;`` / ``%gt;``.

    The delimiters of the ``<? ... ?>`` section and of the surrounding tag
    must be left untouched.
    """
    raw = '<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">'
    escaped = '<a href="<?=($a %lt; $b ? $foo : ($b %gt; c ? $bar : $cat))?>">'
    assert html.htmlfile().pi_escape(raw) == escaped
Ejemplo n.º 24
0
def test_strip_html_with_pi():
    """After pi_escape(), strip_html() leaves only the link text of each anchor."""
    parser = html.htmlfile()

    simple = parser.pi_escape('<a href="<?$var?>">Something</a>')
    assert html.strip_html(simple) == "Something"

    # Same check with comparison operators inside the <? ... ?> section.
    nested = parser.pi_escape('<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">Something</a>')
    assert html.strip_html(nested) == "Something"
Ejemplo n.º 25
0
def test_guess_encoding():
    """Read an encoding header to guess the encoding correctly.

    Checks a lone upper-case META tag and a longer document head that
    declares the charset twice.
    """
    parser = html.htmlfile()

    meta_only = '''<META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-8">'''
    assert parser.guess_encoding(meta_only) == "UTF-8"

    document_head = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"><html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><!-- base href="http://home.online.no/~rut-aane/linux.html" --><link rel="shortcut icon" href="http://home.online.no/~rut-aane/peng16x16a.gif"><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"><meta name="Description" content="Linux newbie stuff and a little about Watching TV under Linux"><meta name="MSSmartTagsPreventParsing" content="TRUE"><meta name="GENERATOR" content="Mozilla/4.7 [en] (X11; I; Linux 2.2.5-15 i586) [Netscape]"><title>Some Linux for beginners</title><style type="text/css">'''
    assert parser.guess_encoding(document_head) == "iso-8859-1"
Ejemplo n.º 26
0
def test_pi_escaping():
    """Angle brackets inside a ``<? ... ?>`` section come back as ``%lt;`` and ``%gt;``."""
    parser = html.htmlfile()
    result = parser.pi_escape('<a href="<?=($a < $b ? $foo : ($b > c ? $bar : $cat))?>">')
    expected = '<a href="<?=($a %lt; $b ? $foo : ($b %gt; c ? $bar : $cat))?>">'
    assert result == expected
Ejemplo n.º 27
0
 def test_self_closing_tags(self):
     """A heading containing <img> and <br> without end tags parses to one unit."""
     parser = html.htmlfile()
     units = parser.parsestring("<h3>Some text <img><br><img></h3>").units
     assert len(units) == 1