Exemple #1
0
    def html_parsed(self):
        """
        Parse the table rows of ``self.html_source()`` into a list of
        (key, value) tuples.

        Each ``<tr>`` contributes one pair taken from its first two child
        elements; rows with fewer than two children are skipped. The key
        is reduced to text with runs of whitespace collapsed to a single
        space. The value is reduced to text as well, except that ``<br>``,
        ``<ul>`` and ``<li>`` tags are entity-escaped before the reparse
        so that they are not treated as markup by it.
        """

        # Patterns for the tags that must outlive the text extraction:
        # the first matches the raw tag, the second its escaped form.
        raw_list_tag = r"<\s*(/?\s*(br\s*/?|/?ul|/?li))\s*>"
        escaped_list_tag = r"&lt;(/?\s*(br\s*/?|ul|li))&gt;"

        def escape_lists(markup):
            # Entity-encode the angle brackets of br/ul/li tags; falsy
            # input becomes an empty unicode string.
            return re.sub(raw_list_tag, "&lt;\\1&gt;", markup) if markup else u""

        def unescape_lists(text):
            # Inverse of escape_lists: restore the angle brackets.
            return re.sub(escaped_list_tag, "<\\1>", text) if text else u""

        root = fromstring(self.html_source())
        pairs = []

        for tr in root.findall('.//tr'):
            try:
                # Unpacking fails with ValueError when the row has fewer
                # than two child elements.
                key_cell, value_cell = tr.findall('./*')[:2]
            except ValueError:
                continue

            if key_cell is None or value_cell is None:
                continue

            # Key: serialize the cell, reparse it and keep only its text.
            # NOTE(review): the second argument to fromstring presumably
            # controls how <br> is rendered (the previous comment spoke
            # of turning brs into newlines) -- confirm against fromstring.
            label = totext(fromstring(tostring(key_cell), True))
            label = re.sub(r"\s+", " ", label).strip()

            # Value: escape the list/break tags, reparse, extract the
            # text, then restore any tags that are still escaped.
            shielded = escape_lists(tostring(value_cell))
            content = totext(fromstring(shielded))
            pairs.append((label, unescape_lists(content.strip())))

        return pairs
Exemple #2
0
    def paragraphs(self, keep_html=False):
        """
        Return the non-empty ``<p>`` elements that are direct children of
        the element whose id is ``mw-content-text``, as a list.

        With ``keep_html`` false (the default) each entry is the
        paragraph's concatenated text. Otherwise each entry is built from
        ``tostring`` applied to the paragraph element. Paragraphs whose
        text is empty are left out in both modes.
        """

        collected = []
        for node in self._soup().findall(".//*[@id='mw-content-text']/p"):
            text = "".join(node.itertext())
            if not text:
                # Skip paragraphs that carry no text at all.
                continue
            if keep_html:
                collected.append("".join(tostring(node)))
            else:
                collected.append(text)
        return collected
Exemple #3
0
 def test_html(self):
     """
     Round-trip a small HTML document through the util helpers: the
     extracted text must match and the serialization must keep <p>.
     """
     markup = "<html> <body><p>yes</p> <p> hi</p> <img/> </body> </html>"
     root = util.fromstring(markup)
     extracted = util.totext(root)
     self.assertEqual("yes  hi", extracted.strip())
     serialized = util.tostring(root)
     self.assertIn("<p>", serialized)