Esempio n. 1
0
    def html_parsed(self):
        """
        Parse the infobox html into a list of (key, value) pairs.

        Each <tr> found in the parsed ``self.html_source()`` that has at
        least two child elements contributes one pair, built from its
        first two children. The key is flattened to text with runs of
        whitespace collapsed to a single space. The value is flattened
        to text as well, except that bare <br>, <ul> and <li> tags are
        entity-escaped beforehand so they are not rendered away like
        the other tags.
        """

        def shield_tags(markup):
            # Entity-escape bare br/ul/li tags so that the text
            # extraction does not treat them like ordinary tags.
            if markup:
                return re.sub(
                    r"<\s*(/?\s*(br\s*/?|/?ul|/?li))\s*>", "&lt;\\1&gt;",
                    markup)

            return u""

        def restore_tags(text):
            # Turn any entity-escaped br/ul/li tag back into a real tag.
            if text:
                return re.sub(
                    r"&lt;(/?\s*(br\s*/?|ul|li))&gt;", "<\\1>", text)

            return u""

        document = fromstring(self.html_source())
        pairs = []

        for tr in document.findall('.//tr'):
            try:
                key_cell, val_cell = tr.findall('./*')[:2]
            except ValueError:
                # Fewer than two children: not a key/value row.
                continue

            if key_cell is None or val_cell is None:
                continue

            # The key is serialized and reparsed before its text is
            # taken; the True flag presumably makes brs come out as
            # newlines (confirm against fromstring). Whitespace runs
            # are then collapsed.
            key_text = totext(fromstring(tostring(key_cell), True))
            key_text = re.sub(r"\s+", " ", key_text).strip()

            # The value is serialized, shielded, reparsed and flattened;
            # the shielded tags are restored as the very last step.
            val_text = totext(fromstring(shield_tags(tostring(val_cell))))
            pairs.append((key_text, restore_tags(val_text.strip())))

        return pairs
Esempio n. 2
0
    def _html_infoboxes(self, html):
        """
        Collect the rendered infobox-like tables found in `html`.

        A <table> element is selected when the string "infobox" occurs
        in its class attribute (tables without a class attribute never
        match).

        Non-infobox tables such as sidebars can satisfy this criterion
        too. There is no better selector available until Wikipedia
        uses a CSS class dedicated to infoboxes.
        """

        infoboxes = []

        for table in fromstring(html).findall(".//table"):
            css_classes = table.get('class', '')
            if 'infobox' in css_classes:
                infoboxes.append(table)

        return infoboxes
Esempio n. 3
0
 def test_fromstringtotext(self):
     # Parsed with the default arguments, the trailing <br/> leaves no
     # trace in the extracted text.
     plain = util.fromstring("hello<br/>")
     self.assertEqual(util.totext(plain), "hello")

     # Parsed with True as the second argument, a lone <br/> is
     # extracted as a newline.
     with_breaks = util.fromstring("<br/>", True)
     self.assertEqual(util.totext(with_breaks), "\n")
Esempio n. 4
0
 def test_html(self):
     # Round trip of a small but complete document: the extracted text
     # keeps the paragraph contents, the serialized form keeps the tags.
     document = "<html> <body><p>yes</p> <p> hi</p> <img/> </body> </html>"
     tree = util.fromstring(document)

     text = util.totext(tree).strip()
     self.assertEqual("yes  hi", text)

     markup = util.tostring(tree)
     self.assertIn("<p>", markup)
Esempio n. 5
0
    def _soup(self):
        """
        Return the parsed tree of ``self.html_source()``, parsing once.

        The first call parses the source with ``fromstring`` and stores
        the result on the instance; later calls return that same
        object. Callers therefore share one tree, so any mutation of it
        is visible to every subsequent caller.
        """

        # A ``hasattr(self, '__soup')`` check cannot detect the cache:
        # inside a class body ``self.__soup`` is name-mangled to
        # ``self._<ClassName>__soup`` while the string literal is not,
        # so the check would always be false and the source would be
        # re-parsed on every call. Reading the attribute directly goes
        # through the same mangling as the assignment below.
        try:
            return self.__soup
        except AttributeError:
            pass

        self.__soup = fromstring(self.html_source())
        return self.__soup