def html_parsed(self):
    """Parse the infobox html into a list of (key, value) text pairs.

    Each two-cell ``<tr>`` row contributes one pair: the first cell
    rendered to whitespace-normalized text is the key, the second cell
    rendered to text (with <br>/<ul>/<li> markup preserved) is the value.

    :returns: list of (key, value) unicode string pairs.
    """
    # Tags that must survive text extraction (list structure / breaks).
    list_tags = r"/?\s*(br\s*/?|ul|li)"

    def escape_lists(val):
        # Turn <br>/<ul>/<li> (opening or closing) into literal
        # &lt;...&gt; entities so the reparse below does not consume
        # them.  NOTE(review): the previous replacement "<\\1>" was a
        # no-op (it rewrote each tag to itself) -- the entities here
        # restore the escaping the surrounding comments describe.
        if not val:
            return u""
        return re.sub(r"<\s*(" + list_tags + r")\s*>", "&lt;\\1&gt;", val)

    def unescape_lists(val):
        # Inverse of escape_lists: restore the entity-escaped tags.
        if not val:
            return u""
        return re.sub(r"&lt;(" + list_tags + r")&gt;", "<\\1>", val)

    soup = fromstring(self.html_source())
    # Render all tags except <ul>, <li> and <br>: escape those, render
    # everything else to text, then reparse.
    tpairs = []
    for row in soup.findall('.//tr'):
        try:
            e_key, e_val = row.findall('./*')[:2]
        except ValueError:
            # Fewer than two cells -- not a key/value row.
            continue
        if e_key is not None and e_val is not None:
            # Key: serialize, reparse, extract text, collapse whitespace.
            key = totext(fromstring(tostring(e_key), True))
            key = re.sub(r"\s+", " ", key).strip()
            # Value: protect list tags, render to text, restore them.
            val = escape_lists(tostring(e_val))
            val = totext(fromstring(val))
            val = unescape_lists(val.strip())
            tpairs.append((key, val))
    return tpairs
def paragraphs(self, keep_html=False):
    """Generate the page's paragraphs.

    :param keep_html: when true, return each paragraph's serialized
        html instead of its plain text.
    :returns: list of non-empty paragraph strings.
    """
    xpath = ".//*[@id='mw-content-text']/p"
    out = []
    for node in self._soup().findall(xpath):
        text = "".join(node.itertext())
        # Skip paragraphs that render to no text at all.
        if not text:
            continue
        out.append("".join(tostring(node)) if keep_html else text)
    return out
def test_html(self):
    # Round-trip a small document through the util html helpers:
    # text extraction strips markup, serialization keeps it.
    markup = "<html> <body><p>yes</p> <p> hi</p> <img/> </body> </html>"
    root = util.fromstring(markup)
    text = util.totext(root).strip()
    self.assertEqual("yes hi", text)
    serialized = util.tostring(root)
    self.assertIn("<p>", serialized)