def _infoboxes_from_article(self, markup_source, html_source):
    """Pair each markup infobox with its rendered html table.

    :param markup_source: raw wiki markup of the article
    :param html_source: rendered html of the same article
    :returns: tuple ``(infoboxes, external_templates)`` where
        ``infoboxes`` is a list of Infobox objects, one per markup
        infobox, each bound to its matching html table.
    """
    markup_infoboxes, external_templates = \
        self._markup_infoboxes(markup_source)
    html_infoboxes = self._html_infoboxes(html_source)
    assert len(markup_infoboxes) <= len(html_infoboxes)  # TODO: remove for production

    if len(markup_infoboxes) != len(html_infoboxes):
        # hack/optimization: remove sidebar table about article series
        # for an example, see Barack Obama or JFK's article
        series_txt = 'This article is part of a series about'
        # Materialize as a list: filter() is a lazy iterator on
        # Python 3, which would break the len() comparisons below.
        html_infoboxes = [t for t in html_infoboxes
                          if series_txt not in totext(t)]

    if len(markup_infoboxes) != len(html_infoboxes):
        # filter out infobox-like tables that don't match the infobox markup
        # this operation is expensive. Try to find optimizations like above
        html_infoboxes = self._best_html_infoboxes(markup_infoboxes,
                                                   html_infoboxes)

    infoboxes = []
    for i, source in enumerate(markup_infoboxes):
        ibox = Infobox(self.symbol, source, html_infoboxes[i],
                       title=self.title)
        infoboxes.append(ibox)
    return infoboxes, external_templates
def coordinates(self, article, _):
    """Extract geographic coordinates from an article's infoboxes.

    Scans every infobox of *article*; the original returned None from
    inside the loop, so any infobox after the first was never
    inspected even when it did carry coordinate markup.

    :param article: article symbol/title to look up
    :param _: unused (kept for the classifier call signature)
    :returns: lispified ``[lat, lon]`` pair, or None when no infobox
        contains latitude/longitude spans.
    """
    for ibox in get_infoboxes(article):
        src = ibox.html_source()
        if src is None:
            continue  # no rendered html for this infobox; try the next

        xpath = ".//span[@id='coordinates']"
        lat = src.find(xpath + "//span[@class='latitude']")
        lon = src.find(xpath + "//span[@class='longitude']")
        if lat is None or lon is None:
            continue  # this infobox has no coordinate spans; try the next

        nlat = self._dton(totext(lat))
        nlon = self._dton(totext(lon))
        return lispify([nlat, nlon], typecode='coordinates')
    return None
def html_parsed(self):
    """
    Given the infobox html or as soup, return a list of (key, value)
    pairs.
    """
    def escape_lists(val):
        # Escape <br>/<ul>/<li> as html entities so the list structure
        # survives the text-extraction round trip below. The original
        # replaced "<\1>" with "<\1>" (a no-op), so the markers were
        # consumed by the parser and lost.
        if not val:
            return u""
        return re.sub(
            r"<\s*(/?\s*(br\s*/?|/?ul|/?li))\s*>", "&lt;\\1&gt;", val)

    def unescape_lists(val):
        # Inverse of escape_lists: restore the escaped entities to
        # real tags after text extraction.
        if not val:
            return u""
        val = re.sub(r"&lt;(/?\s*(br\s*/?|ul|li))&gt;", "<\\1>", val)
        return val

    soup = fromstring(self.html_source())
    # Render all tags except <ul> and <li> and <br>. Escape them
    # in some way and then reparse
    tpairs = []

    for row in soup.findall('.//tr'):
        try:
            e_key, e_val = row.findall('./*')[:2]
        except ValueError:
            # row has fewer than two cells; not a key/value pair
            continue

        if e_key is not None and e_val is not None:
            # Turn the key into xml string, parse the other tags
            # making brs into newlines, parse the rest of the
            # tags, get the text back
            key = totext(fromstring(tostring(e_key), True))
            key = re.sub(r"\s+", " ", key).strip()
            val = escape_lists(tostring(e_val))
            # Extract text
            val = fromstring(val)
            val = totext(val)
            val = unescape_lists(val.strip())
            tpairs.append((key, val))

    return tpairs
def proper(self, article, _):
    """
    Get a quick boolean answer based on the symbol text and the
    article text.
    """
    # Blindly copied by the ruby version
    name = re.sub(r"\s*\(.*\)\s*", "", article.replace("_", " "))
    text = totext(get_article(article).html_source())
    lowered = name.lower()
    # Occurrences of the lowercased name that are not sentence-initial,
    # compared against occurrences with the original capitalization.
    mid_sentence = text.count(lowered) - text.count(". " + lowered)
    return lispify(mid_sentence < text.count(name), typecode='calculated')
def title(self):
    """
    The title after redirections and stuff.
    """
    # Warning!! Don't feed this to the fetcher: this relies on the
    # fetcher to resolve redirects, so doing that would cause a
    # circular recursion.
    heading = self._soup().get_element_by_id('firstHeading')
    if heading is None:
        raise Exception("No title found for '%s'" % self.symbol())
    return totext(heading).strip()
def _best_html_infoboxes(self, markup, html):
    """
    Given n markup infoboxes and n+m infobox-like html tables return a
    list of the n best html candidates, one per markup infobox, in
    document order.
    """
    n = len(markup)
    m = len(html) - n  # number of spurious infobox-like tables
    pos = 0
    infoboxes = []
    for ibox in markup:
        # The match for this infobox lies within the next m+1 tables:
        # up to m spurious tables may precede it, so the match itself
        # can sit as far as index pos+m. The original sliced
        # html[pos:pos + m], excluding that last viable candidate.
        choices = html[pos:pos + m + 1]
        best_match, score = process.extractOne(
            totext(ibox), choices,
            processor=totext,
            scorer=fuzz.token_set_ratio)
        infoboxes.append(best_match)
        # Resume the scan just past the table we consumed.
        pos = html.index(best_match) + 1
    return infoboxes
def rendered(self):
    """Return the plain-text rendering of this object's html source."""
    source = self.html_source()
    return totext(source)
def test_html(self):
    # Round-trip a small document through fromstring/totext/tostring.
    markup = "<html> <body><p>yes</p> <p> hi</p> <img/> </body> </html>"
    root = util.fromstring(markup)
    text = util.totext(root)
    self.assertEqual("yes hi", text.strip())
    self.assertIn("<p>", util.tostring(root))
def test_fromstringtotext(self):
    # Helper: parse markup and extract its text in one step.
    def round_trip(markup, *args):
        return util.totext(util.fromstring(markup, *args))

    self.assertEqual(round_trip("hello<br/>"), "hello")
    self.assertEqual(round_trip("<br/>", True), "\n")