Example #1
    def _infoboxes_from_article(self, markup_source, html_source):
        markup_infoboxes, external_templates = self._markup_infoboxes(markup_source)
        html_infoboxes = self._html_infoboxes(html_source)

        assert len(markup_infoboxes) <= len(html_infoboxes)  # TODO: remove for production

        if len(markup_infoboxes) != len(html_infoboxes):
            # hack/optimization: drop the sidebar table about an article
            # series (see e.g. Barack Obama's or JFK's article)
            series_txt = 'This article is part of a series about'
            # keep this a list (not a lazy filter object) so the len()
            # checks below keep working on Python 3
            html_infoboxes = [t for t in html_infoboxes
                              if series_txt not in totext(t)]

        if len(markup_infoboxes) != len(html_infoboxes):
            # filter out infobox-like tables that don't match the infobox
            # markup. This operation is expensive; try to find
            # optimizations like the one above.
            html_infoboxes = self._best_html_infoboxes(markup_infoboxes,
                                                       html_infoboxes)

        infoboxes = []
        for i, source in enumerate(markup_infoboxes):
            ibox = Infobox(self.symbol, source, html_infoboxes[i],
                           title=self.title)
            infoboxes.append(ibox)

        return infoboxes, external_templates
Example #2
    def coordinates(self, article, _):
        for ibox in get_infoboxes(article):
            src = ibox.html_source()
            if src is None:
                continue  # try the next infobox instead of giving up

            xpath = ".//span[@id='coordinates']"
            lat = src.find(xpath + "//span[@class='latitude']")
            lon = src.find(xpath + "//span[@class='longitude']")

            if lat is None or lon is None:
                continue

            nlat = self._dton(totext(lat))
            nlon = self._dton(totext(lon))

            return lispify([nlat, nlon], typecode='coordinates')

        return None  # no infobox had usable coordinates
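For reference, the two XPath lookups can be reproduced standalone with lxml. A minimal sketch; the HTML snippet is invented, only the id and class names come from the code above:

    from lxml import html

    snippet = """
    <div>
      <span id="coordinates">
        <span class="latitude">38°53′N</span>
        <span class="longitude">77°02′W</span>
      </span>
    </div>
    """

    root = html.fromstring(snippet)
    xpath = ".//span[@id='coordinates']"
    lat = root.find(xpath + "//span[@class='latitude']")
    lon = root.find(xpath + "//span[@class='longitude']")
    print(lat.text_content(), lon.text_content())  # 38°53′N 77°02′W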
Example #3
    def html_parsed(self):
        """
        Given the infobox html or as soup, return a list of (key, value)
        pairs.
        """

        def escape_lists(val):
            if not val:
                return u""

            return re.sub(
                r"<\s*(/?\s*(br\s*/?|ul|li))\s*>", r"&lt;\1&gt;", val)

        def unescape_lists(val):
            if not val:
                return u""

            val = re.sub(r"&lt;(/?\s*(br\s*/?|ul|li))&gt;", r"<\1>", val)
            return val

        soup = fromstring(self.html_source())
        # Render away every tag except <ul>, <li> and <br>: escape those
        # three, extract the text, then unescape them back

        tpairs = []

        for row in soup.findall('.//tr'):
            try:
                e_key, e_val = row.findall('./*')[:2]
            except ValueError:
                continue

            if e_key is not None and e_val is not None:
                # Serialize the key cell, reparse it with <br>s turned
                # into newlines, and extract the text
                key = totext(fromstring(tostring(e_key), True))
                key = re.sub(r"\s+", " ", key).strip()
                val = escape_lists(tostring(e_val))
                # Extract text
                val = fromstring(val)
                val = totext(val)
                val = unescape_lists(val.strip())
                tpairs.append((key, val))

        return tpairs
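The escape/reparse trick is easiest to see in isolation. A self-contained sketch using lxml's text_content() in place of the project's totext (the cell markup is invented):

    import re

    from lxml import html

    def escape_lists(val):
        # Turn <br>, <ul> and <li> into literal text so they survive
        # text extraction; every other tag is rendered away
        return re.sub(r"<\s*(/?\s*(br\s*/?|ul|li))\s*>", r"&lt;\1&gt;", val)

    cell = "<div>alpha<br/>beta<ul><li>gamma</li></ul></div>"
    text = html.fromstring(escape_lists(cell)).text_content()
    print(text)  # alpha<br/>beta<ul><li>gamma</li></ul>

unescape_lists then restores the real tags, so a second parse can map each <br/> to a newline and each list item to its own line.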
Example #4
    def proper(self, article, _):
        """
        Get a quick boolean answer based on the symbol text and the
        article text.
        """

        # Blindly copied from the Ruby version
        a = re.sub(r"\s*\(.*\)\s*", "", article.replace("_", " "))
        txt = totext(get_article(article).html_source())
        # Proper nouns stay capitalized mid-sentence, so lowercase
        # occurrences (discounting sentence starts) should be rarer
        # than exact-case ones
        ret = (txt.count(a.lower()) - txt.count(". " + a.lower()) <
               txt.count(a))

        return lispify(ret, typecode='calculated')
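The comparison is easier to follow with concrete counts. A made-up example (str.count is case-sensitive, which is exactly what the heuristic relies on):

    txt = ("a python is a snake. python eggs hatch. "
           "Python is also a language. Python, Python, Python.")
    a = "Python"

    lowercase = txt.count(a.lower())                 # 2
    at_sentence_start = txt.count(". " + a.lower())  # 1
    exact_case = txt.count(a)                        # 4

    # 2 - 1 < 4, so "Python" is judged to be a proper noun
    print(lowercase - at_sentence_start < exact_case)  # True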
Example #5
    def title(self):
        """
        The title after redirections and stuff.
        """
        # Warning!! dont feed this to the fetcher. This depends on the
        # fetcher to resolve redirects and a cirular recursion will
        # occur

        heading = self._soup().get_element_by_id('firstHeading')
        if heading is not None:
            return totext(heading).strip()

        raise Exception("No title found for '%s'" % self.symbol())
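The get_element_by_id call presumably resolves to lxml.html, where a missing id raises KeyError unless a default is passed (so the None check above depends on how the _soup wrapper behaves). A minimal standalone sketch; the page snippet is invented:

    from lxml import html

    page = html.fromstring(
        '<html><body><h1 id="firstHeading">Barack Obama</h1></body></html>')
    heading = page.get_element_by_id('firstHeading', None)  # None if absent
    if heading is not None:
        print(heading.text_content().strip())  # Barack Obama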
Example #6
    def _best_html_infoboxes(self, markup, html):
        """
        Given n markup infoboxes and n+m infobox-like html tables
        returns a list of n best candidates for html infoboxes
        """
        n = len(markup)
        m = len(html) - n
        pos = 0
        infoboxes = []

        for i, ibox in enumerate(markup):
            # A window of at most m + 1 candidates: each choice must
            # leave enough tables for the remaining markup infoboxes
            choices = html[pos:m + i + 1]
            best_match, score = process.extractOne(totext(ibox),
                                                   choices,
                                                   processor=totext,
                                                   scorer=fuzz.token_set_ratio)

            infoboxes.append(best_match)
            pos = html.index(best_match) + 1

        return infoboxes
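process.extractOne and fuzz.token_set_ratio match fuzzywuzzy's API; assuming that library, here is a standalone sketch with plain strings instead of parsed tables (the sample texts are invented):

    from fuzzywuzzy import fuzz, process

    markup_text = "name Barack Obama born August 4, 1961 office President"
    tables = [
        "This article is part of a series about Barack Obama",
        "Barack Obama 44th President of the United States "
        "Born August 4, 1961 Honolulu, Hawaii",
    ]

    best, score = process.extractOne(markup_text, tables,
                                     scorer=fuzz.token_set_ratio)
    print(score, best)  # the second, infobox-like table should win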
Example #7
    def rendered(self):
        return totext(self.html_source())
Example #8
    def test_html(self):
        html = "<html> <body><p>yes</p> <p> hi</p> <img/> </body> </html>"
        el = util.fromstring(html)
        self.assertEqual("yes  hi", util.totext(el).strip())
        self.assertIn("<p>", util.tostring(el))
Example #9
    def test_fromstringtotext(self):
        self.assertEqual(util.totext(util.fromstring("hello<br/>")), "hello")
        self.assertEqual(util.totext(util.fromstring("<br/>", True)), "\n")