Example #1
0
    def test_rendered_attributes(self):
        """
        The first infobox of each article maps an attribute key to the
        label it is rendered with.
        """
        # (article title, attribute key, expected rendered label)
        expectations = (
            ("Winston Churchill", "death_place", "Died"),
            ("Brooklyn Bridge", "maint", "Maintained by"),
        )

        for title, key, label in expectations:
            ibox = get_infoboxes(title, fetcher=self.fetcher)[0]
            self.assertEqual(label, ibox.rendered_attributes().get(key))
Example #2
0
    def resolve_infobox(self, cls, symbol, attr):
        """
        Look up the attribute ``attr`` in the infoboxes of article ``symbol``.

        The infoboxes are obtained with ``get_infoboxes`` (restricted by
        ``cls``) and scanned in order; the first truthy value is passed
        through ``lispify`` and returned.  When ``attr`` is a ``LispType``
        its own typecode and value are used, otherwise the resolver's
        ``_typecode`` applies.  If the symbol contains a newline, or no
        infobox yields a value, the method returns None.
        """

        # There are no newlines in article titles
        if "\n" in symbol:
            return None

        if not isinstance(attr, LispType):
            typecode = self._typecode
        else:
            typecode, attr = attr.typecode, attr.val

        for candidate in get_infoboxes(symbol, cls=cls, fetcher=self.fetcher):
            value = candidate.get(attr)
            if not value:
                # This infobox does not have it; note that and try the next.
                self.log().warning(
                    "Could not find infobox attribute '%s'" % attr)
                continue

            self.log().info("Found infobox attribute '%s'" % attr)
            assert(isinstance(value, unicode))  # TODO: remove for production

            return lispify(value, typecode=typecode, infobox_attr=attr)

        # Falls through (returning None) when nothing matched.
        self.log().warning("Could not resolve attribute '%s' for '%s' with "
                           "class '%s'", attr, symbol, cls)
Example #3
0
 def test_no_clashes_with_multiple_infoboxes(self):
     """
     Both infoboxes of 'Vladimir Putin' report their own class and their
     own 'image' value.
     """
     # Unpacking also checks that exactly two infoboxes come back.
     officeholder, martial_artist = get_infoboxes('Vladimir Putin')
     pair = (officeholder, martial_artist)

     expected_classes = ('wikipedia-officeholder',
                         'wikipedia-martial-artist')
     for ibox, wiki_class in zip(pair, expected_classes):
         self.assertEqual(ibox.wikipedia_class(), wiki_class)

     expected_images = ('Vladimir Putin 12023 (cropped).jpg',
                        'Vladimir Putin in Japan 3-5 September 2000-22.jpg')
     for ibox, image in zip(pair, expected_images):
         self.assertEqual(ibox.get('image'), image)
Example #4
0
    def is_person(self, symbol):
        """
        Tell whether the article ``symbol`` looks like a person.

        True when any infobox has the class 'wikipedia-person' or a truthy
        'birth-date' attribute; otherwise True only if ``PersonResolver``
        finds a birth date for the symbol.
        """
        # TODO : test the precision of this method of determining is_person
        has_person_infobox = any(
            ibx.wikipedia_class() == 'wikipedia-person' or
            ibx.get('birth-date')
            for ibx in get_infoboxes(symbol))
        if has_person_infobox:
            return True

        # NOTE(review): imported locally, presumably to avoid a circular
        # import with wikipediabase.resolvers -- confirm before moving it.
        from wikipediabase.resolvers import PersonResolver
        return bool(PersonResolver().birth_date(symbol, 'birth-date'))
Example #5
0
    def attributes(self, cls, symbol):
        """
        Collect every attribute of the infoboxes of article ``symbol``.

        For each key yielded by ``markup_parsed_iter`` an entry is built
        with the upper-cased key as ``code`` and, as ``rendered``, whatever
        ``rendered_attributes`` holds for the key with dashes turned into
        underscores (None when absent).  The list is returned lispified.
        """

        collected = []

        for ibox in get_infoboxes(symbol, cls=cls, fetcher=self.fetcher):
            for key, _ in ibox.markup_parsed_iter():
                # Rendered attributes are keyed with underscores.
                label = ibox.rendered_attributes().get(key.replace('-', '_'))
                collected.append({'code': key.upper(), 'rendered': label})

        return lispify(collected)
Example #6
0
    def coordinates(self, article, _):
        """
        Return the coordinates of ``article`` found in its infoboxes.

        Each infobox's HTML source is searched for the latitude and
        longitude spans nested under the span with id 'coordinates'.  The
        first infobox that provides both is used; infoboxes with no HTML
        source or without both spans are skipped instead of ending the
        search.

        :param article: the article whose infoboxes are inspected
        :param _: unused
        :returns: the lispified ``[latitude, longitude]`` pair with typecode
                  'coordinates', or None when no infobox has coordinates
        """
        xpath = ".//span[@id='coordinates']"

        for ibox in get_infoboxes(article):
            src = ibox.html_source()
            if src is None:
                # Nothing to search here; a later infobox may still have it.
                continue

            lat = src.find(xpath + "//span[@class='latitude']")
            lon = src.find(xpath + "//span[@class='longitude']")

            if lat is None or lon is None:
                continue

            nlat = self._dton(totext(lat))
            nlon = self._dton(totext(lon))

            return lispify([nlat, nlon], typecode='coordinates')

        return None
Example #7
0
    def image(self, article, attribute):
        """
        Return the image of ``article`` taken from its infoboxes.

        The first truthy 'image' value among the infoboxes is turned into a
        file name: spaces become underscores, anything up to and including
        'File:' is dropped, and a trailing '{{!}}border' annotation is cut
        off.  The first truthy 'caption' among the infoboxes, if any, is
        appended after ``markup_unlink``.

        :param article: the article whose infoboxes are inspected
        :param attribute: unused
        :returns: lispified ``[0, file_name]`` or
                  ``[0, file_name, caption]``; None when no infobox has an
                  image
        """
        infoboxes = get_infoboxes(article)

        # Infoboxes without an image yield a falsy value; drop those so the
        # string handling below never runs on None.
        imgs = [img for img in (ibx.get('image') for ibx in infoboxes)
                if img]
        if not imgs:
            return None

        img = imgs[0]
        fnam = img.replace(" ", "_")
        if "File:" in img:
            fnam = fnam.split("File:")[1]

        # TODO : is this a temporary fix? investigate what this annotation means
        # see 'Bill Clinton' for an example
        if "{{!}}border" in img:
            fnam = fnam.split("{{!}}border")[0]

        # A list (not a lazy filter object) so indexing works everywhere.
        caps = [cap for cap in (ibx.get('caption') for ibx in infoboxes)
                if cap]

        result = [0, fnam]
        if caps:
            result.append(markup_unlink(caps[0]))

        return lispify(result)
Example #8
0
 def test_infobox_html_parsed(self):
     """The parsed HTML of the first 'AC/DC' infobox holds the origin pair."""
     first = get_infoboxes("AC/DC", fetcher=self.fetcher)[0]
     parsed = first.html_parsed()
     self.assertIn(("Origin", "Sydney, Australia"), parsed)
Example #9
0
 def test_infobox_html_raw(self):
     """The rendered first 'Led Zeppelin' infobox contains its origin."""
     first = get_infoboxes("Led Zeppelin", fetcher=self.fetcher)[0]
     rendered = first.rendered()
     self.assertIn("Origin\nLondon, England", rendered)
Example #10
0
 def test_infobox_markup_raw(self):
     """The markup of the first 'Winston Churchill' infobox has death_place."""
     first = get_infoboxes("Winston Churchill", fetcher=self.fetcher)[0]
     markup = first.markup_source()
     self.assertIn("|death_place ", markup)
Example #11
0
 def test_markup(self):
     """
     The markup of the first 'Led Zeppelin' infobox starts with the infobox
     opener and carries the name field.
     """
     first = get_infoboxes("Led Zeppelin", fetcher=self.fetcher)[0]

     opener = "{{Infobox"
     self.assertEqual(first.markup_source()[:len(opener)], opener)

     name_field = "| name = Led Zeppelin"
     self.assertIn(name_field, first.markup_source())
Example #12
0
 def test_infoboxes(self):
     """
     ``util.get_infoboxes`` returns a plain list of ``Infobox`` objects for
     a symbol, and a plain list when given an ``InfoboxScraper``.
     """
     scraper = InfoboxScraper(self.symbol)

     by_symbol = util.get_infoboxes(self.symbol)
     self.assertIs(list, type(by_symbol))

     first = util.get_infoboxes(self.symbol)[0]
     self.assertIs(Infobox, type(first))

     by_scraper = util.get_infoboxes(scraper)
     self.assertIs(list, type(by_scraper))
Example #13
0
 def test_templates(self):
     """The 'Vladimir Putin' infoboxes report both expected templates."""
     putin_iboxes = get_infoboxes("Vladimir Putin", fetcher=self.fetcher)
     expected = ["Template:Infobox officeholder",
                 "Template:Infobox martial artist"]

     found = [ibox.template() for ibox in putin_iboxes]
     self.assertItemsEqual(found, expected)
Example #14
0
 def test_types_redirect(self):
     """The types of the first 'Bill Clinton' infobox include 'president'."""
     first = get_infoboxes("Bill Clinton", fetcher=self.fetcher)[0]
     types = first.types()
     self.assertIn("president", types)
Example #15
0
 def classify(self, symbol, fetcher=None):
     """
     Return the wikipedia class of every infobox of ``symbol``, in the
     order ``get_infoboxes`` yields them.
     """
     return [ibox.wikipedia_class()
             for ibox in get_infoboxes(symbol, fetcher=fetcher)]
Example #16
0
 def test_get(self):
     """The first 'The Rolling Stones' infobox has the expected origin."""
     first = get_infoboxes("The Rolling Stones", fetcher=self.fetcher)[0]
     origin = first.get("origin")
     self.assertEqual(origin, "London, England")
Example #17
0
 def test_attributes(self):
     """The parsed markup keys of the first 'Winston Churchill' infobox
     include 'death-place'."""
     first = get_infoboxes("Winston Churchill", fetcher=self.fetcher)[0]
     keys = [key for key, _ in first.markup_parsed_iter()]
     self.assertIn("death-place", keys)
Example #18
0
 def test_html_attributes(self):
     """The first 'BBC News' infobox renders 'owners' as 'Owners'."""
     first = get_infoboxes("BBC News", fetcher=self.fetcher)[0]
     label = first.rendered_attributes().get("owners")
     self.assertEqual("Owners", label)
Example #19
0
 def test_classes(self):
     """The 'Vladimir Putin' infoboxes report both expected classes."""
     putin_iboxes = get_infoboxes("Vladimir Putin", fetcher=self.fetcher)
     expected = ["wikipedia-officeholder", "wikipedia-martial-artist"]

     found = [ibox.wikipedia_class() for ibox in putin_iboxes]
     self.assertItemsEqual(found, expected)
Example #20
0
    def infoboxes(self):
        """
        Return the infoboxes for this object's title, looking them up with
        ``get_infoboxes`` when none are stored yet.

        The result is kept in ``self._infoboxes``.  Any falsy stored value
        (an empty list included) triggers a new lookup on the next call.
        """
        stored = self._infoboxes
        if stored:
            return stored

        self._infoboxes = get_infoboxes(self.title(), fetcher=self.fetcher)
        return self._infoboxes