def test_rendered_attributes(self):
    """Rendered labels can be looked up by attribute key on two articles."""
    churchill_box = get_infoboxes("Winston Churchill",
                                  fetcher=self.fetcher)[0]
    churchill_labels = churchill_box.rendered_attributes()
    self.assertEqual("Died", churchill_labels.get("death_place"))

    bridge_box = get_infoboxes("Brooklyn Bridge", fetcher=self.fetcher)[0]
    bridge_labels = bridge_box.rendered_attributes()
    self.assertEqual("Maintained by", bridge_labels.get("maint"))
def resolve_infobox(self, cls, symbol, attr):
    """
    Return the value of the attribute for the article.

    :param cls: class forwarded to ``get_infoboxes`` as ``cls``
    :param symbol: article title; a value containing a newline is rejected
    :param attr: attribute name, or a ``LispType`` whose ``val`` is the
        name and whose ``typecode`` overrides ``self._typecode``
    :returns: the lispified value from the first infobox that has a truthy
        value for the attribute, or None if there is none
    """
    if "\n" in symbol:
        # There are no newlines in article titles
        return None

    if isinstance(attr, LispType):
        typecode, attr = attr.typecode, attr.val
    else:
        typecode = self._typecode

    infoboxes = get_infoboxes(symbol, cls=cls, fetcher=self.fetcher)
    for ibox in infoboxes:
        result = ibox.get(attr)
        if result:
            # Pass arguments to the logger instead of pre-formatting, so
            # all log calls in this method use the same (lazy) form.
            self.log().info("Found infobox attribute '%s'", attr)
            assert isinstance(result, unicode)  # TODO: remove for production
            return lispify(result, typecode=typecode, infobox_attr=attr)

        self.log().warning("Could not find infobox attribute '%s'", attr)

    self.log().warning("Could not resolve attribute '%s' for '%s' with "
                       "class '%s'", attr, symbol, cls)
    return None
def test_no_clashes_with_multiple_infoboxes(self):
    """Each of the article's two infoboxes keeps its own class and image."""
    first_box, second_box = get_infoboxes('Vladimir Putin')

    first_class = first_box.wikipedia_class()
    self.assertEqual(first_class, 'wikipedia-officeholder')
    second_class = second_box.wikipedia_class()
    self.assertEqual(second_class, 'wikipedia-martial-artist')

    # Both infoboxes define 'image'; the values must not overwrite
    # each other.
    first_image = first_box.get('image')
    self.assertEqual(first_image, 'Vladimir Putin 12023 (cropped).jpg')
    second_image = second_box.get('image')
    self.assertEqual(second_image,
                     'Vladimir Putin in Japan 3-5 September 2000-22.jpg')
def is_person(self, symbol):
    """
    Return True if ``symbol`` looks like a person, False otherwise.

    An infobox of class 'wikipedia-person', or any infobox with a truthy
    'birth-date' value, is enough. Otherwise the answer depends on whether
    ``PersonResolver`` yields a birth date for the symbol.
    """
    # TODO : test the precision of this method of determining is_person
    has_person_infobox = any(
        box.wikipedia_class() == 'wikipedia-person' or box.get('birth-date')
        for box in get_infoboxes(symbol)
    )
    if has_person_infobox:
        return True

    # Imported only when the infobox check fails, as in the original flow.
    from wikipediabase.resolvers import PersonResolver
    return bool(PersonResolver().birth_date(symbol, 'birth-date'))
def attributes(self, cls, symbol):
    """
    Get all infobox attributes.

    Builds one ``{code, rendered}`` entry per key yielded by each
    infobox's ``markup_parsed_iter()``: ``code`` is the upper-cased key
    and ``rendered`` is the matching entry of ``rendered_attributes()``
    (looked up with dashes turned into underscores). The list is returned
    lispified.
    """
    entries = []
    for box in get_infoboxes(symbol, cls=cls, fetcher=self.fetcher):
        for key, _value in box.markup_parsed_iter():
            label = box.rendered_attributes().get(key.replace('-', '_'))
            entries.append({'code': key.upper(), 'rendered': label})

    return lispify(entries)
def coordinates(self, article, _):
    """
    Return the coordinates of ``article`` as a lispified [lat, lon] pair.

    The article's infoboxes are inspected in order. The first one whose
    HTML source contains both a ``latitude`` and a ``longitude`` span under
    the ``coordinates`` span provides the result. An infobox with no HTML
    source, or without both spans, is skipped so that later infoboxes still
    get a chance (previously the lookup gave up after the first infobox).

    :param article: article title passed to ``get_infoboxes``
    :param _: unused
    :returns: ``lispify([lat, lon], typecode='coordinates')``, or None when
        no infobox provides both values
    """
    xpath = ".//span[@id='coordinates']"

    for ibox in get_infoboxes(article):
        src = ibox.html_source()
        if src is None:
            continue

        lat = src.find(xpath + "//span[@class='latitude']")
        lon = src.find(xpath + "//span[@class='longitude']")
        if lat is None or lon is None:
            continue

        # Convert the text of each span to a number via self._dton.
        nlat = self._dton(totext(lat))
        nlon = self._dton(totext(lon))
        return lispify([nlat, nlon], typecode='coordinates')

    return None
def image(self, article, attribute):
    """
    Return the lispified image entry for ``article``.

    The file name comes from the first infobox that has an 'image' value:
    spaces become underscores, anything up to and including "File:" is
    dropped, and a trailing "{{!}}border" annotation is cut off. If any
    infobox has a non-empty 'caption', the first one is appended after
    being passed through ``markup_unlink``.

    :param article: article title passed to ``get_infoboxes``
    :param attribute: unused
    :returns: ``lispify([0, filename])`` or
        ``lispify([0, filename, caption])``; None when no infobox has an
        image
    """
    infoboxes = get_infoboxes(article)

    # Drop infoboxes without an image. Otherwise a first infobox lacking
    # one yields None here and the string handling below would fail.
    imgs = [img for img in (ibx.get('image') for ibx in infoboxes)
            if img is not None]
    if not imgs:
        return None

    img = imgs[0]
    fnam = img.replace(" ", "_")
    if "File:" in img:
        fnam = fnam.split("File:")[1]

    # TODO : is this a temporary fix? investigate what this annotation means
    # see 'Bill Clinton' for an example
    if "{{!}}border" in img:
        fnam = fnam.split("{{!}}border")[0]

    # Build a real list (not a filter object) so indexing always works.
    caps = [cap for cap in (ibx.get('caption') for ibx in infoboxes) if cap]

    result = [0, fnam]
    if caps:
        result.append(markup_unlink(caps[0]))
    return lispify(result)
def test_infobox_html_parsed(self):
    """The parsed HTML of the AC/DC infobox contains the origin pair."""
    acdc_box = get_infoboxes("AC/DC", fetcher=self.fetcher)[0]
    parsed_rows = acdc_box.html_parsed()
    self.assertIn(("Origin", "Sydney, Australia"), parsed_rows)
def test_infobox_html_raw(self):
    """The rendered Led Zeppelin infobox text includes the origin line."""
    zeppelin_box = get_infoboxes("Led Zeppelin", fetcher=self.fetcher)[0]
    rendered_text = zeppelin_box.rendered()
    self.assertIn("Origin\nLondon, England", rendered_text)
def test_infobox_markup_raw(self):
    """The raw markup of the Churchill infobox mentions death_place."""
    churchill_box = get_infoboxes("Winston Churchill",
                                  fetcher=self.fetcher)[0]
    raw_markup = churchill_box.markup_source()
    self.assertIn("|death_place ", raw_markup)
def test_markup(self):
    """The markup source starts with '{{Infobox' and has the name field."""
    zeppelin_box = get_infoboxes("Led Zeppelin", fetcher=self.fetcher)[0]

    leading_chars = zeppelin_box.markup_source()[:9]
    self.assertEqual(leading_chars, "{{Infobox")

    self.assertIn("| name = Led Zeppelin", zeppelin_box.markup_source())
def test_infoboxes(self):
    """util.get_infoboxes returns a list of Infobox for a symbol or scraper."""
    scraper = InfoboxScraper(self.symbol)

    from_symbol_type = type(util.get_infoboxes(self.symbol))
    self.assertIs(list, from_symbol_type)

    first_item_type = type(util.get_infoboxes(self.symbol)[0])
    self.assertIs(Infobox, first_item_type)

    # A scraper instance is accepted in place of the symbol.
    from_scraper_type = type(util.get_infoboxes(scraper))
    self.assertIs(list, from_scraper_type)
def test_templates(self):
    """Both infobox templates of the article are reported, in any order."""
    expected = [
        "Template:Infobox officeholder",
        "Template:Infobox martial artist",
    ]
    boxes = get_infoboxes("Vladimir Putin", fetcher=self.fetcher)
    found = [box.template() for box in boxes]
    self.assertItemsEqual(found, expected)
def test_types_redirect(self):
    """The types of the Bill Clinton infobox include 'president'."""
    clinton_box = get_infoboxes("Bill Clinton", fetcher=self.fetcher)[0]
    box_types = clinton_box.types()
    self.assertIn("president", box_types)
def classify(self, symbol, fetcher=None):
    """
    Return the wikipedia class of every infobox found for ``symbol``.

    :param symbol: forwarded to ``get_infoboxes``
    :param fetcher: forwarded to ``get_infoboxes`` (default None)
    :returns: list with one ``wikipedia_class()`` value per infobox
    """
    boxes = get_infoboxes(symbol, fetcher=fetcher)
    return [box.wikipedia_class() for box in boxes]
def test_get(self):
    """``get`` returns the origin of The Rolling Stones infobox."""
    stones_box = get_infoboxes("The Rolling Stones", fetcher=self.fetcher)[0]
    origin = stones_box.get("origin")
    self.assertEqual(origin, "London, England")
def test_attributes(self):
    """The parsed markup keys of the Churchill infobox include death-place."""
    churchill_box = get_infoboxes("Winston Churchill",
                                  fetcher=self.fetcher)[0]
    keys = [key for key, _value in churchill_box.markup_parsed_iter()]
    self.assertIn("death-place", keys)
def test_html_attributes(self):
    """The rendered label for 'owners' in the BBC News infobox is 'Owners'."""
    bbc_box = get_infoboxes("BBC News", fetcher=self.fetcher)[0]
    labels = bbc_box.rendered_attributes()
    self.assertEqual("Owners", labels.get("owners"))
def test_classes(self):
    """Both infobox classes of the article are reported, in any order."""
    expected = ["wikipedia-officeholder", "wikipedia-martial-artist"]
    boxes = get_infoboxes("Vladimir Putin", fetcher=self.fetcher)
    found = [box.wikipedia_class() for box in boxes]
    self.assertItemsEqual(found, expected)
def infoboxes(self):
    """
    Return the infoboxes for this object's title, fetching them on demand.

    A truthy ``self._infoboxes`` is returned as is. Otherwise the result of
    ``get_infoboxes`` for ``self.title()`` is stored there and returned.
    """
    cached = self._infoboxes
    if cached:
        return cached

    self._infoboxes = get_infoboxes(self.title(), fetcher=self.fetcher)
    return self._infoboxes