def find_date(symbol, date_type):
    """
    Look up a birth or death date for an article.

    Every infobox class reported by the classifier is tried first; the
    first non-None infobox answer is returned as is. Otherwise the first
    paragraph of the article is scanned, one parenthesized span at a time.

    :param symbol: the article to look up
    :param date_type: 'birth-date' or 'death-date'
    :returns: the infobox answer, or a date lispified with typecode
        'yyyymmdd'; None (implicitly) when nothing was found
    """
    for wiki_class in InfoboxClassifier().classify(symbol):
        from_infobox = InfoboxResolver().resolve_infobox(
            wiki_class, symbol, date_type)
        if from_infobox is None:
            continue
        return from_infobox

    # TODO: look at categories for dates

    # Only the first paragraph is inspected.
    lead = get_article(symbol).paragraphs()[0]

    for start, end in iter_paren(lead, "."):
        chunk = lead[start:end]

        # A date range gives both ends: index 0 is used for the birth
        # date and index 1 for the death date.
        for date_range in overlay_parse.dates.just_ranges(chunk):
            if date_type == 'birth-date':
                return lispify(date_range[0], typecode='yyyymmdd')
            if date_type == 'death-date':
                return lispify(date_range[1], typecode='yyyymmdd')

        # No usable range in this span: a lone date is accepted, but only
        # when a birth date was requested.
        if date_type == 'birth-date':
            for single_date in overlay_parse.dates.just_dates(chunk):
                return lispify(single_date, typecode='yyyymmdd')
def synonyms(self, symbol):
    """
    Gather the synonyms proposed by every registered synonym inducer.

    :param symbol: the symbol handed to each inducer's ``induce``
    :returns: the lispified set of all induced synonyms (duplicates
        collapse because a set is used)
    """
    gathered = {
        synonym
        for inducer in self.synonym_inducers
        for synonym in inducer.induce(symbol)
    }
    return lispify(gathered)
def resolve_infobox(self, cls, symbol, attr):
    """
    Return the value of the attribute for the article.

    :param cls: infobox class forwarded to ``get_infoboxes``
    :param symbol: the article title
    :param attr: the attribute name, or a ``LispType`` whose ``val`` is
        the name and whose ``typecode`` overrides ``self._typecode``
    :returns: the lispified value from the first infobox holding a truthy
        value for the attribute, or None when no infobox has one
    """
    if "\n" in symbol:
        # There are no newlines in article titles
        return None

    if isinstance(attr, LispType):
        typecode, attr = attr.typecode, attr.val
    else:
        typecode = self._typecode

    infoboxes = get_infoboxes(symbol, cls=cls, fetcher=self.fetcher)

    for ibox in infoboxes:
        result = ibox.get(attr)

        if result:
            # Logging arguments are passed lazily, like the final warning
            # below: eager "%" formatting misbehaves for tuple arguments
            # and formats even when the record is filtered out.
            self.log().info("Found infobox attribute '%s'", attr)
            assert isinstance(result, unicode)  # TODO: remove for production

            return lispify(result, typecode=typecode, infobox_attr=attr)

        # This infobox had nothing usable; try the next one.
        self.log().warning("Could not find infobox attribute '%s'", attr)

    self.log().warning("Could not resolve attribute '%s' for '%s' with "
                       "class '%s'", attr, symbol, cls)
def url(self, article, _):
    """
    Return the article's URL, lispified with typecode 'url'.

    Note that this is the wikipedia.org url, NOT the place where the page
    was actually fetched from.
    """
    # Going through get_article also takes care of redirections.
    resolved = get_article(article)
    return lispify(resolved.url(), typecode='url')
def short_article(self, symbol, _):
    """
    Return the first paragraph of the article, HTML kept, lispified with
    typecode 'html'.

    The intended contract is to keep appending paragraphs until at least
    350 rendered characters are collected; that part is not implemented
    yet (see the TODO below).
    """
    # TODO: check if the first paragraph is shorter than 350 characters
    article = get_article(symbol)
    lead_html = article.first_paragraph(keep_html=True)
    return lispify(lead_html, typecode='html')
def number(self, article, _):
    """
    True if it is plural.

    Counts singular verb forms against plural verb forms in the first
    paragraph and returns the verdict lispified with typecode
    'calculated'.
    """
    # The first paragraph refers to the symbol itself more often than to
    # things related to it.
    lead = get_article(article).first_paragraph()

    singular_hits = sum(lead.count(verb) for verb in (' is ', ' was ', ' has '))
    plural_hits = sum(lead.count(verb) for verb in (' are ', ' were ', ' have '))

    # Strict inequality because singular forms are far more common, so a
    # tie counts as singular.
    return lispify(plural_hits > singular_hits, typecode='calculated')
def proper(self, article, _):
    """
    Quick boolean answer derived from the symbol text and the article text.

    The result, lispified with typecode 'calculated', is true when the
    lowercased name occurs (not counting occurrences right after ". ")
    fewer times in the article text than the name as written.
    """
    # Blindly copied from the ruby version.
    # Underscores become spaces and any parenthesized part is dropped.
    title = article.replace("_", " ")
    name = re.sub(r"\s*\(.*\)\s*", "", title)
    lowered = name.lower()

    text = totext(get_article(article).html_source())

    lowercase_hits = text.count(lowered)
    after_period_hits = text.count(". " + lowered)
    exact_hits = text.count(name)

    is_proper = lowercase_hits - after_period_hits < exact_hits
    return lispify(is_proper, typecode='calculated')
def get_callable(self, symbol):
    """
    Map a function name to the callable that implements it.

    A ``Symbol`` is looked up by name in ``self.resources()``. A
    ``Keyword`` yields a callable that lispifies its positional arguments
    using the keyword's name as typecode.

    :raises TypeError: when ``symbol`` is neither a Symbol nor a Keyword
    """
    if isinstance(symbol, Symbol):
        available = self.resources()
        return available[symbol._name]

    if isinstance(symbol, Keyword):
        def lispify_with_keyword(*args):
            return lispify(*args, typecode=symbol._name)

        return lispify_with_keyword

    message = "Could not resolve function %s (type %s)." % (
        symbol, str(type(symbol)))
    raise TypeError(message)
def attributes(self, cls, symbol):
    """
    Get all infobox attributes.

    For every key yielded by each infobox's ``markup_parsed_iter`` an
    entry is produced holding the upper-cased key under ``code`` and the
    matching rendered attribute (looked up with dashes turned into
    underscores) under ``rendered``.

    :returns: the lispified list of those entries
    """
    collected = []

    for ibox in get_infoboxes(symbol, cls=cls, fetcher=self.fetcher):
        # Only the key is needed; the parsed markup value is ignored.
        for key, _ in ibox.markup_parsed_iter():
            rendered_name = key.replace('-', '_')
            rendered_value = ibox.rendered_attributes().get(rendered_name)
            collected.append({'code': key.upper(),
                              'rendered': rendered_value})

    return lispify(collected)
def get(self, cls, symbol, attr):
    """
    Gets the value of a symbol's attribute.

    The resolvers are asked in order and the first non-None answer wins.

    :param cls: Wikipedia class of the symbol
    :param symbol: the Wikipedia article
    :param attr: the attribute to get
    :returns: a lispified one-element list holding the attribute's value,
        or holding None when no resolver produced an answer
    """
    # Start from None so an empty resolver list yields [None] instead of
    # failing on an unbound local.
    res = None

    for ar in self.resolvers:
        res = ar.resolve(cls, symbol, attr)

        if res is not None:
            break

    return lispify([res])
def coordinates(self, article, _):
    """
    Extract the coordinates of an article from its infoboxes.

    Each infobox is inspected in turn; the first one whose HTML contains
    both a latitude and a longitude span inside the coordinates span
    provides the answer. Infoboxes without HTML or without both spans are
    skipped rather than ending the search.

    :param article: the article whose infoboxes are searched
    :returns: ``[latitude, longitude]`` converted with ``self._dton`` and
        lispified with typecode 'coordinates', or None when no infobox
        carries coordinates
    """
    xpath = ".//span[@id='coordinates']"

    for ibox in get_infoboxes(article):
        src = ibox.html_source()
        if src is None:
            # Nothing to search here; a later infobox may still help.
            continue

        lat = src.find(xpath + "//span[@class='latitude']")
        lon = src.find(xpath + "//span[@class='longitude']")

        if lat is None or lon is None:
            continue

        nlat = self._dton(totext(lat))
        nlon = self._dton(totext(lon))

        return lispify([nlat, nlon], typecode='coordinates')

    return None
def image(self, article, attribute):
    """
    Return the image file name of an article, with its caption if any.

    The first non-empty ``image`` attribute found across the article's
    infoboxes is used. Spaces become underscores, and anything up to and
    including a ``File:`` prefix as well as a trailing ``{{!}}border``
    annotation is stripped. The first non-empty ``caption`` attribute, if
    there is one, is appended after being passed through
    ``markup_unlink``.

    :param article: the article whose infoboxes are searched
    :param attribute: unused
    :returns: lispified ``[0, file_name]`` or ``[0, file_name, caption]``,
        or None when no infobox has an image
    """
    # NOTE(review): the original comment here read "Make sure we are not
    # getting back a LispType", but no such check is visible in this block.

    # Materialize once: the infoboxes are walked twice below.
    infoboxes = list(get_infoboxes(article))

    # Drop infoboxes without an image, just like captions are filtered
    # further down; otherwise a leading None would break .replace().
    imgs = [img for img in (ibx.get('image') for ibx in infoboxes) if img]
    if not imgs:
        return None

    img = imgs[0]
    fnam = img.replace(" ", "_")
    if "File:" in img:
        fnam = fnam.split("File:")[1]

    # TODO : is this a temporary fix? investigate what this annotation means
    # see 'Bill Clinton' for an example
    if "{{!}}border" in img:
        fnam = fnam.split("{{!}}border")[0]

    # A list comprehension (rather than filter) keeps the result
    # indexable on every Python version.
    caps = [cap for cap in (ibx.get('caption') for ibx in infoboxes) if cap]

    # NOTE(review): the meaning of the leading 0 is not visible here.
    result = [0, fnam]
    if caps:
        result.append(markup_unlink(caps[0]))

    return lispify(result)
def test_date_with_range(self):
    # Per the original note: 2010 lies inside the given range, so it
    # takes precedence over the 8.8.1991 style dates.
    text = "2010 8.9.1991 - 2012 on August the 8th 1991"
    rendered = lispify(text, typecode="yyyymmdd")

    self.assertEqual(rendered, '(:yyyymmdd 20100000)')
def test_date_multiple_voting(self):
    # Several date candidates in one string; the bare year is expected.
    text = "2010 8.8.1991 on August the 8th 1991"
    rendered = lispify(text, typecode="yyyymmdd")

    self.assertEqual(rendered, '(:yyyymmdd 20100000)')
def test_date_simple(self):
    # A date without a year: the year digits are expected to be zeros.
    text = "coming on August the 8th"
    rendered = lispify(text, typecode="yyyymmdd")

    self.assertEqual(rendered, '(:yyyymmdd 00000808)')
def test_list_of_dict_with_typecode(self):
    # With a typecode, the dicts are expected directly after the
    # typecode keyword, without an extra list wrapper.
    records = [{'foo': 'bar'}, {'foo': 'baz'}]
    expected = '(:html (:foo "bar") (:foo "baz"))'

    self.assertEqual(lispify(records, typecode='html'), expected)
def test_list_of_dict(self):
    # Each dict is expected as a keyword/value group inside the list.
    records = [{'foo': 'bar'}, {'foo': 'baz'}]
    expected = '((:foo "bar") (:foo "baz"))'

    self.assertEqual(lispify(records), expected)
def test_double_nested_list(self):
    # Lists nested two levels deep keep their structure.
    nested = [[0, ['v0', 'foo']], [1, ['v1', 'bar']]]
    expected = '((0 ("v0" "foo")) (1 ("v1" "bar")))'

    self.assertEqual(lispify(nested), expected)
def test_dict_with_escaped_string(self):
    # Double quotes inside a string value are expected backslash-escaped.
    mapping = {'a': 1, 'b': '"foo"'}
    expected = '(:a 1 :b "\\"foo\\"")'

    self.assertEqual(lispify(mapping), expected)
def test_keyword_with_typecode(self):
    # A keyword-looking string stays unquoted when wrapped in a typecode.
    rendered = lispify(':feminine', typecode='calculated')

    self.assertEqual(rendered, '(:calculated :feminine)')
def test_keyword(self):
    # A keyword-looking string is emitted verbatim, without quotes.
    rendered = lispify(':feminine')

    self.assertEqual(rendered, ":feminine")
def test_bool(self):
    # True maps to 't' and False to 'nil', checked in that order.
    for value, expected in ((True, 't'), (False, 'nil')):
        self.assertEqual(lispify(value), expected)
def test_bool_with_typecode(self):
    # False becomes nil inside the typecode wrapper.
    rendered = lispify(False, typecode='calculated')

    self.assertEqual(rendered, '(:calculated nil)')
def test_string_with_typecode(self):
    # A plain string is quoted and placed after the typecode keyword.
    rendered = lispify("bar", typecode="html")

    self.assertEqual(rendered, '(:html "bar")')
def test_string_not_keyword(self):
    # A leading colon alone does not make a keyword: this string
    # contains spaces and is expected to be quoted.
    rendered = lispify(':not a keyword')

    self.assertEqual(rendered, '":not a keyword"')
def test_list(self):
    # A flat list of strings becomes a parenthesized list of quoted strings.
    classes = ['wikipedia-class1', 'wikipedia-class2']
    expected = '("wikipedia-class1" "wikipedia-class2")'

    self.assertEqual(lispify(classes), expected)
def test_dict(self):
    # Dict keys are expected as keywords, each followed by its value.
    mapping = {'a': 1, 'b': "foo"}
    expected = '(:a 1 :b "foo")'

    self.assertEqual(lispify(mapping), expected)
def test_list_with_typecode(self):
    # With a typecode the list items follow the keyword directly.
    pair = [44, 35]
    expected = '(:coordinates 44 35)'

    self.assertEqual(lispify(pair, typecode='coordinates'), expected)
def test_dict_with_list(self):
    # A list stored as a dict value is rendered as a nested list.
    mapping = {'a': 1, 'b': ['foo', 'bar']}
    expected = '(:a 1 :b ("foo" "bar"))'

    self.assertEqual(lispify(mapping), expected)
def test_nested_list(self):
    # Nested lists keep their structure and inner quotes get escaped.
    nested = [[0, 'foo'], [1, '"bar"']]
    expected = '((0 "foo") (1 "\\"bar\\""))'

    self.assertEqual(lispify(nested), expected)