Beispiel #1
0
def find_date(symbol, date_type):
    """
    Look up the birth or death date of the article named ``symbol``.

    Every infobox class reported by the classifier is tried in order and
    the first non-None answer is returned. Otherwise the spans yielded by
    ``iter_paren`` over the first paragraph are scanned for a date range
    (birth = range start, death = range end) or, for birth dates only, a
    single date.

    Falls off the end (returning None) when nothing is found.
    """
    infobox_answers = (
        InfoboxResolver().resolve_infobox(cls, symbol, date_type)
        for cls in InfoboxClassifier().classify(symbol)
    )
    from_infobox = next(
        (answer for answer in infobox_answers if answer is not None), None)
    if from_infobox is not None:
        return from_infobox

    # TODO: look at categories for dates

    first_paragraph = get_article(symbol).paragraphs()[0]
    for start, end in iter_paren(first_paragraph, "."):
        span = first_paragraph[start:end]

        for date_range in overlay_parse.dates.just_ranges(span):
            if date_type == 'birth-date':
                return lispify(date_range[0], typecode='yyyymmdd')
            if date_type == 'death-date':
                return lispify(date_range[1], typecode='yyyymmdd')

        # A lone date can only be interpreted as a birth date.
        if date_type == 'birth-date':
            for lone_date in overlay_parse.dates.just_dates(span):
                return lispify(lone_date, typecode='yyyymmdd')
    def synonyms(self, symbol):
        """
        Gather the synonyms proposed for ``symbol`` by every inducer in
        ``self.synonym_inducers``, de-duplicated through a set, and
        return them lispified.
        """
        induced = {
            synonym
            for inducer in self.synonym_inducers
            for synonym in inducer.induce(symbol)
        }
        return lispify(induced)
Beispiel #3
0
    def resolve_infobox(self, cls, symbol, attr):
        """
        Return the value of the attribute for the article.

        :param cls: infobox class, forwarded to ``get_infoboxes``
        :param symbol: the article title; a title containing a newline is
            rejected immediately
        :param attr: the attribute name, or a ``LispType`` whose ``val`` is
            the name and whose ``typecode`` replaces ``self._typecode``
        :returns: the lispified value taken from the first infobox holding
            a truthy value for the attribute, or None if there is none
        """

        if "\n" in symbol:
            # There are no newlines in article titles
            return None

        if isinstance(attr, LispType):
            typecode, attr = attr.typecode, attr.val
        else:
            typecode = self._typecode

        infoboxes = get_infoboxes(symbol, cls=cls, fetcher=self.fetcher)

        for ibox in infoboxes:
            result = ibox.get(attr)
            if result:
                # Logger-side formatting, matching the final warning below;
                # it also copes with an ``attr`` that happens to be a tuple.
                self.log().info("Found infobox attribute '%s'", attr)
                assert(isinstance(result, unicode))  # TODO: remove for production

                return lispify(result, typecode=typecode, infobox_attr=attr)

            # Logged once per infobox that lacks the attribute.
            self.log().warning("Could not find infobox attribute '%s'", attr)

        self.log().warning("Could not resolve attribute '%s' for '%s' with "
                           "class '%s'", attr, symbol, cls)
        return None
Beispiel #4
0
 def url(self, article, _):
     """
     Return the article's url, lispified with typecode 'url'.

     Note that this is the wikipedia.org url, NOT the place the page was
     actually fetched from.
     """
     # get_article will also take care of redirections.
     resolved = get_article(article)
     return lispify(resolved.url(), typecode='url')
Beispiel #5
0
    def short_article(self, symbol, _):
        """
        Return the first paragraph of the article as lispified html.

        The intended contract is to keep adding paragraphs until at least
        350 rendered characters are collected, but that is not implemented
        yet: only the first paragraph is ever returned.
        """

        # TODO: check if the first paragraph is shorter than 350 characters
        article = get_article(symbol)
        html = article.first_paragraph(keep_html=True)
        return lispify(html, typecode='html')
Beispiel #6
0
    def number(self, article, _):
        """
        Guess grammatical number: lispified true when the article reads
        as plural.

        Counts singular verb forms against plural ones in the first
        paragraph and answers true only if the plural forms are strictly
        more frequent.
        """
        singular_verbs = (' is ', ' was ', ' has ')
        plural_verbs = (' are ', ' were ', ' have ')

        # The first paragraph refers to the symbol itself more often than
        # to things related to it.
        paragraph = get_article(article).first_paragraph()

        singular_hits = sum(paragraph.count(verb) for verb in singular_verbs)
        plural_hits = sum(paragraph.count(verb) for verb in plural_verbs)

        # Strict inequality because singular forms are far more common.
        is_plural = plural_hits > singular_hits
        return lispify(is_plural, typecode='calculated')
Beispiel #7
0
    def proper(self, article, _):
        """
        Quick boolean heuristic computed from the symbol text and the
        article text.

        The title (underscores turned into spaces, parenthesised part
        removed) is counted in the article text as written and in lower
        case; lower-case hits right after ". " are discounted. The answer
        is true when the remaining lower-case count is smaller than the
        as-written count.
        """

        # Heuristic taken over unchanged from the ruby version.
        title = re.sub(r"\s*\(.*\)\s*", "", article.replace("_", " "))
        text = totext(get_article(article).html_source())

        lowered_hits = text.count(title.lower())
        after_period_hits = text.count(". " + title.lower())
        verbatim_hits = text.count(title)

        verdict = lowered_hits - after_period_hits < verbatim_hits
        return lispify(verdict, typecode='calculated')
Beispiel #8
0
    def get_callable(self, symbol):
        """
        Map a function name onto something callable.

        A ``Symbol`` is looked up by ``_name`` in ``self.resources()``. A
        ``Keyword`` yields a function that lispifies its arguments using
        the keyword's ``_name`` as typecode. Anything else raises
        ``TypeError``.
        """

        if isinstance(symbol, Symbol):
            available = self.resources()
            return available[symbol._name]

        if isinstance(symbol, Keyword):
            return lambda *args: lispify(*args, typecode=symbol._name)

        message = ("Could not resolve function %s (type %s)."
                   % (symbol, str(type(symbol))))
        raise TypeError(message)
Beispiel #9
0
    def attributes(self, cls, symbol):
        """
        List every infobox attribute of the article.

        Each entry pairs the upper-cased attribute key (``code``) with its
        rendered counterpart (``rendered``), looked up under the key with
        dashes replaced by underscores. The list is returned lispified.
        """

        collected = []

        for ibox in get_infoboxes(symbol, cls=cls, fetcher=self.fetcher):
            for key, _unused in ibox.markup_parsed_iter():
                rendered_key = key.replace('-', '_')
                rendered = ibox.rendered_attributes().get(rendered_key)
                collected.append({'code': key.upper(), 'rendered': rendered})

        return lispify(collected)
    def get(self, cls, symbol, attr):
        """
        Gets the value of a symbol's attribute.

        The resolvers in ``self.resolvers`` are asked in order and the
        first non-None answer is used.

        :param cls: Wikipedia class of the symbol
        :param symbol: the Wikipedia article
        :param attr: the attribute to get
        :returns: a lispified one-element list holding the attribute's
            value, or holding None when no resolver produced a value
        """
        # Start from None so that an empty resolver list yields a
        # well-defined result instead of an UnboundLocalError below.
        res = None
        for ar in self.resolvers:
            res = ar.resolve(cls, symbol, attr)
            if res is not None:
                break

        return lispify([res])
Beispiel #11
0
    def coordinates(self, article, _):
        """
        Read latitude and longitude out of the article's infobox html.

        Returns the lispified pair with typecode 'coordinates', or None
        when the infobox has no html source, lacks either span, or the
        article has no infobox at all.
        """
        # NOTE(review): every path through the loop body returns, so only
        # the first infobox is ever inspected -- confirm this is intended.
        coordinates_span = ".//span[@id='coordinates']"

        for ibox in get_infoboxes(article):
            html = ibox.html_source()
            if html is None:
                return None

            latitude = html.find(
                coordinates_span + "//span[@class='latitude']")
            longitude = html.find(
                coordinates_span + "//span[@class='longitude']")

            if latitude is None or longitude is None:
                return None

            pair = [self._dton(totext(latitude)),
                    self._dton(totext(longitude))]
            return lispify(pair, typecode='coordinates')
Beispiel #12
0
    def image(self, article, attribute):
        """
        Return the article's infobox image, lispified.

        The result is the lispified list ``[0, file_name]``, extended with
        the unlinked caption when some infobox provides one. The file name
        is the first non-empty 'image' value with spaces turned into
        underscores, any leading "File:" part dropped and a trailing
        "{{!}}border" annotation removed.

        :param article: the article whose infoboxes are inspected
        :param attribute: unused
        :returns: the lispified list, or None when no infobox has an image
        """
        # Materialise once: the infoboxes are walked twice (images, then
        # captions), which would silently fail on a one-shot iterator.
        infoboxes = list(get_infoboxes(article))

        # Ignore infoboxes without an image; otherwise a leading image-less
        # infobox would make us call .replace() on None.
        imgs = [img for img in (ibx.get('image') for ibx in infoboxes) if img]
        if not imgs:
            return None

        img = imgs[0]
        fnam = img.replace(" ", "_")
        if "File:" in img:
            fnam = fnam.split("File:")[1]

        # TODO : is this a temporary fix? investigate what this annotation means
        # see 'Bill Clinton' for an example
        if "{{!}}border" in img:
            fnam = fnam.split("{{!}}border")[0]

        # A real list (not a filter object) so that indexing works on
        # every Python version.
        caps = [cap for cap in (ibx.get('caption') for ibx in infoboxes)
                if cap]

        result = [0, fnam]
        if caps:
            result.append(markup_unlink(caps[0]))
        return lispify(result)
Beispiel #13
0
 def test_date_with_range(self):
     """A text mixing a lone year, a range and a full date lispifies to the year."""
     # 2010 lies inside the given range, so it takes precedence over 8.8.1991
     text = "2010 8.9.1991 - 2012 on August the 8th 1991"
     expected = '(:yyyymmdd 20100000)'
     self.assertEqual(lispify(text, typecode="yyyymmdd"), expected)
Beispiel #14
0
 def test_date_multiple_voting(self):
     """With several date candidates in the text, the year 2010 is the result."""
     text = "2010 8.8.1991 on August the 8th 1991"
     expected = '(:yyyymmdd 20100000)'
     self.assertEqual(lispify(text, typecode="yyyymmdd"), expected)
Beispiel #15
0
 def test_date_simple(self):
     """A day and month without a year lispify with a zeroed year part."""
     text = "coming on August the 8th"
     expected = '(:yyyymmdd 00000808)'
     self.assertEqual(lispify(text, typecode="yyyymmdd"), expected)
Beispiel #16
0
 def test_list_of_dict_with_typecode(self):
     """A typecode given for a list of dicts prefixes the whole list."""
     records = [{'foo': 'bar'}, {'foo': 'baz'}]
     expected = '(:html (:foo "bar") (:foo "baz"))'
     self.assertEqual(lispify(records, typecode='html'), expected)
Beispiel #17
0
 def test_list_of_dict(self):
     """Each dict in a list becomes its own keyword/value group."""
     records = [{'foo': 'bar'}, {'foo': 'baz'}]
     expected = '((:foo "bar") (:foo "baz"))'
     self.assertEqual(lispify(records), expected)
Beispiel #18
0
 def test_double_nested_list(self):
     """Lists nested two levels deep keep their nesting when lispified."""
     nested = [[0, ['v0', 'foo']], [1, ['v1', 'bar']]]
     expected = '((0 ("v0" "foo")) (1 ("v1" "bar")))'
     self.assertEqual(lispify(nested), expected)
Beispiel #19
0
 def test_dict_with_escaped_string(self):
     """Double quotes inside a dict's string value are backslash-escaped."""
     mapping = {'a': 1, 'b': '"foo"'}
     expected = '(:a 1 :b "\\"foo\\"")'
     self.assertEqual(lispify(mapping), expected)
Beispiel #20
0
 def test_keyword_with_typecode(self):
     """A keyword-like string stays unquoted inside a typecode wrapper."""
     expected = '(:calculated :feminine)'
     actual = lispify(':feminine', typecode='calculated')
     self.assertEqual(actual, expected)
Beispiel #21
0
 def test_keyword(self):
     """A keyword-like string lispifies to itself, without quotes."""
     actual = lispify(':feminine')
     self.assertEqual(actual, ":feminine")
Beispiel #22
0
 def test_bool(self):
     """True lispifies to 't' and False to 'nil'."""
     for value, expected in ((True, 't'), (False, 'nil')):
         self.assertEqual(lispify(value), expected)
Beispiel #23
0
 def test_bool_with_typecode(self):
     """False under a typecode is rendered as nil inside the wrapper."""
     expected = '(:calculated nil)'
     actual = lispify(False, typecode='calculated')
     self.assertEqual(actual, expected)
Beispiel #24
0
 def test_string_with_typecode(self):
     """A plain string under a typecode is quoted inside the wrapper."""
     actual = lispify("bar", typecode="html")
     self.assertEqual(actual, '(:html "bar")')
Beispiel #25
0
 def test_string_not_keyword(self):
     """A colon-prefixed string containing spaces is quoted like any string."""
     actual = lispify(':not a keyword')
     self.assertEqual(actual, '":not a keyword"')
Beispiel #26
0
 def test_list(self):
     """A flat list of strings becomes a parenthesised, space-separated list."""
     classes = ['wikipedia-class1', 'wikipedia-class2']
     expected = '("wikipedia-class1" "wikipedia-class2")'
     self.assertEqual(lispify(classes), expected)
Beispiel #27
0
 def test_dict(self):
     """Dict keys become keywords, each followed by its lispified value."""
     mapping = {'a': 1, 'b': "foo"}
     expected = '(:a 1 :b "foo")'
     self.assertEqual(lispify(mapping), expected)
Beispiel #28
0
 def test_list_with_typecode(self):
     """A typecode given for a flat list is spliced in front of its items."""
     pair = [44, 35]
     expected = '(:coordinates 44 35)'
     self.assertEqual(lispify(pair, typecode='coordinates'), expected)
Beispiel #29
0
 def test_dict_with_list(self):
     """A list stored as a dict value is lispified as a nested list."""
     mapping = {'a': 1, 'b': ['foo', 'bar']}
     expected = '(:a 1 :b ("foo" "bar"))'
     self.assertEqual(lispify(mapping), expected)
Beispiel #30
0
 def test_nested_list(self):
     """Inner lists are lispified recursively, escaping embedded quotes."""
     nested = [[0, 'foo'], [1, '"bar"']]
     expected = '((0 "foo") (1 "\\"bar\\""))'
     self.assertEqual(lispify(nested), expected)