Example #1
0
class WikipediaCategories(WikipediaQuery):
    """
    Wikipedia query that requests the ``categories`` prop for pages.

    The class level ``clshow`` value is only a placeholder. ``parameters``
    replaces it per instance with ``config.wiki_show_categories``.
    """

    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "cllimit": 500,
            # gets set at runtime through "wiki_show_categories" config with "!hidden" by default
            "clshow": "",
            # generator style: info|pageprops|categoryinfo
            "prop": "categories"
        })

    class Meta:
        verbose_name = "Wikipedia category"
        verbose_name_plural = "Wikipedia categories"

    def parameters(self, **kwargs):
        """
        Return a copy of ``PARAMETERS`` with ``clshow`` taken from the config.

        :param kwargs: accepted but not used by this implementation
        :return: (dict) a new dictionary; ``PARAMETERS`` itself stays untouched
        """
        return dict(self.PARAMETERS, clshow=self.config.wiki_show_categories)

    def variables(self, *args):
        """
        Extend the parent's variables with a ``show`` entry.

        Only the first three positional arguments are forwarded to the parent.
        A fourth argument, when given, becomes ``show``; otherwise the class
        level ``clshow`` parameter is used.

        :param args: positional arguments of the request
        :return: the parent's variables with ``show`` added
        """
        # Resolved before calling the parent, same as the lookup order has always been
        if len(args) > 3:
            show_value = args[3]
        else:
            show_value = self.PARAMETERS["clshow"]
        result = super(WikipediaCategories, self).variables(*args[:3])
        result["show"] = show_value
        return result
Example #2
0
class WikipediaCategoryMembers(WikipediaGenerator):
    """
    Generator style Wikipedia query that uses the ``categorymembers`` generator
    and asks for the ``info`` prop, restricted to namespace 0.
    """

    # Name of the API parameter that carries the query value.
    # NOTE(review): presumably this receives the category title - confirm against the base classes.
    WIKI_QUERY_PARAM = "gcmtitle"

    PARAMETERS = override_dict(
        WikipediaGenerator.PARAMETERS,
        {
            "generator": "categorymembers",
            "gcmlimit": 500,
            "gcmnamespace": 0,
            "prop": "info"
        })
Example #3
0
class WikipediaTransclusions(WikipediaGenerator):
    """
    Generator style Wikipedia query that uses the ``transcludedin`` generator.

    Redirects are excluded through ``gtishow`` and up to 500 results are
    requested per call through ``gtilimit``.
    """

    class Meta:
        verbose_name = "Wikipedia transclusions"
        verbose_name_plural = "Wikipedia transclusions"

    PARAMETERS = override_dict(
        WikipediaGenerator.PARAMETERS,
        {
            "generator": "transcludedin",
            "gtishow": "!redirect",
            "gtilimit": 500
        })
Example #4
0
class WikipediaRevisions(WikipediaPage):
    """
    Wikipedia page query that requests the ``revisions`` prop.

    Each revision is requested with content, user and timestamp. The
    direction sent to the API is ``older`` with a limit of 500 per call.
    """

    class Meta:
        verbose_name = "Wikipedia revisions"
        verbose_name_plural = "Wikipedia revisions"

    PARAMETERS = override_dict(
        WikipediaPage.PARAMETERS,
        {
            "prop": "revisions",
            "rvlimit": 500,
            "rvprop": "content|user|timestamp",
            "rvdir": "older"
        })
Example #5
0
class WikipediaCategoryMembers(WikipediaGenerator):
    """
    Generator style Wikipedia query that uses the ``categorymembers`` generator.

    Besides ``info`` it also requests ``pageprops`` and ``categories`` for
    every member, leaving out hidden categories through ``clshow``.
    """

    class Meta:
        verbose_name = "Wikipedia category members"
        verbose_name_plural = "Wikipedia category members"

    # Name of the API parameter that carries the query value.
    # NOTE(review): presumably this receives the category title - confirm against the base classes.
    WIKI_QUERY_PARAM = "gcmtitle"

    PARAMETERS = override_dict(
        WikipediaGenerator.PARAMETERS,
        {
            # generator settings
            "generator": "categorymembers",
            "gcmlimit": 100,
            "gcmnamespace": 0,
            # properties requested for every generated page
            "prop": "info|pageprops|categories",
            "clshow": "!hidden",
            "cllimit": 500,
        })
Example #6
0
class WikipediaSearch(WikipediaQuery, WikipediaImagesMixin):
    """
    Wikipedia query that requests info, pageprops, extracts and (non hidden)
    categories, and that flags disambiguation pages as an ambiguous search.
    """

    URI_TEMPLATE = 'http://{}.wikipedia.org/w/api.php?{}={}'
    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS, {
            "prop": "info|pageprops|extracts|categories",
            "exintro": 1,
            "clshow": "!hidden",
            "cllimit": 500
        })
    # Positional arguments must be an array of exactly three items.
    # NOTE(review): the meaning of "kwargs": None is defined by the validating base class - confirm.
    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [{
                "type": "string"
            }],
            "minItems": 3,
            "maxItems": 3
        },
        "kwargs": None
    }

    def parameters(self, **kwargs):
        """
        Return a copy of ``PARAMETERS`` with ``exintro`` derived from the config.

        ``exintro`` becomes 0 when ``config.wiki_full_extracts`` is truthy and 1 otherwise.

        :param kwargs: accepted but not used by this implementation
        :return: (dict) a copy of the class level parameters
        """
        parameters = copy(self.PARAMETERS)
        parameters["exintro"] = int(not self.config.wiki_full_extracts)
        return parameters

    def handle_errors(self):
        """
        Handle ambiguity errors

        Runs the parent's error handling first and then inspects the returned pages.
        When any page carries a "disambiguation" page property the status is set to 300
        and a warning is raised.

        :raises DSHttpWarning300: when one of the pages is a disambiguation page
        """
        response = super(WikipediaSearch, self).handle_errors()
        if response is None:
            # The parent implementation does not necessarily hand back the results it inspected.
            # Without this fallback the ambiguity check below would silently never run.
            content_type, data = self.content
            if isinstance(data, dict):
                response = data.get("query", {}).get(self.WIKI_RESULTS_KEY)
        if not isinstance(response, dict):
            return
        for page in six.itervalues(response):
            try:
                pageprops = page['pageprops']
            except KeyError:
                # Pages without page properties can't be disambiguation pages
                continue
            if "disambiguation" in pageprops:
                self.status = 300
                raise DSHttpWarning300("The search is ambiguous.",
                                       resource=self)

    class Meta:
        verbose_name = "Wikipedia search"
        verbose_name_plural = "Wikipedia searches"
Example #7
0
class WikipediaTranslate(WikipediaPage):
    """
    Page query against Wiktionary that requests interwiki links (``iwlinks``)
    and treats a response without them as "no translations found".
    """

    # updated at runtime
    URI_TEMPLATE = 'http://{}.wiktionary.org/w/api.php?{}={}&iwprefix={}'

    PARAMETERS = override_dict(
        WikipediaPage.PARAMETERS,
        {
            'prop': 'info|pageprops|iwlinks',
            'iwprop': 'url',
        })

    @property
    def meta(self):
        """
        The third and fourth positional request arguments as a tuple.

        :return: ``(args[2], args[3])`` or ``(None, None)`` when the request
            or one of these arguments is not available
        """
        try:
            request_args = self.request["args"]
            return request_args[2], request_args[3]
        except (KeyError, IndexError, TypeError):
            return None, None

    def handle_errors(self):
        """
        Run the parent's error handling and fail with a 404 when the body
        holds no "iwlinks".

        :raises DGHttpError40X: when "iwlinks" does not occur in the body
        """
        super(WikipediaTranslate, self).handle_errors()
        if "iwlinks" in self.body:
            return
        # Status gets set before the error message is formatted and raised
        self.status = 404
        message = "No translations found for {} in {}".format(*self.meta)
        raise DGHttpError40X(message, resource=self)
Example #8
0
class WikipediaListPages(WikipediaQuery, WikipediaImagesMixin):
    """
    Wikipedia query that addresses pages through the ``pageids`` parameter
    and requests info, pageprops and (non hidden) categories for them.
    """

    class Meta:
        verbose_name = "Wikipedia list pages"
        verbose_name_plural = "Wikipedia list pages"

    # Pages get addressed by id instead of the parent's default query parameter
    WIKI_QUERY_PARAM = "pageids"

    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "prop": "info|pageprops|categories",
            "clshow": "!hidden",
            "cllimit": 500
        })

    # Positional arguments must be an array of exactly three items.
    GET_SCHEMA = {
        "args": {
            "type": "array",
            # TODO: validate with pattern?
            "items": [{"type": "string"}],
            "minItems": 3,
            "maxItems": 3
        },
        "kwargs": None
    }
Example #9
0
class WikipediaRecentChanges(WikipediaQuery):
    """
    Wikipedia ``recentchanges`` list query for a time window.

    The window comes from the config (``start_time`` and ``end_time``) and
    not from the arguments that get passed to ``send``.
    """

    class Meta:
        verbose_name = "Wikipedia recent changes"
        verbose_name_plural = "Wikipedia recent changes"

    CONFIG_NAMESPACE = "wikipedia"
    WIKI_RESULTS_KEY = "recentchanges"

    URI_TEMPLATE = 'http://{}.wikipedia.org/w/api.php?rcstart={}&rcend={}'
    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "list": "recentchanges",
            "rcnamespace": 0,
            "rcshow": "!bot|!minor|!redirect",
            "rclimit": 500,
            "rcprop": "ids|title|comment|timestamp|tags|user",
            "rcdir": "newer"
        })
    # Positional arguments must be an array of exactly three items.
    # Only the first two positions have a declared type.
    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [
                {"type": "string"},
                {"type": "integer"},
            ],
            "minItems": 3,
            "maxItems": 3
        },
        "kwargs": None
    }

    def send(self, method, *args, **kwargs):
        """
        Send a request for the configured country and time window.

        Any positional arguments given by the caller are discarded and
        replaced with ``(wiki_country, start_time, end_time)`` from the config.

        :param method: passed through to the parent's ``send``
        :param args: ignored
        :param kwargs: passed through to the parent's ``send``
        :return: whatever the parent's ``send`` returns
        """
        config = self.config
        window_args = (
            config.wiki_country,
            int(config.start_time),
            int(config.end_time),
        )
        # super(WikipediaQuery, ...) starts the lookup *after* WikipediaQuery,
        # which means that WikipediaQuery.send is skipped on purpose here.
        return super(WikipediaQuery, self).send(method, *window_args, **kwargs)
Example #10
0
class WikiDataItems(WikipediaAPI):
    """
    Resource for the Wikidata ``wbgetentities`` action.

    The ``content`` property turns every returned entity into an item with
    simplified claims and references (see ``get_item``).
    """

    URI_TEMPLATE = "https://www.wikidata.org/w/api.php?ids={}"

    PARAMETERS = override_dict(
        WikipediaAPI.PARAMETERS, {
            "action": "wbgetentities",
            "languages": "en",
            "redirects": "yes",
            "props": "info|claims|descriptions"
        })

    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [{
                "type": "string"
            }],  # TODO: use a pattern?
        },
        "kwargs": None
    }

    class Meta:
        verbose_name = "Wikidata items"
        verbose_name_plural = "Wikidata items"

    def get_entity(self, snak):
        """
        Turns Wikidata into a more readable entity data structure

        Snaks of type "novalue" or "somevalue" result in an entity with a value of None.
        Snaks with the "wikibase-item" datatype get their value rendered as "Q<numeric-id>".
        All other snaks keep their raw value.

        :param (dict) snak: a Wikidata specific data structure: https://www.mediawiki.org/wiki/Wikibase/DataModel#Snaks
        :return: A tuple with the format (entity, is_item).
        """
        assert "property" in snak and "datatype" in snak and "snaktype" in snak, \
            "Wikidata snacs should have a property and datatype specified"
        is_item = snak["datatype"] == "wikibase-item"
        if snak["snaktype"] in ("novalue", "somevalue"):
            # These snak types carry no "datavalue" to read a value from
            return {
                "property": snak["property"],
                "type": snak["datatype"],
                "value": None
            }, is_item
        value = snak["datavalue"]["value"]
        if is_item:
            return {
                "property": snak["property"],
                "value": "Q{}".format(value["numeric-id"]),
                "type": value["entity-type"]
            }, True
        return {
            "property": snak["property"],
            "value": value,
            "type": snak["datatype"]
        }, False

    def get_item(self, raw_item_data):
        """
        Turns a raw Wikidata entity into an item with simplified claims.

        The returned item is a shallow copy of the input where:
         * "claims" is a flat list of entities (see ``get_entity``), each with a "references" list
         * "references" is a list of the unique "<property>:<value>" references of all claims,
           in no particular order because it gets built from a set
         * "description" is the English description or a fallback text.
           "descriptions" gets removed only when the English description exists.

        The input dictionary itself is left untouched, which makes it safe to call this method
        more than once with the same raw data.

        :param (dict) raw_item_data: an entity as returned by the wbgetentities action
        :return: (dict) the item
        """
        raw_claims = []
        for raw_claims_list in raw_item_data.get("claims", {}).values():
            raw_claims += raw_claims_list
        claim_entities = []
        references = set()
        for raw_claim in raw_claims:
            claim_entity, _ = self.get_entity(raw_claim["mainsnak"])
            claim_entity["references"] = []
            for references_data in raw_claim.get("references", []):
                # We filter out the first item as the reference for all raw_references
                # This discards reference dates and reference URL data from the reference
                raw_references = []
                for raw_references_list in references_data["snaks"].values():
                    raw_references += raw_references_list
                raw_reference_entities = map(self.get_entity, raw_references)
                reference_entity = next(
                    (ref for ref, is_item in raw_reference_entities if is_item),
                    None
                )
                if reference_entity is None:
                    continue
                reference = "{}:{}".format(reference_entity["property"],
                                           reference_entity["value"])
                claim_entity["references"].append(reference)
                references.add(reference)
            claim_entities.append(claim_entity)
        # A shallow copy prevents that "claims" in the caller's data gets replaced by a list,
        # which would break a subsequent call with the same data.
        item = dict(raw_item_data)
        try:
            item["description"] = item["descriptions"]["en"]["value"]
            del item["descriptions"]
        except KeyError:
            item["description"] = "No English description available"

        item["claims"] = claim_entities
        item["references"] = list(references)
        return item

    def handle_errors(self):
        """
        Translates an "error" in the response data into an error status
        before running the parent's error handling.
        """
        # The parent's content is used here, because the content of this class is a list of items
        content_type, data = super(WikiDataItems, self).content
        if data is not None and "error" in data:
            error_code = data["error"]["code"]
            self.set_error(self.ERROR_CODE_TO_STATUS[error_code])
        super(WikiDataItems, self).handle_errors()

    @property
    def content(self):
        """
        The entities of the response as a list of items.

        :return: A tuple with the format (content_type, items).
            When the parent yields no data the data is passed on unchanged instead.
        """
        content_type, data = super(WikiDataItems, self).content
        if data is None:
            # Same guard as handle_errors uses. There is nothing to convert.
            return content_type, data
        items = [
            self.get_item(raw_item)
            for raw_item in data.get("entities", {}).values()
        ]
        return content_type, items
Example #11
0
class WikipediaQuery(WikipediaAPI):
    """
    Abstract base for resources that use the ``query`` action of the Wikipedia API.

    Subclasses typically override ``PARAMETERS``, ``WIKI_RESULTS_KEY`` (the key under "query"
    that holds the results) and ``WIKI_QUERY_PARAM`` (the name of the parameter that carries
    the query value).
    """

    URI_TEMPLATE = 'https://{}.wikipedia.org/w/api.php?{}={}'

    PARAMETERS = override_dict(WikipediaAPI.PARAMETERS, {
        "action": "query",
        "redirects": "1",
    })
    GET_SCHEMA = {"args": {}, "kwargs": None}

    WIKI_RESULTS_KEY = "pages"
    WIKI_QUERY_PARAM = "titles"
    ERROR_MESSAGE = "We did not find the page you were looking for. Perhaps you should create it?"

    def send(self, method, *args, **kwargs):
        """
        Prepends the configured country and the query parameter name to the
        positional arguments before delegating to the parent's ``send``.

        :param method: passed through to the parent's ``send``
        :param args: positional arguments, these follow the two prepended values
        :param kwargs: passed through to the parent's ``send``
        :return: whatever the parent's ``send`` returns
        """
        args = (
            self.config.wiki_country,
            self.WIKI_QUERY_PARAM,
        ) + args
        return super(WikipediaQuery, self).send(method, *args, **kwargs)

    def handle_errors(self):
        """
        Runs the parent's error handling and then validates the "query" part of the response.

        :return: the results found under ``WIKI_RESULTS_KEY``,
            which allows overriding methods to inspect them without parsing the content again
        :raises DSInvalidResource: when the response holds no "query"
        :raises DGHttpError40X: with status 404 when the page is missing or the result list is empty
        """
        super(WikipediaQuery, self).handle_errors()

        # Check general response
        content_type, data = self.content
        if data is None or "query" not in data:
            raise DSInvalidResource(
                'Wrongly formatted Wikipedia response, missing "query"',
                resource=self)
        # Wiki has response hidden under single keyed dicts :(
        response = data['query'][self.WIKI_RESULTS_KEY]

        # When searching for pages a dictionary gets returned
        # TODO: parse the dictionary and possibly return partial content
        is_missing_page = (
            isinstance(response, dict) and
            "-1" in response and
            "missing" in response["-1"]
        )
        # When making lists a list is returned
        is_empty_list = isinstance(response, list) and not response
        if is_missing_page or is_empty_list:
            self.status = 404
            raise DGHttpError40X(self.ERROR_MESSAGE, resource=self)
        return response

    @property
    def content(self):
        """
        The parent's content with any "warnings" stripped from the data.

        :return: A tuple with the format (content_type, data).
        """
        content_type, data = super(WikipediaQuery, self).content
        # NOTE(review): data is assumed to be None when there is nothing to parse - confirm in WikipediaAPI
        if data is not None and "warnings" in data:
            del data["warnings"]
        return content_type, data

    def get_wikipedia_json(self):
        """
        Parses the body and returns what is stored under ``WIKI_RESULTS_KEY`` of the "query".
        """
        # TODO: remove this method and its uses when a partial content is possible
        response = json.loads(self.body)
        return response["query"][self.WIKI_RESULTS_KEY]

    def next_parameters(self):
        """
        The "continue" part of the response data.

        :return: (dict) the "continue" data or an empty dictionary when there is none
        """
        content_type, data = self.content
        if data is None:
            return {}
        return data.get("continue", {})

    class Meta:
        abstract = True