class WikipediaCategories(WikipediaQuery):
    """Query resource that requests the ``categories`` property of pages."""

    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "cllimit": 500,
            # gets set at runtime through "wiki_show_categories" config with "!hidden" by default
            "clshow": "",
            "prop": "categories",  # generator style: info|pageprops|categoryinfo
        }
    )

    def variables(self, *args):
        """
        Extend the inherited variables with a ``show`` entry.

        Only the first three positional arguments are forwarded to the parent
        implementation. A fourth argument, when present, becomes the ``show``
        value; otherwise the class level ``clshow`` parameter is used.

        :param args: positional arguments of the request
        :return: the parent's variables with the ``show`` key added
        """
        if len(args) > 3:
            show = args[3]
        else:
            show = self.PARAMETERS["clshow"]
        result = super(WikipediaCategories, self).variables(*args[:3])
        result["show"] = show
        return result

    def parameters(self, **kwargs):
        """
        Build the request parameters for this resource.

        :return: a copy of ``PARAMETERS`` where ``clshow`` holds the value of
            the ``wiki_show_categories`` configuration
        """
        return dict(self.PARAMETERS, clshow=self.config.wiki_show_categories)

    class Meta:
        verbose_name = "Wikipedia category"
        verbose_name_plural = "Wikipedia categories"
class WikipediaCategoryMembers(WikipediaGenerator):
    """Generator resource configured for the ``categorymembers`` generator."""

    # The generator input is passed through the "gcmtitle" query parameter
    WIKI_QUERY_PARAM = "gcmtitle"

    PARAMETERS = override_dict(
        WikipediaGenerator.PARAMETERS,
        {
            "generator": "categorymembers",
            "gcmlimit": 500,
            "gcmnamespace": 0,
            "prop": "info",
        }
    )
class WikipediaTransclusions(WikipediaGenerator):
    """Generator resource configured for the ``transcludedin`` generator."""

    PARAMETERS = override_dict(
        WikipediaGenerator.PARAMETERS,
        {
            "generator": "transcludedin",
            "gtishow": "!redirect",
            "gtilimit": 500,
        }
    )

    class Meta:
        verbose_name = "Wikipedia transclusions"
        verbose_name_plural = "Wikipedia transclusions"
class WikipediaRevisions(WikipediaPage):
    """Page resource that requests the ``revisions`` property."""

    PARAMETERS = override_dict(
        WikipediaPage.PARAMETERS,
        {
            "prop": "revisions",
            "rvlimit": 500,
            "rvprop": "content|user|timestamp",
            "rvdir": "older",
        }
    )

    class Meta:
        verbose_name = "Wikipedia revisions"
        verbose_name_plural = "Wikipedia revisions"
class WikipediaCategoryMembers(WikipediaGenerator):
    """
    Generator resource configured for the ``categorymembers`` generator.

    Besides page info it requests page props and the categories of each member.
    """

    # The generator input is passed through the "gcmtitle" query parameter
    WIKI_QUERY_PARAM = "gcmtitle"

    PARAMETERS = override_dict(
        WikipediaGenerator.PARAMETERS,
        {
            "generator": "categorymembers",
            "gcmlimit": 100,
            "gcmnamespace": 0,
            "prop": "info|pageprops|categories",
            "clshow": "!hidden",
            "cllimit": 500,
        }
    )

    class Meta:
        verbose_name = "Wikipedia category members"
        verbose_name_plural = "Wikipedia category members"
class WikipediaSearch(WikipediaQuery, WikipediaImagesMixin):
    """Query resource that looks up pages including extracts and categories."""

    URI_TEMPLATE = 'http://{}.wikipedia.org/w/api.php?{}={}'
    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "prop": "info|pageprops|extracts|categories",
            "exintro": 1,
            "clshow": "!hidden",
            "cllimit": 500,
        }
    )
    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [{"type": "string"}],
            "minItems": 3,
            "maxItems": 3,
        },
        "kwargs": None,
    }

    def parameters(self, **kwargs):
        """
        Build the request parameters for this resource.

        :return: a copy of ``PARAMETERS`` where ``exintro`` is 0 when the
            ``wiki_full_extracts`` configuration is truthy and 1 otherwise
        """
        result = copy(self.PARAMETERS)
        result["exintro"] = int(not self.config.wiki_full_extracts)
        return result

    def handle_errors(self):
        """
        Handle ambiguity errors

        Sets the status to 300 and raises ``DSHttpWarning300`` as soon as a
        returned page carries a "disambiguation" entry in its ``pageprops``.
        """
        response = super(WikipediaSearch, self).handle_errors()
        # NOTE(review): this relies on the parent handle_errors returning the
        # parsed results. Whenever it returns None the checks below are skipped.
        if not isinstance(response, dict):
            return
        for _page_id, page in six.iteritems(response):
            try:
                page_props = page['pageprops']
            except KeyError:
                # Pages without pageprops can not be flagged as disambiguation
                continue
            if "disambiguation" in page_props:
                self.status = 300
                raise DSHttpWarning300("The search is ambiguous.", resource=self)

    class Meta:
        verbose_name = "Wikipedia search"
        verbose_name_plural = "Wikipedia searches"
class WikipediaTranslate(WikipediaPage):
    """Page resource that targets wiktionary.org and requests interwiki links."""

    URI_TEMPLATE = 'http://{}.wiktionary.org/w/api.php?{}={}&iwprefix={}'  # updated at runtime
    PARAMETERS = override_dict(
        WikipediaPage.PARAMETERS,
        {
            'prop': 'info|pageprops|iwlinks',
            'iwprop': 'url',
        }
    )

    def handle_errors(self):
        """
        Run the inherited checks and treat a body without "iwlinks" as a 404.

        :raises DGHttpError40X: when "iwlinks" does not occur in the body
        """
        super(WikipediaTranslate, self).handle_errors()
        if "iwlinks" in self.body:
            return
        self.status = 404
        message = "No translations found for {} in {}".format(*self.meta)
        raise DGHttpError40X(message, resource=self)

    @property
    def meta(self):
        """
        The third and fourth request arguments as a tuple.

        :return: ``(args[2], args[3])`` of the request or ``(None, None)``
            when the request or these arguments are unavailable
        """
        try:
            request_args = self.request["args"]
            return request_args[2], request_args[3]
        except (KeyError, IndexError, TypeError):
            return None, None
class WikipediaListPages(WikipediaQuery, WikipediaImagesMixin):
    """Query resource that addresses pages through the ``pageids`` parameter."""

    # Pages are addressed through "pageids" instead of the inherited query parameter
    WIKI_QUERY_PARAM = "pageids"

    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "prop": "info|pageprops|categories",
            "clshow": "!hidden",
            "cllimit": 500,
        }
    )
    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [{"type": "string"}],  # TODO: validate with pattern?
            "minItems": 3,
            "maxItems": 3,
        },
        "kwargs": None,
    }

    class Meta:
        verbose_name = "Wikipedia list pages"
        verbose_name_plural = "Wikipedia list pages"
class WikipediaRecentChanges(WikipediaQuery):
    """Query resource that lists recent changes between two configured moments."""

    URI_TEMPLATE = 'http://{}.wikipedia.org/w/api.php?rcstart={}&rcend={}'
    PARAMETERS = override_dict(
        WikipediaQuery.PARAMETERS,
        {
            "list": "recentchanges",
            "rcnamespace": 0,
            "rcshow": "!bot|!minor|!redirect",
            "rclimit": 500,
            "rcprop": "ids|title|comment|timestamp|tags|user",
            "rcdir": "newer",
        }
    )
    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [{"type": "string"}, {"type": "integer"}],
            "minItems": 3,
            "maxItems": 3,
        },
        "kwargs": None,
    }
    CONFIG_NAMESPACE = "wikipedia"
    WIKI_RESULTS_KEY = "recentchanges"

    class Meta:
        verbose_name = "Wikipedia recent changes"
        verbose_name_plural = "Wikipedia recent changes"

    def send(self, method, *args, **kwargs):
        """
        Send the request with arguments taken from the configuration.

        Any positional arguments passed by the caller are discarded and
        replaced by ``wiki_country``, ``start_time`` and ``end_time`` from the
        configuration, where both times are cast to integers.

        :param method: forwarded untouched to the parent implementation
        :return: whatever the parent ``send`` returns
        """
        country = self.config.wiki_country
        start = int(self.config.start_time)
        end = int(self.config.end_time)
        # super(WikipediaQuery, self) starts the lookup after WikipediaQuery,
        # which means that WikipediaQuery.send does not get invoked here.
        return super(WikipediaQuery, self).send(method, country, start, end, **kwargs)
class WikiDataItems(WikipediaAPI):
    """Resource that fetches Wikidata entities and flattens their claims."""

    URI_TEMPLATE = "https://www.wikidata.org/w/api.php?ids={}"
    PARAMETERS = override_dict(
        WikipediaAPI.PARAMETERS,
        {
            "action": "wbgetentities",
            "languages": "en",
            "redirects": "yes",
            "props": "info|claims|descriptions",
        }
    )
    GET_SCHEMA = {
        "args": {
            "type": "array",
            "items": [{"type": "string"}],  # TODO: use a pattern?
        },
        "kwargs": None,
    }

    class Meta:
        verbose_name = "Wikidata items"
        verbose_name_plural = "Wikidata items"

    def get_entity(self, snak):
        """
        Turns Wikidata into a more readable entity data structure

        :param (dict) snak: a Wikidata specific data structure: https://www.mediawiki.org/wiki/Wikibase/DataModel#Snaks
        :return: A tuple with the format (entity, is_item).
            The entity is a dict with "property", "type" and "value" keys.
            The is_item flag is True when the snak has the "wikibase-item" datatype.
        """
        assert "property" in snak and "datatype" in snak and "snaktype" in snak, \
            "Wikidata snacs should have a property and datatype specified"
        is_item = snak["datatype"] == "wikibase-item"
        # Snaks of these types are read without touching "datavalue"
        if snak["snaktype"] in ("novalue", "somevalue"):
            return {
                "property": snak["property"],
                "type": snak["datatype"],
                "value": None
            }, is_item
        value = snak["datavalue"]["value"]
        if is_item:
            return {
                "property": snak["property"],
                "value": "Q{}".format(value["numeric-id"]),
                "type": value["entity-type"]
            }, True
        return {
            "property": snak["property"],
            "value": value,
            "type": snak["datatype"]
        }, False

    def get_item(self, raw_item_data):
        """
        Converts a raw Wikidata entity into a flattened item.

        The input does not get modified. The returned dict is a shallow copy
        of the input where "claims" is a flat list of entities, "references"
        is a list of unique "property:value" strings and "description" holds
        the English description or a fallback text.

        :param (dict) raw_item_data: an entity as found under "entities" in the response
        :return: (dict) the flattened item
        """
        raw_claims = [
            raw_claim
            for raw_claims_list in raw_item_data.get("claims", {}).values()
            for raw_claim in raw_claims_list
        ]
        claim_entities = []
        references = set()
        for raw_claim in raw_claims:
            claim_entity, _ = self.get_entity(raw_claim["mainsnak"])
            claim_entity["references"] = []
            for references_data in raw_claim.get("references", []):
                # We filter out the first item as the reference for all raw_references
                # This discards reference dates and reference URL data from the reference
                raw_references = [
                    raw_reference
                    for raw_references_list in references_data["snaks"].values()
                    for raw_reference in raw_references_list
                ]
                reference_entity = next(
                    (
                        entity
                        for entity, entity_is_item in map(self.get_entity, raw_references)
                        if entity_is_item
                    ),
                    None
                )
                if reference_entity is None:
                    continue
                reference = "{}:{}".format(reference_entity["property"], reference_entity["value"])
                claim_entity["references"].append(reference)
                references.add(reference)
            claim_entities.append(claim_entity)
        # Work on a shallow copy to keep the parsed response intact.
        # Replacing "claims" in place would break a second pass over the same data.
        item = dict(raw_item_data)
        try:
            item["description"] = item["descriptions"]["en"]["value"]
            del item["descriptions"]
        except KeyError:
            item["description"] = "No English description available"
        item["claims"] = claim_entities
        item["references"] = list(references)
        return item

    def handle_errors(self):
        """
        Translates an "error" in the response to a status before running the inherited checks.

        The parent's content is read directly, because the content property
        of this class returns a list of items instead of the raw response.
        """
        content_type, data = super(WikiDataItems, self).content
        if data is not None and "error" in data:
            error_code = data["error"]["code"]
            self.set_error(self.ERROR_CODE_TO_STATUS[error_code])
        super(WikiDataItems, self).handle_errors()

    @property
    def content(self):
        """
        The content type together with a list of flattened items.

        :return: a (content_type, items) tuple, where the data of the parent
            is passed through untouched when it is None
        """
        content_type, data = super(WikiDataItems, self).content
        if data is None:
            # Without data there are no entities to convert
            return content_type, data
        items = [
            self.get_item(raw_item)
            for raw_item in data.get("entities", {}).values()
        ]
        return content_type, items
class WikipediaQuery(WikipediaAPI):
    """Abstract base for resources that use ``action=query`` of the API."""

    URI_TEMPLATE = 'https://{}.wikipedia.org/w/api.php?{}={}'
    PARAMETERS = override_dict(
        WikipediaAPI.PARAMETERS,
        {
            "action": "query",
            "redirects": "1",
        }
    )
    GET_SCHEMA = {
        "args": {},
        "kwargs": None
    }

    WIKI_RESULTS_KEY = "pages"
    WIKI_QUERY_PARAM = "titles"
    ERROR_MESSAGE = "We did not find the page you were looking for. Perhaps you should create it?"

    def send(self, method, *args, **kwargs):
        """
        Send the request with the country and query parameter prepended.

        :param method: forwarded untouched to the parent implementation
        :param args: arguments that follow ``wiki_country`` and ``WIKI_QUERY_PARAM``
        :return: whatever the parent ``send`` returns
        """
        args = (self.config.wiki_country, self.WIKI_QUERY_PARAM,) + args
        return super(WikipediaQuery, self).send(method, *args, **kwargs)

    def handle_errors(self):
        """
        Validate the response and signal missing results as a 404.

        :return: the results found under ``WIKI_RESULTS_KEY`` inside "query",
            which allows subclasses to inspect them further
        :raises DSInvalidResource: when the response holds no "query" key
        :raises DGHttpError40X: when the page is missing or the list is empty
        """
        super(WikipediaQuery, self).handle_errors()

        # Check general response
        content_type, data = self.content
        if "query" not in data:
            raise DSInvalidResource('Wrongly formatted Wikipedia response, missing "query"', resource=self)
        response = data['query'][self.WIKI_RESULTS_KEY]  # Wiki has response hidden under single keyed dicts :(

        # When searching for pages a dictionary gets returned
        # TODO: parse the dictionary and possibly return partial content
        if isinstance(response, dict) and "-1" in response and "missing" in response["-1"]:
            self.status = 404
            raise DGHttpError40X(self.ERROR_MESSAGE, resource=self)
        # When making lists a list is returned
        elif isinstance(response, list) and not response:
            self.status = 404
            raise DGHttpError40X(self.ERROR_MESSAGE, resource=self)

        # Subclasses like WikipediaSearch inspect the return value,
        # so the validated results are handed back instead of None.
        return response

    @property
    def content(self):
        """
        The inherited content with the "warnings" key removed from the data.

        :return: a (content_type, data) tuple
        """
        content_type, data = super(WikipediaQuery, self).content
        if "warnings" in data:
            del data["warnings"]
        return content_type, data

    def get_wikipedia_json(self):
        """
        Parse the body and return the results found under ``WIKI_RESULTS_KEY``.

        :return: the value of ``["query"][WIKI_RESULTS_KEY]`` in the parsed body
        """
        # TODO: remove this method and its uses when a partial content is possible
        response = json.loads(self.body)
        return response["query"][self.WIKI_RESULTS_KEY]

    def next_parameters(self):
        """
        Return the "continue" value of the response data.

        :return: the value under "continue" or an empty dict when it is absent
        """
        content_type, data = self.content
        return data.get("continue", {})

    class Meta:
        abstract = True