class KulturnavBot(object):
    """Bot to enrich and create information on Wikidata from KulturNav info."""

    EDIT_SUMMARY = 'import using #Kulturnav data'
    KULTURNAV_ID_P = '1248'
    GEONAMES_ID_P = '1566'
    SWE_KOMMUNKOD_P = '525'
    SWE_COUNTYKOD_P = '507'
    PLACE_P = '276'
    TIME_P = '585'  # date
    DATASET_Q = None
    DISAMBIG_Q = '4167410'
    IS_A_P = '31'
    CATALOG_P = '972'
    DATASET_ID = None
    ENTITY_TYPE = None
    MAP_TAG = None
    COUNTRIES = []  # a list of country Q's
    ADMIN_UNITS = []  # a list of municipality+county Q's
    locations = {}  # a dict of uuid to wikidata location matches
    current_uuid = ''  # for debugging

    def __init__(self, dictGenerator, cache_max_age, verbose=False):
        """
        Initialise the bot.

        Arguments:
            * generator    - A generator that yields Dict objects.
        """
        self.generator = dictGenerator
        self.repo = pywikibot.Site().data_repository()
        self.cutoff = None
        self.verbose = verbose
        self.require_wikidata = True
        self.cache_max_age = cache_max_age

        # trigger wdq query
        self.itemIds = helpers.fill_cache(self.KULTURNAV_ID_P,
                                          cache_max_age=cache_max_age)

        # set up WikidataStuff instance
        self.wd = WD(self.repo, self.EDIT_SUMMARY)

        # load lists
        self.COUNTRIES = wdqsLookup.wdq_to_wdqs(u'TREE[6256][][31]')
        self.ADMIN_UNITS = wdqsLookup.wdq_to_wdqs(u'TREE[15284][][31]')

    @classmethod
    def set_variables(cls, dataset_q=None, dataset_id=None, entity_type=None,
                      map_tag=None, edit_summary=None):
        """Override any class variables.

        Used when command line arguments affect which type of run to do.

        @param dataset_q: the Q-id of the dataset
        @type dataset_q: str
        @param dataset_id: the uuid of the dataset
        @type dataset_id: str
        @param entity_type: the entity type to provide for the search API
        @type entity_type: str
        @param map_tag: the map_tag to use in the search API to find wikidata
            matches
        @type map_tag: str
        @param edit_summary: the edit_summary to use
        @type edit_summary: str
        """
        cls.DATASET_Q = dataset_q or cls.DATASET_Q
        cls.DATASET_ID = dataset_id or cls.DATASET_ID
        cls.ENTITY_TYPE = entity_type or cls.ENTITY_TYPE
        cls.MAP_TAG = map_tag or cls.MAP_TAG
        cls.EDIT_SUMMARY = edit_summary or cls.EDIT_SUMMARY

    def run(self):
        """Start the robot."""
        raise NotImplementedError("run() is not implemented in the base bot.")

    def runLayout(self, datasetRules, datasetProtoclaims, datasetSanityTest,
                  label, shuffle):
        """
        Execute the basic layout of a run.

        It should be called for a dataset-specific run which sets the
        parameters.

        param datasetRules: a dict of additional Rules or values to look for
        param datasetProtoclaims: a function for populating protoclaims
        param datasetSanityTest: a function which must return true for
            results to be written to Wikidata
        param label: the key in values to be used for label/alias.
            set to None to skip addNames()
        param shuffle: whether name/label/alias is shuffled or not
            i.e. if name = last, first
        """
        count = 0
        for hit in self.generator:
            # print count, self.cutoff
            if self.cutoff and count >= self.cutoff:
                break
            # some type of feedback
            if count % 100 == 0 and count > 0:
                pywikibot.output('%d entries handled...' % count)
            # Required rules/values to search for
            rules = {
                u'identifier': None,
                u'modified': None,
                u'seeAlso': None,
                u'sameAs': None,
                u'exactMatch': None,
                # not expected
                u'wikidata': None,
                u'libris-id': None,
                u'viaf-id': None,
                u'getty_aat': None,
                u'ulan': None
            }
            rules.update(datasetRules)

            # put together empty dict of values then populate
            values = dict.fromkeys(rules)

            if not self.populateValues(values, rules, hit):
                # continue with next hit if problem was encounterd
                continue

            # find the matching wikidata item
            hitItem = self.wikidataMatch(values)
            self.current_uuid = values['identifier']
            # @todo: self.current_protoclaims
            #        allows these to be accessed more easily

            # convert values to potential claims
            protoclaims = datasetProtoclaims(self, values)
            self.make_base_protoclaims(values, protoclaims)

            # output info for testing
            if self.verbose:
                pywikibot.output(values)
                pywikibot.output(protoclaims)
                pywikibot.output(hitItem)

            # Add information if a match was found
            if hitItem and hitItem.exists():
                # if redirect then get target instead

                # make sure it passes the sanityTests
                if not self.sanityTest(hitItem):
                    continue
                if not datasetSanityTest(self, hitItem):
                    continue

                # add name as label/alias
                if label is not None:
                    self.addNames(values[label], hitItem, shuffle=shuffle)

                # get the "last modified" timestamp and construct a Reference
                date = helpers.iso_to_WbTime(values[u'modified'])
                ref = self.make_ref(date)

                # add each property (if new) and source it
                self.addProperties(protoclaims, hitItem, ref)

            # allow for limited runs
            count += 1

        # done
        pywikibot.output(u'Handled %d entries' % count)

    def populateValues(self, values, rules, hit):
        """
        Populate values and check results given a hit.

        Given a list of values and a kulturnav hit, populate the values
        and check if result is problem free.

        @todo: raise Error instead of using problemFree solution

        param values: dict with keys and every value as None
        param rules: a dict with keys and values either:
            None: the exakt key is present in hit and its value is wanted
            a Rule: acording to the class above
        param hit: a kulturnav entry
        return bool problemFree
        """
        ids = {}
        problemFree = True
        # first pass: index all entries by their @id for viaId rules
        for entries in hit[u'@graph']:
            # populate ids for viaId rules
            if '@id' in entries.keys():
                if entries['@id'] in ids.keys():
                    pywikibot.output('Non-unique viaID key: \n%s\n%s'
                                     % (entries, ids[entries['@id']]))
                ids[entries['@id']] = entries
        # second pass: apply the rules to every graph entry
        for entries in hit[u'@graph']:
            # handle rules
            # .items() rather than .iteritems() works identically here and
            # keeps the code forward compatible
            for key, rule in rules.items():
                val = None
                if rule is None:
                    if key in entries.keys():
                        val = entries[key]
                elif isinstance(rule, Rule):
                    val = rule.resolve(entries, ids)

                # test and register found value
                if val is not None:
                    if values[key] is None:
                        values[key] = val
                    else:
                        pywikibot.output(u'duplicate entries for %s' % key)
                        problemFree = False

        # the minimum which must have been identified
        if values[u'identifier'] is None:
            raise pywikibot.Error(u'Could not isolate the identifier from the '
                                  u'KulturNav object! JSON layout must have '
                                  u'changed. Crashing!')

        # dig into sameAs/exactMatch and seeAlso
        KulturnavBot.set_sameas_values(values)

        # only look at seeAlso if we found no Wikidata link and require one
        if self.require_wikidata and \
                (not values[u'wikidata'] and values[u'seeAlso']):
            values[u'seeAlso'] = helpers.listify(values[u'seeAlso'])
            for sa in values[u'seeAlso']:
                if u'wikipedia' in sa:
                    pywikibot.output(u'Found a Wikipedia link but no '
                                     u'Wikidata link: %s %s' %
                                     (sa, values[u'identifier']))
                    problemFree = False

        if not problemFree:
            pywikibot.output(u'Found an issue with %s (%s), skipping'
                             % (values['identifier'], values['wikidata']))
        return problemFree

    def sanityTest(self, hitItem):
        """
        Execute generic sanitytest which should be run independent on dataset.

        return bool
        """
        return self.withoutClaimTest(hitItem,
                                     self.IS_A_P,
                                     self.DISAMBIG_Q,
                                     u'disambiguation page')

    def withoutClaimTest(self, hitItem, P, Q, descr):
        """
        Execute base test that an item does not contain a particular claim.

        param hitItem: item to check
        param P: the property to look for
        param Q: the Q claim to look for
        param descr: a descriptive text
        return bool
        """
        P = u'P%s' % P.lstrip('P')
        testItem = self.wd.QtoItemPage(Q)
        if self.wd.has_claim(P, testItem, hitItem):
            pywikibot.output(u'%s is matched to %s, '
                             u'FIXIT' % (hitItem.title(), descr))
            return False
        else:
            return True

    def withClaimTest(self, hitItem, P, Q, descr, orNone=True):
        """
        Execute base test that an item contains a certain claim.

        param hitItem: item to check
        param P: the property to look for
        param Q: (list) of Q claim to look for
        param descr: a descriptive text
        param orNone: if complete absence of the Property is also ok
        return bool
        """
        P = u'P%s' % P.lstrip('P')
        Q = helpers.listify(Q)
        testItems = []
        for q in Q:
            testItems.append(self.wd.QtoItemPage(q))
        # check claims
        if P in hitItem.claims.keys():
            for testItem in testItems:
                if self.wd.has_claim(P, testItem, hitItem):
                    return True
            else:
                pywikibot.output(u'%s is identified as something other '
                                 u'than a %s. Check!' %
                                 (hitItem.title(), descr))
                return False
        elif orNone:  # no P claim
            return True
        # P absent and absence not accepted; previously fell through and
        # returned None - made explicit to honour the documented bool return
        return False

    @staticmethod
    def set_sameas_values(values):
        """Isolate external identifiers through sameAs and exactMatch.

        @param values: All extracted values
        @type values: dict
        """
        # merge sameAs and exactMatch
        match = helpers.bundle_values(
            [values[u'sameAs'], values[u'exactMatch']]) or []

        # dig into sameAs/exactMatch and seeAlso
        for sa in match:
            if u'wikidata' in sa:
                values[u'wikidata'] = sa.split('/')[-1]
            elif u'libris-id' in values.keys() and \
                    u'libris.kb.se/auth/' in sa:
                values[u'libris-id'] = sa.split('/')[-1]
            elif u'viaf-id' in values.keys() and \
                    u'viaf.org/viaf/' in sa:
                values[u'viaf-id'] = sa.split('/')[-1]
            elif u'getty_aat' in values.keys() and \
                    u'vocab.getty.edu/aat/' in sa:
                values[u'getty_aat'] = sa.split('/')[-1]
            elif u'ulan' in values.keys() and \
                    u'vocab.getty.edu/ulan/' in sa:
                values[u'ulan'] = sa.split('/')[-1]

    def make_base_protoclaims(self, values, protoclaims):
        """Construct the protoclaims common for all KulturnavBots.

        Adds the claim to the protoclaims dict.

        @param values: the values extracted using the rules
        @type values: dict
        @param protoclaims: the dict of claims to add
        @type protoclaims: dict
        """
        # kulturnav protoclaim incl. qualifier
        protoclaims[u'P%s' % self.KULTURNAV_ID_P] = \
            WD.Statement(values[u'identifier']).addQualifier(
                WD.Qualifier(
                    P=self.CATALOG_P,
                    itis=self.wd.QtoItemPage(self.DATASET_Q)),
                force=True)

        # authority control protoclaims
        if values.get(u'libris-id'):
            protoclaims[u'P906'] = WD.Statement(values[u'libris-id'])
        if values.get(u'viaf-id'):
            protoclaims[u'P214'] = WD.Statement(values[u'viaf-id'])
        if values.get(u'getty_aat'):
            protoclaims[u'P1014'] = WD.Statement(values[u'getty_aat'])
        if values.get(u'ulan'):
            protoclaims[u'P245'] = WD.Statement(values[u'ulan'])

    def wikidataMatch(self, values):
        """
        Find the matching wikidata item.

        Checks Wikidata first, then kulturNav.

        return ItemPage|None the matching item
        """
        if values[u'identifier'] in self.itemIds:
            hitItemTitle = u'Q%s' % \
                self.itemIds.get(values[u'identifier'])

            if not values[u'wikidata'] and not self.require_wikidata:
                # i.e. uuid has been supplied manually and exists on wikidata
                pass
            elif values[u'wikidata'] != hitItemTitle:
                # this may be caused by either being a redirect
                wd = self.wd.QtoItemPage(values[u'wikidata'])
                wi = self.wd.QtoItemPage(hitItemTitle)
                if wd.isRedirectPage() and wd.getRedirectTarget() == wi:
                    pass
                elif wi.isRedirectPage() and wi.getRedirectTarget() == wd:
                    pass
                else:
                    pywikibot.output(
                        u'Identifier missmatch (skipping): '
                        u'%s, %s, %s' % (values[u'identifier'],
                                         values[u'wikidata'],
                                         hitItemTitle))
                    return None
        elif values[u'wikidata']:
            hitItemTitle = values[u'wikidata']
        else:
            # no match found
            return None

        # create ItemPage, bypassing any redirect
        hitItem = self.wd.bypassRedirect(
            self.wd.QtoItemPage(hitItemTitle))
        # in case of redirect
        values[u'wikidata'] = hitItem.title()

        return hitItem

    def addNames(self, names, hitItem, shuffle=False):
        """
        Prepare a nameObj or a list of such for add_label_or_alias().

        param shuffle: bool if name order is last, first then this
            creates a local rearranged copy
        """
        if names:
            if shuffle:
                namelist = []
                if isinstance(names, dict):
                    s = KulturnavBot.shuffle_names(names)
                    if s is not None:
                        namelist.append(s)
                elif isinstance(names, list):
                    for n in names:
                        s = KulturnavBot.shuffle_names(n)
                        if s is not None:
                            namelist.append(s)
                else:
                    pywikibot.output(u'unexpectedly formatted name'
                                     u'object: %s' % names)
                if namelist:
                    self.add_label_or_alias(namelist, hitItem)
            else:
                self.add_label_or_alias(names, hitItem)

    def addProperties(self, protoclaims, hitItem, ref):
        """
        Add each property (if new) and source it.

        param protoclaims: a dict of claims with a
            key: Prop number
            val: Statement|list of Statments
        param hititem: the target entity
        param ref: WD.Reference
        """
        # .items() rather than .iteritems(); identical behaviour, forward
        # compatible
        for pcprop, pcvalue in protoclaims.items():
            if pcvalue:
                if isinstance(pcvalue, list):
                    pcvalue = set(pcvalue)  # eliminate potential duplicates
                    for val in pcvalue:
                        # check if None or a Statement(None)
                        if (val is not None) and (not val.isNone()):
                            self.wd.addNewClaim(pcprop, val, hitItem, ref)
                            # reload item so that next call is aware of changes
                            hitItem = self.wd.QtoItemPage(hitItem.title())
                            hitItem.exists()
                elif not pcvalue.isNone():
                    self.wd.addNewClaim(pcprop, pcvalue, hitItem, ref)

    # KulturNav specific functions
    def dbpedia2Wikidata(self, item):
        """
        Convert dbpedia reference to the equivalent Wikidata item, if present.

        param item: dict with @language, @value keys
        return pywikibot.ItemPage|None
        """
        if KulturnavBot.foobar(item):
            return
        if not all(x in item.keys() for x in (u'@value', u'@language')):
            pywikibot.output(u'invalid dbpedia entry: %s' % item)
            exit(1)

        # look up the corresponding Wikipedia page on the language wiki
        site = pywikibot.Site(item[u'@language'], 'wikipedia')
        page = pywikibot.Page(site, item[u'@value'])
        # cache the (network backed) properties() call instead of calling it
        # twice
        props = page.properties()
        if props.get(u'wikibase_item'):
            qNo = props[u'wikibase_item']
            return self.wd.QtoItemPage(qNo)

    def db_gender(self, value):
        """Match gender values to items.

        Note that this returns a Statment unlike most other functions

        @param value: The gender value
        @type value: str
        @return: The gender item as a statement
        @rtype: WD.Statement or None
        """
        known = {u'male': u'Q6581097',
                 u'female': u'Q6581072',
                 u'unknown': u'somevalue'}  # a special case
        if value not in known.keys():
            pywikibot.output(u'invalid gender entry: %s' % value)
            return

        if known[value] in (u'somevalue', u'novalue'):
            return WD.Statement(
                known[value],
                special=True)
        else:
            return WD.Statement(
                self.wd.QtoItemPage(known[value]))

    def db_name(self, name_obj, typ, limit=75):
        """Check if there is an item matching the name.

        A wrapper for helpers.match_name() to send it the relevant part of a
        nameObj.

        @param nameObj: {'@language': 'xx', '@value': 'xxx'}
        @type nameObj: dict
        @param typ: The name type (either 'lastName' or 'firstName')
        @type typ: str
        @param limit: Number of hits before skipping (defaults to 75,
            ignored if onLabs)
        @type limit: int
        @return: A matching item, if any
        @rtype: pywikibot.ItemPage, or None
        """
        return helpers.match_name(
            name_obj['@value'], typ, self.wd, limit=limit)

    def location2Wikidata(self, uuid):
        """
        Get location from kulturNav uuid.

        Given a kulturNav uuid or url this checks if that contains a
        GeoNames url and, if so, connects that to a Wikidata object
        using the GEONAMES_ID_P property (if any).

        NOTE that the WDQ results may be outdated
        return pywikibot.ItemPage|None
        """
        # Check if uuid
        if not self.is_uuid(uuid):
            return None
        # Convert url to uuid
        if uuid.startswith(u'http://kulturnav.org'):
            uuid = uuid.split('/')[-1]
        # Check if already stored
        if uuid in self.locations.keys():
            if self.locations[uuid] is None:
                return None
            else:
                qNo = u'Q%d' % self.locations[uuid]
                return self.wd.QtoItemPage(qNo)

        # retrieve various sources
        # @todo: this can be more streamlined by including wdq query for
        #        geonames in that method. Possibly sharing the same
        #        "look-up and filter" mechanism for both.
        #        and then using self.locations[uuid] = self.extract...
        #        (which returns qid or None)
        #        then (after both have been processed)
        #        checking self.locations.get(uuid) before
        #        making an ItemPage
        # @todo: change self.locations and self.ADMIN_UNITS to include Q
        #        prefix (and thus have the methods return that)
        geo_sources = self.get_geo_sources(uuid)
        kulturarvsdata = self.extract_kulturarvsdata_location(geo_sources)
        if kulturarvsdata:
            self.locations[uuid] = kulturarvsdata
            qNo = u'Q%d' % self.locations[uuid]
            return self.wd.QtoItemPage(qNo)

        # retrieve hit through geonames-lookup
        geonames = KulturnavBot.extract_geonames(geo_sources)
        if geonames:
            # store as a resolved hit, in case wdq yields nothing
            self.locations[uuid] = None
            wdqQuery = u'STRING[%s:"%s"]' % (self.GEONAMES_ID_P, geonames)
            wdqResult = wdqsLookup.wdq_to_wdqs(wdqQuery)
            if wdqResult and len(wdqResult) == 1:
                self.locations[uuid] = wdqResult[0]
                qNo = u'Q%d' % self.locations[uuid]
                return self.wd.QtoItemPage(qNo)
            # else:
            # go to geonames and find wikidata from there
            # add to self.locations[uuid]
            # add GEONAMES_ID_P to the identified wikidata

        # no (clean) hits
        return None

    def get_geo_sources(self, uuid):
        """Extract any geosources from a kulturNav uuid.

        Given a kulturNav uuid return the corresponding properties of
        that target which are likely to contain geosources.

        @param uuid: uuid to check
        @type uuid: str
        @return: the matching properties
        @rtyp: list of dicts
        """
        # debugging
        if not self.is_uuid(uuid):
            return []

        query_url = 'http://kulturnav.org/api/%s'
        # close the connection explicitly; urllib2 responses are not context
        # managers in py2
        response = urllib2.urlopen(query_url % uuid)
        try:
            json_data = json.load(response)
        finally:
            response.close()
        sources = []
        if json_data.get(u'properties'):
            same_as = json_data.get('properties').get('entity.sameAs')
            if same_as:
                sources += same_as
            source_uri = json_data.get('properties') \
                                  .get('superconcept.sourceUri')
            if source_uri:
                sources += source_uri
        return sources

    @staticmethod
    def extract_geonames(sources):
        """Return any geonames ID given a list of get_geo_sources().

        @param sources: output of get_geo_sources()
        @type sources: list of dicts
        @return: geonames id
        @rtype: str or None
        """
        needle = 'http://sws.geonames.org/'
        for s in sources:
            if s.get('value') and s.get('value').startswith(needle):
                return s.get('value').split('/')[-1]
        return None

    def extract_kulturarvsdata_location(self, sources):
        """Return any qids matching kulturarvsdata geo authorities.

        @param sources: output of get_geo_sources()
        @type sources: list of dicts
        @return: the matching qid (without Q-prefix)
        @rtype: str or None
        @raises pywikibot.Error
        """
        needle = u'http://kulturarvsdata.se/resurser/aukt/geo/'
        for s in sources:
            if s.get('value') and s.get('value').startswith(needle):
                s = s.get('value').split('/')[-1]
                wdq_query = None
                if s.startswith('municipality#'):
                    code = s.split('#')[-1]
                    wdq_query = u'STRING[%s:"%s"]' % (self.SWE_KOMMUNKOD_P,
                                                     code)
                elif s.startswith('county#'):
                    code = s.split('#')[-1]
                    wdq_query = u'STRING[%s:"%s"]' % (self.SWE_COUNTYKOD_P,
                                                     code)
                elif s.startswith('country#'):
                    pass  # handle via geonames instead
                elif s.startswith('parish#'):
                    pass  # no id's in wikidata
                else:
                    raise pywikibot.Error(u'Unhandled KulturarvsdataLocation '
                                          u'prefix: %s' % s)

                if wdq_query:
                    # only here if a municipality or county was found
                    wdq_result = wdqsLookup.wdq_to_wdqs(wdq_query)
                    if wdq_result and len(wdq_result) == 1:
                        self.ADMIN_UNITS.append(wdq_result[0])
                        return wdq_result[0]
        return None

    def getLocationProperty(self, item, strict=True):
        """
        Return appropriate location property for an item.

        Given an ItemPage this returns the suitable property which
        should be used to indicate its location.
        P17  - land
        P131 - within administrative unit
        P276 - place

        param item: pywikibot.ItemPage|None
        param strict: bool whether place should be returned if no land
            or admin_unit hit
        return string|None
        """
        if item is not None:
            q = int(item.title()[1:])
            if q in self.COUNTRIES:
                return u'P17'
            elif q in self.ADMIN_UNITS:
                return u'P131'
            elif not strict:
                return u'P%s' % self.PLACE_P
            elif self.verbose:
                item.exists()
                pywikibot.output(u'Could not set location property for: '
                                 u'%s (%s)' % (item.title(),
                                               item.labels.get('sv')))
        return None

    def kulturnav2Wikidata(self, uuid):
        """Return Wikidata entity connected to a kulturNav uid or url.

        Relies on the KULTURNAV_ID_P property (if any) to get the connection.

        NOTE that the WDQ results may be outdated
        @param uuid: a kulturNav uuid or url
        @type uuid: str
        @return: the matching Wikidata item page
        @rtype: pywikibot.ItemPage or None
        """
        # debugging
        if not self.is_uuid(uuid):
            return None
        # Convert url to uuid
        if uuid.startswith(u'http://kulturnav.org'):
            uuid = uuid.split('/')[-1]

        if uuid in self.itemIds.keys():
            qNo = u'Q%d' % self.itemIds[uuid]
            return self.wd.QtoItemPage(qNo)
        else:
            return None

    def is_uuid(self, uuid):
        """Test if a string really is a uuid.

        @param uuid: uuid to test
        @type uuid: str
        @return: whether the test passed
        @rtype: bool
        """
        if not helpers.is_str(uuid):
            pywikibot.output(u'Not an uuid in %s: %s' %
                             (self.current_uuid, uuid))
            return False

        uuid = uuid.split('/')[-1]  # in case of url
        pattern = r'[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}' \
                  r'\-[0-9a-f]{4}\-[0-9a-f]{12}'
        m = re.search(pattern, uuid)
        if not m or m.group(0) != uuid:
            pywikibot.output(u'Not an uuid in %s: %s' %
                             (self.current_uuid, uuid))
            return False

        return True

    @staticmethod
    def shuffle_names(name_obj):
        """Detect a "Last, First" string and return as "First Last".

        A wrapper for helpers.reorder_names() to send it the relevant part of
        a name_obj.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
        @type name_obj: dict
        @return: the reordered name_obj or None if reorder_names failed
        @rtype: dict or None
        """
        name = helpers.reorder_names(name_obj['@value'])
        if name is None:
            return None
        name_obj = name_obj.copy()
        name_obj['@value'] = name
        return name_obj

    def make_ref(self, date):
        """Make a correctly formatted ref object for claims.

        Contains 4 parts:
        * P248: Stated in <the kulturnav dataset>
        * P577: Publication date <from the document>
        * P854: Reference url <using the current uuid>
        * P813: Retrieval date <current date>

        P854
        Should be in source_test (after retroactively fixing older references)
        but by being in source_notest we ensure that duplicate uuids don't
        source the statement twice.

        @param date: The "last modified" time of the document
        @type date: pywikibot.WbTime
        @return: the formated reference
        @rtype WD.Reference
        """
        reference_url = 'http://kulturnav.org/%s' % self.current_uuid
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(
                'P248',
                self.wd.QtoItemPage(self.DATASET_Q)),
            source_notest=[
                self.wd.make_simple_claim(
                    'P577',
                    date),
                self.wd.make_simple_claim(
                    'P854',
                    reference_url),
                self.wd.make_simple_claim(
                    'P813',
                    helpers.today_as_WbTime())
            ])
        return ref

    def add_label_or_alias(self, name_obj, item, case_sensitive=False):
        """Add a name as either a label (if none already) or an alias.

        Essentially a filter for the more generic method in WikidatStuff.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
            or a list of such
        @type name_obj: dict or list of dict
        @param item: the item to which the label/alias should be added
        @type item: pywikibot.ItemPage
        @param caseSensitive: whether the comparison is case sensitive
        @type caseSensitive: bool
        """
        # for a list of entries
        if isinstance(name_obj, list):
            for n in name_obj:
                self.add_label_or_alias(n, item,
                                        case_sensitive=case_sensitive)
                # reload item so that next call is aware of any changes
                item = self.wd.QtoItemPage(item.title())
                item.exists()
            return

        # for a single entry
        self.wd.addLabelOrAlias(name_obj['@language'], name_obj['@value'],
                                item, caseSensitive=case_sensitive)

    @staticmethod
    def get_kulturnav_generator(uuids, delay=0):
        """Generate KulturNav items from a list of uuids.

        @param uuids: uuids to request items for
        @type uuids: list of str
        @param delay: delay in seconds between each kulturnav request
        @type delay: int
        @yield: dict
        """
        for uuid in uuids:
            time.sleep(delay)
            try:
                json_data = KulturnavBot.get_single_entry(uuid)
            except pywikibot.Error as e:
                pywikibot.output(e)
            else:
                yield json_data

    @classmethod
    def get_search_results(cls, max_hits=250, require_wikidata=True):
        """Make a KulturNav search for all items of a given type in a dataset.

        @param max_hits: the maximum number of results to request at once
        @type max_hits: int
        @param require_wikidata: whether to filter results on having a
            wikidata url in sameAs
        @type require_wikidata: bool
        @return: the resulting uuids
        @rtype: list of str
        """
        search_url = 'http://kulturnav.org/api/search/' + \
                     'entityType:%s,' % cls.ENTITY_TYPE + \
                     'entity.dataset_r:%s' % cls.DATASET_ID
        q = None  # the map_tag query

        # only filter on MAP_TAG if filtering on wikidata
        if require_wikidata:
            search_url += ',%s' % cls.MAP_TAG + ':%s/%d/%d'
            q = '*%2F%2Fwww.wikidata.org%2Fentity%2FQ*'
        else:
            search_url += '/%d/%d'

        # start search
        results = []
        offset = 0
        overview_page = KulturnavBot.get_single_search_results(
            search_url, q, offset, max_hits)
        while overview_page:
            for item in overview_page:
                uuid = item[u'uuid']
                if not require_wikidata or \
                        KulturnavBot.has_wikidata_in_sameas(item,
                                                            cls.MAP_TAG):
                    results.append(uuid)

            # continue
            offset += max_hits
            overview_page = KulturnavBot.get_single_search_results(
                search_url, q, offset, max_hits)

        # some feedback
        pywikibot.output(u'Found %d matching entries in Kulturnav'
                         % len(results))
        return results

    @staticmethod
    def has_wikidata_in_sameas(item, map_tag):
        """Check if a wikidata url is present in the sameAs property.

        @param item: the search item to check
        @type item: dict
        @param map_tag: the tag to use (concepts don't use sameAs)
        @type map_tag: str
        @rtype: bool
        """
        # The patterns used if we filter on wikidata
        patterns = (u'http://www.wikidata.org/entity/',
                    u'https://www.wikidata.org/entity/')

        same_as = item[u'properties'][map_tag[:map_tag.rfind('_')]]
        for s in same_as:
            if s[u'value'].startswith(patterns):
                return True
        return False

    @staticmethod
    def get_single_search_results(search_url, q, offset, max_hits):
        """Retrieve the results from a single API search.

        @param search_url: basic url from whih to build search
        @type search_url: str
        @param q: the map_tag query, if any
        @type q: str or None
        @param offset: the offset in search results
        @type offset: int
        @param max_hits: the maximum number of results to request at once
        @type max_hits: int
        @return: the search result object
        @rtype: dict
        """
        if q is None:
            actual_url = search_url % (offset, max_hits)
        else:
            actual_url = search_url % (q, offset, max_hits)

        # close the response explicitly to avoid leaking the connection
        search_page = urllib2.urlopen(actual_url)
        try:
            return json.loads(search_page.read())
        finally:
            search_page.close()

    @staticmethod
    def get_single_entry(uuid):
        """Retrieve the data on a single kulturnav entry.

        Raises an pywikibot.Error if:
        * @graph is not a key in the json response
        * a non-json response is received

        @param uuid: the uuid for the target item
        @type uuid: str
        @return: the entry object
        @rtype: dict
        @raise: pywikibot.Error
        """
        query_url = 'http://kulturnav.org/%s?format=application/ld%%2Bjson'
        item_url = query_url % uuid
        try:
            # close the response explicitly to avoid leaking the connection
            record_page = urllib2.urlopen(item_url)
            try:
                json_data = json.loads(record_page.read())
            finally:
                record_page.close()
        except ValueError as e:
            raise pywikibot.Error('Error loading KulturNav item at '
                                  '%s with error %s' % (item_url, e))
        if json_data.get(u'@graph'):
            return json_data
        else:
            raise pywikibot.Error('No @graph in KulturNav reply at '
                                  '%s\n data: %s' % (item_url, json_data))

    @classmethod
    def main(cls, *args):
        """Start the bot from the command line."""
        options = cls.handle_args(args)

        search_results = cls.get_search_results(
            max_hits=options['max_hits'],
            require_wikidata=options['require_wikidata'])
        kulturnav_generator = cls.get_kulturnav_generator(
            search_results, delay=options['delay'])

        kulturnav_bot = cls(kulturnav_generator, options['cache_max_age'])
        kulturnav_bot.cutoff = options['cutoff']
        kulturnav_bot.require_wikidata = options['require_wikidata']
        kulturnav_bot.run()

    @classmethod
    def run_from_list(cls, uuids, *args):
        """Start the bot with a list of uuids."""
        options = cls.handle_args(args)

        kulturnav_generator = cls.get_kulturnav_generator(
            uuids, delay=options['delay'])
        kulturnav_bot = cls(kulturnav_generator, options['cache_max_age'])
        kulturnav_bot.cutoff = options['cutoff']
        kulturnav_bot.require_wikidata = False
        kulturnav_bot.run()

    @staticmethod
    def handle_args(args):
        """Parse and load all of the basic arguments.

        Also passes any needed arguments on to pywikibot and sets any
        defaults.

        @param args: arguments to be handled
        @type args: list of strings
        @return: list of options
        @rtype: dict
        """
        options = {
            'cutoff': None,
            'max_hits': 250,
            'delay': 0,
            'require_wikidata': True,
            'cache_max_age': 0,
        }

        for arg in pywikibot.handle_args(args):
            option, sep, value = arg.partition(':')
            if option == '-cutoff':
                options['cutoff'] = int(value)
            elif option == '-max_hits':
                options['max_hits'] = int(value)
            elif option == '-delay':
                options['delay'] = int(value)
            elif option == '-any_item':
                options['require_wikidata'] = False
            elif option == '-wdq_cache':
                options['cache_max_age'] = int(value)

        return options

    @staticmethod
    def foobar(item):
        """Badly named escape mechanism for list results."""
        if isinstance(item, list):
            pywikibot.output(FOO_BAR)
            return True
        return False
class ImporterBot(object):
    """Bot to enrich/create info on Wikidata for Australian heritage items."""

    def __init__(self, base_path, new=False, cutoff=None, preview_file=None):
        """
        Initialise the ImporterBot.

        :param base_path: path to the output directory
        :param new: whether to also create new items
        :param cutoff: the number of items to process before stopping. None
            being interpreted as all.
        :param preview_file: run in demo mode (create previews rather than
            live edits) and output the result to this file.
        """
        self.repo = pywikibot.Site().data_repository()
        self.wd = WdS(self.repo, EDIT_SUMMARY)
        self.new = new
        self.cutoff = cutoff
        if preview_file:
            self.demo = True
            self.preview_file = path.join(base_path, preview_file)
        else:
            self.demo = False
        self.preview_data = []

        self.set_references()
        self.place_id_p = 'P3008'  # unique identifier property
        self.country = self.wd.QtoItemPage('Q408')
        self.states = self.make_states_map()
        self.settlements = self.make_settlements_map()
        self.hectares = self.wd.QtoItemPage(helpers.get_unit_q('ha'))
        self.make_status_and_instance_map()
        self.place_id_items = helpers.fill_cache_wdqs(self.place_id_p,
                                                      no_strip=True)

    def set_references(self):
        """Set the reference and coordinate-reference objects needed.

        One reference per heritage list (national/commonwealth), each for
        general claims and for coordinates.
        """
        self.ref = {
            'national': self.make_url_ref(
                'http://data.gov.au/dataset/2016-soe-her-aus-national-heritage',  # noqa
                '2017-07-21', '2017-06-07'),
            'commonwealth': self.make_url_ref(
                'http://data.gov.au/dataset/commonwealth-heritage-list',
                '2017-07-21', '2017-05-31')
        }
        self.coord_ref = {
            'national': self.make_url_ref(
                'http://www.environment.gov.au/heritage/places/national-heritage-list',  # noqa
                '2017-08-13'),
            'commonwealth': self.make_url_ref(
                'https://data.gov.au/dataset/57720684-4948-45db-a2c8-37259d531d87',  # noqa
                '2017-08-13', '2017-07-10')
        }

    def make_status_and_instance_map(self):
        """Construct mapping for cultural heritage status and instance type."""
        self.status = {
            'national': self.wd.QtoItemPage('Q20747146'),
            'commonwealth': self.wd.QtoItemPage('Q30108476')
        }
        self.instance_type = {
            'indigenous': self.wd.QtoItemPage('Q38048771'),
            'historic': self.wd.QtoItemPage('Q38048707'),
            'natural': self.wd.QtoItemPage('Q38048753')
        }

    def make_settlements_map(self):
        """Retrieve Australian settlements with state/territory connection.

        Queries WDQS for every human settlement in Australia together with
        the state/territory it lies in. Since settlement labels are not
        unique the result maps label -> list of {state, qid} candidates.
        """
        sparql = (
            "SELECT DISTINCT ?city ?cityLabel ?admin ?adminLabel "
            "WHERE "
            "{ "
            "?city wdt:P31/wdt:P279* wd:Q486972 . "
            "?city wdt:P17 wd:Q408 . "
            "?city wdt:P131* ?admin . "
            "{ ?admin wdt:P31 wd:Q5852411 . }"
            "UNION"
            "{ ?admin wdt:P31 wd:Q14192252 . }"
            "UNION"
            "{ ?admin wdt:P31 wd:Q14192199 . }"
            'SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }'  # noqa
            "}")
        data = wdqs.make_simple_wdqs_query(sparql)
        settlements = dict()
        for d in data:
            # entity urls are stripped down to plain Q-ids
            state_qid = d['admin'].split('/')[-1]
            city_qid = d['city'].split('/')[-1]
            city_name = d['cityLabel']
            if city_name not in settlements:
                settlements[city_name] = []
            settlements[city_name].append({
                'state': state_qid,
                'qid': city_qid
            })
        return settlements

    def make_states_map(self):
        """
        Retrieve the state/territory mappings from Wikidata.

        Also tries to match items for the EXT and OS codes.
        """
        sparql = ("SELECT ?item ?iso "
                  "WHERE "
                  "{ "
                  "?item wdt:P300 ?value . "
                  "?item wdt:P17 wd:Q408 . "
                  "BIND(REPLACE(?value, 'AU-', '', 'i') AS ?iso) "
                  "}")
        data = wdqs.make_select_wdqs_query(sparql, 'item', 'iso')
        states = dict()
        for k, v in data.items():
            states[v] = self.wd.QtoItemPage(k)

        # external territories (random hits mapped)
        # keys are '|'-separated alternatives matched against address suffixes
        states['EXT'] = {
            'Ashmore and Cartier Islands': self.wd.QtoItemPage('Q133888'),
            "Australian Antarctic Territory|Dumont D'Urville Station|Mawson Station": self.wd.QtoItemPage('Q178994'),  # noqa
            'Christmas Island|Settlement|Drumsite|Poon Saan': self.wd.QtoItemPage('Q31063'),  # noqa
            'Cocos (Keeling) Islands': self.wd.QtoItemPage('Q36004'),
            'Coral Sea Islands': self.wd.QtoItemPage('Q172216'),
            'Heard and McDonald Islands': self.wd.QtoItemPage('Q131198'),
            'Jervis Bay Territory': self.wd.QtoItemPage('Q15577'),
            'Norfolk Island|Kingston|Longridge|Burnt Pine|Middlegate': self.wd.QtoItemPage('Q31057')  # noqa
        }
        # OS other state?
        states['OS'] = {
            'United Kingdom': self.wd.QtoItemPage('Q145'),
            'USA': self.wd.QtoItemPage('Q30')
        }
        return states

    def make_url_ref(self, url, fetch_date, publish_date=None):
        """Make a Reference object for a url.

        Contains 3 parts:
        * P813: Retrieval date
        * P577: Publication date <from creation date of the document>
        * P854: Reference url <using the input url>

        :param url: the source url
        :param fetch_date: the retrieval date (iso)
        :param publish_date: the publication date (iso), if known
        :return: WdS.Reference
        """
        date_claims = []
        if publish_date:
            date_claims.append(
                self.wd.make_simple_claim('P577',
                                          helpers.iso_to_WbTime(publish_date)))
        date_claims.append(
            self.wd.make_simple_claim('P813',
                                      helpers.iso_to_WbTime(fetch_date)))
        ref = WdS.Reference(
            source_test=[self.wd.make_simple_claim('P854', url)],
            source_notest=date_claims)
        return ref

    def output_previews(self):
        """Output any PreviewItems to the preview_file."""
        with open(self.preview_file, 'w', encoding='utf-8') as f:
            for preview in self.preview_data:
                f.write(preview.make_preview_page())
                f.write('--------------\n\n')
        pywikibot.output('Created "{}" for previews'.format(self.preview_file))

    def process_all_objects(self, data):
        """
        Handle all the Australian heritage objects.

        Only increments counter when an object is updated.

        :param data: dict of all the heritage objects.
        """
        count = 0
        for place_id, entry_data in data.items():
            if self.cutoff and count >= self.cutoff:
                break
            item = None
            if place_id in self.place_id_items:
                item = self.wd.QtoItemPage(self.place_id_items[place_id])
            if item or self.new:
                self.process_single_object(entry_data, item)
                count += 1

    def process_single_object(self, data, item):
        """
        Process a single Australian heritage object.

        :param data: dict of data for a single object
        :param item: Wikidata item associated with an object, or None if one
            should be created.
        """
        if not self.demo:
            item = item or self.create_new_place_id_item(data)
            item.exists()  # load the item contents

        # Determine claims
        labels = self.make_labels(data)
        descriptions = self.make_descriptions(data)
        protoclaims = self.make_protoclaims(data)
        ref = self.ref[self.get_heritage_type(data['type'])]

        # Upload claims
        if self.demo:
            self.preview_data.append(
                PreviewItem(labels, descriptions, protoclaims, item, ref))
        else:
            self.commit_labels(labels, item)
            self.commit_descriptions(descriptions, item)
            self.commit_claims(protoclaims, item, ref)

    def create_new_place_id_item(self, data):
        """
        Create a new place_id item with some basic info and return it.

        :param data: dict of data for a single object
        :return: pywikibot.ItemPage
        """
        labels = helpers.convert_language_dict_to_json(self.make_labels(data),
                                                       typ='labels')
        desc = helpers.convert_language_dict_to_json(
            self.make_descriptions(data), typ='descriptions')
        id_claim = self.wd.make_simple_claim(self.place_id_p,
                                             data.get('place_id'))
        item_data = {
            "labels": labels,
            "descriptions": desc,
            "claims": [
                id_claim.toJSON(),
            ]
        }
        try:
            return self.wd.make_new_item(item_data, EDIT_SUMMARY)
        except pywikibot.data.api.APIError as e:
            raise pywikibot.Error('Error during item creation: {:s}'.format(e))

    def make_labels(self, data):
        """
        Make a label object from the available info.

        :param data: dict of data for a single object
        :return: label dict
        """
        labels = {}
        name = data.get('name')
        if name:
            labels['en'] = [
                # NOTE(review): both replace() arguments render as a plain
                # space here — likely an NBSP or double-space cleanup in the
                # original; verify the actual characters
                name.replace(' ', ' ').strip(),
            ]
        return labels

    def make_descriptions(self, data):
        """
        Make a description object in English.

        Address is partitioned so as to include the place name and
        territory/state in case these are not included anywhere later.

        :param data: dict of data for a single object
        :return: description object
        """
        text = '{heritage_type} {list_type} heritage site in {address}'
        descriptions = {
            'en': text.format(heritage_type=data['class'].lower(),
                              list_type=self.get_heritage_type(data['type']),
                              address=data['address'].rpartition(',')[2].strip())
        }
        return descriptions

    def commit_labels(self, labels, item):
        """
        Add labels and aliases to item.

        :param labels: label object
        :param item: item to add labels to
        """
        if labels:
            self.wd.add_multiple_label_or_alias(labels, item,
                                                case_sensitive=False)

    def commit_descriptions(self, descriptions, item):
        """
        Add descriptions to item.

        :param descriptions: description object
        :param item: item to add descriptions to
        """
        if descriptions:
            self.wd.add_multiple_descriptions(descriptions, item)

    def commit_claims(self, protoclaims, item, default_ref):
        """
        Add each claim (if new) and source it.

        :param protoclaims: a dict of claims with
            key: Prop number
            val: Statement|list of Statements
        :param item: the target entity
        :param default_ref: main/default reference to use
        """
        for prop, statements in protoclaims.items():
            if statements:
                statements = helpers.listify(statements)
                statements = set(statements)  # eliminate potential duplicates
                for statement in statements:
                    # check if None or a Statement(None)
                    if (statement is not None) and (not statement.isNone()):
                        # use internal reference if present, else the general
                        ref = statement.ref or default_ref
                        self.wd.addNewClaim(prop, statement, item, ref)
                        # reload item so that next call is aware of changes
                        item = self.wd.QtoItemPage(item.title())
                        item.exists()

    def make_protoclaims(self, data):
        """
        Construct potential claims for an entry.

        :param data: dict of data for a single heritage object
        """
        protoclaims = dict()

        # P17: country
        protoclaims['P17'] = WdS.Statement(self.country)

        # P1435: heritage status, qualified with a start date when known
        heritage_type = self.get_heritage_type(data.get('type'))
        statement = WdS.Statement(self.status[heritage_type])
        if data.get('register_date'):
            statement.addQualifier(
                WdS.Qualifier('P580',
                              self.parse_date(data.get('register_date'))))
        protoclaims['P1435'] = statement

        # P31: class
        protoclaims['P31'] = WdS.Statement(
            self.instance_type[data.get('class').lower()])

        # P3008: place_id
        protoclaims[self.place_id_p] = WdS.Statement(data['place_id'])

        # P131: state
        protoclaims['P131'] = WdS.Statement(
            self.get_state(data['state'], data['address']))

        # P2046: area
        if data.get('hectares'):
            protoclaims['P2046'] = WdS.Statement(
                pywikibot.WbQuantity(data['hectares'],
                                     unit=self.hectares,
                                     site=self.wd.repo))

        # P969: address
        # only full street addresses (with a comma) are worth keeping
        if ',' in data['address']:
            protoclaims['P969'] = WdS.Statement(data['address'])

        # P276: place
        protoclaims['P276'] = WdS.Statement(
            self.get_place(data['state'], data['address']))

        # P625: coordinate
        if data.get('lat') and data.get('lon'):
            protoclaims['P625'] = self.get_coordinate_statement(
                data.get('lat'), data.get('lon'), heritage_type)

        return protoclaims

    def get_coordinate_statement(self, lat, lon, heritage_type):
        """Construct a Statement for the provided coordinates.

        Sourced with the coordinate-specific reference for the heritage type.
        """
        statement = WdS.Statement(
            pywikibot.Coordinate(float(lat), float(lon), globe='earth',
                                 precision=DEFAULT_PREC))
        statement.add_reference(self.coord_ref[heritage_type])
        return statement

    def get_heritage_type(self, typ):
        """Determine which heritage type the object is.

        :param typ: raw type value, expected to start with a known Q-id
        :return: 'commonwealth'|'national'|None
        """
        heritage_type = None
        if typ.startswith('Q1116950'):
            heritage_type = 'commonwealth'
        elif typ.startswith('Q781601'):
            heritage_type = 'national'
        else:
            pywikibot.error('Unrecognized status: {0}'.format(typ))
        return heritage_type

    def parse_date(self, date):
        """Convert date in DD-MMM-YYYY format to WbTime."""
        months = [
            'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
            'AUG', 'SEP', 'OCT', 'NOV', 'DEC'
        ]
        dd, mmm, yyyy = date.split('-')
        iso = '{year}-{month:02d}-{day:02d}'.format(
            year=yyyy, day=int(dd), month=months.index(mmm) + 1)
        return helpers.iso_to_WbTime(iso)

    def get_place(self, state, address):
        """
        Determine which settlement the object is in.

        The format of address is "street, place STATE_ISO".
        Only returns a match if exactly one candidate settlement with that
        name exists in the resolved state.
        """
        # strip the trailing state code from the last address component
        place = address.rpartition(',')[2][:-len(state)].strip()
        state_item = self.get_state(state, address)
        if place in self.settlements and state_item:
            hits = []
            for candidate in self.settlements[place]:
                if candidate['state'] == state_item.id:
                    hits.append(candidate['qid'])
            if len(set(hits)) == 1:
                return self.wd.QtoItemPage(hits[0])

    def get_state(self, state, address):
        """Determine which state/territory the object is in.

        EXT and OS codes are resolved by matching the address suffix against
        the alternatives in the corresponding sub-maps.
        """
        state_item = None
        if state not in self.states:
            pywikibot.error('Unrecognized state: {0}'.format(state))
        elif state == 'EXT':
            address = address[:-len('EXT')].strip()
            for key, v in self.states['EXT'].items():
                # keys are '|'-separated alternative place names
                if any(address.endswith(k) for k in key.split('|')):
                    state_item = v
                    break
        elif state == 'OS':
            for k, v in self.states['OS'].items():
                if address.endswith(k):
                    state_item = v
                    break
        else:
            state_item = self.states[state]
        return state_item
class WikidataItem(object):
    """Data object for a single item to be uploaded to Wikidata."""

    def __init__(self, db_row_dict, repository, data_files, existing, caches):
        """
        Initialize the data object.

        :param db_row_dict: raw data from the data source
        :param repository: data repository (Wikidata site)
        :param data_files: dict of mapping files; must contain "properties"
        :param existing: WD items that already have a unique id
        :param caches: shared cache dictionaries
        """
        self.repo = repository
        self.existing = existing
        self.wdstuff = WDS(self.repo)
        self.raw_data = db_row_dict
        self.caches = caches
        self.props = data_files["properties"]
        self.construct_wd_item()
        # fix: problem_report was previously initialised twice in __init__;
        # a single initialisation (after construct_wd_item) is sufficient
        self.problem_report = {}

    def get_caches(self):
        """Return the shared cache dictionaries."""
        return self.caches

    def make_q_item(self, qnumber):
        """
        Create a regular Wikidata ItemPage.

        :param qnumber: Q-item to get an ItemPage of
        :return: an ItemPage for pywikibot
        """
        return self.wdstuff.QtoItemPage(qnumber)

    def make_pywikibot_item(self, value):
        """
        Create a statement value in pywikibot-ready format.

        Handles Q-id strings, the special "novalue" string, monolingual
        text dicts, quantity dicts (with optional unit), date dicts and
        plain values (passed through unchanged). A single-element list is
        unwrapped first.

        :param value: the raw content for the statement
        :return: a pywikibot object or the value itself
        """
        val_item = None
        if isinstance(value, list) and len(value) == 1:
            value = value[0]
        if utils.string_is_q_item(value):
            val_item = self.make_q_item(value)
        elif value == "novalue":
            # special marker; handled downstream by make_statement
            # (fix: a second, identical "novalue" branch further down was
            # unreachable and has been removed)
            val_item = value
        elif isinstance(value, dict) and 'monolingual_value' in value:
            text = value['monolingual_value']
            language = value['lang']
            val_item = pywikibot.WbMonolingualText(text=text,
                                                   language=language)
        elif isinstance(value, dict) and 'quantity_value' in value:
            number = value['quantity_value']
            if 'unit' in value:
                unit = self.wdstuff.QtoItemPage(value["unit"])
            else:
                unit = None
            val_item = pywikibot.WbQuantity(amount=number,
                                            unit=unit,
                                            site=self.repo)
        elif isinstance(value, dict) and 'date_value' in value:
            date_dict = value["date_value"]
            val_item = pywikibot.WbTime(year=date_dict.get("year"),
                                        month=date_dict.get("month"),
                                        day=date_dict.get("day"))
        else:
            val_item = value
        return val_item

    def make_statement(self, value):
        """
        Create a WikidataStuff Statement.

        Supports the special values 'somevalue' and 'novalue'.

        :param value: the content of the statement
        :return: a wikidatastuff Statement
        """
        if value in ['somevalue', 'novalue']:
            special = True
        else:
            special = False
        return self.wdstuff.Statement(value, special=special)

    def make_qualifier_applies_to(self, value):
        """
        Create an 'applies to part' qualifier.

        :param value: Q-item that this applies to
        :return: a wikidatastuff Qualifier
        """
        prop_item = self.props["applies_to_part"]
        target_item = self.wdstuff.QtoItemPage(value)
        return self.wdstuff.Qualifier(prop_item, target_item)

    def add_statement(self, prop_name, value, quals=None, ref=None):
        """
        Add a statement to the data object.

        :param prop_name: name of the property (key in the props file)
        :param value: the statement content
        :param quals: optional qualifier(s)
        :param ref: optional reference
        """
        base = self.wd_item["statements"]
        prop = self.props[prop_name]
        if quals is None:
            quals = []
        wd_claim = self.make_pywikibot_item(value)
        statement = self.make_statement(wd_claim)
        for qual in helpers.listify(quals):
            statement.addQualifier(qual)
        base.append({"prop": prop, "value": statement, "ref": ref})

    def make_stated_in_ref(self, value, pub_date=None,
                           ref_url=None, retrieved_date=None):
        """
        Make a 'stated in' reference, optionally with url/dates.

        :param value: Q-id of the source item
        :param pub_date: publication date (Y-m-d string), if any
        :param ref_url: reference url, if any
        :param retrieved_date: retrieval date (Y-m-d string), if any
        :return: a wikidatastuff Reference
        """
        item_prop = self.props["stated_in"]
        published_prop = self.props["publication_date"]
        published_claim = None
        if pub_date:
            pub_date = utils.date_to_dict(pub_date, "%Y-%m-%d")
            timestamp = self.make_pywikibot_item({"date_value": pub_date})
            published_claim = self.wdstuff.make_simple_claim(
                published_prop, timestamp)
        source_item = self.wdstuff.QtoItemPage(value)
        source_claim = self.wdstuff.make_simple_claim(item_prop, source_item)
        if ref_url and retrieved_date:
            ref_url_prop = self.props["reference_url"]
            retrieved_date_prop = self.props["retrieved"]
            retrieved_date = utils.date_to_dict(retrieved_date, "%Y-%m-%d")
            retrieved_date = self.make_pywikibot_item(
                {"date_value": retrieved_date})
            ref_url_claim = self.wdstuff.make_simple_claim(
                ref_url_prop, ref_url)
            retrieved_on_claim = self.wdstuff.make_simple_claim(
                retrieved_date_prop, retrieved_date)
            if published_claim:
                ref = self.wdstuff.Reference(
                    source_test=[source_claim, ref_url_claim],
                    source_notest=[published_claim, retrieved_on_claim])
            else:
                ref = self.wdstuff.Reference(
                    source_test=[source_claim, ref_url_claim],
                    source_notest=[retrieved_on_claim])
        else:
            # NOTE(review): source_notest is a bare claim (or None) here but
            # a list above — presumably Reference listifies; verify
            ref = self.wdstuff.Reference(source_test=[source_claim],
                                         source_notest=published_claim)
        return ref

    def associate_wd_item(self, wd_item):
        """Associate the data object with an existing Wikidata item."""
        if wd_item is not None:
            self.wd_item["wd-item"] = wd_item

    def set_upload(self, booln):
        """Set whether this object should be uploaded."""
        self.wd_item["upload"] = booln

    def add_label(self, language, text):
        """Add a label in the given language."""
        base = self.wd_item["labels"]
        base.append({"language": language, "value": text})

    def add_description(self, language, text):
        """Add a description in the given language."""
        base = self.wd_item["descriptions"]
        base.append({"language": language, "value": text})

    def add_to_report(self, key_name, raw_data, id_no, prop_name=None):
        """
        Add data to problem report json.

        Check if item has an associated Q-number, and if that's the case
        and it's missing in the report, add it to the report automatically.
        Optionally, assign a Property ID that the data should have been
        used as a value for.

        :param key_name: name of the field containing the problematic data,
            e.g. the header of the column
        :type key_name: string
        :param raw_data: the data that we failed to process
        :type raw_data: string
        :param id_no: unique id assigned to item, e.g. url
        :type id_no: string
        :param prop_name: name of the property, as stated in the props
            library file
        :type prop_name: string
        """
        prop = None
        if prop_name:
            if prop_name.startswith('_'):
                # underscore-prefixed names are pseudo-properties,
                # used verbatim
                prop = prop_name
            else:
                prop = self.props.get(prop_name)
        self.problem_report[key_name] = {"value": raw_data, "target": prop}
        if "wd-item" not in self.problem_report:
            if self.wd_item["wd-item"] is not None:
                self.problem_report["Q"] = self.wd_item["wd-item"]
            else:
                self.problem_report["Q"] = ""
        self.problem_report["url"] = id_no

    def print_report(self):
        """Print the problem report on screen."""
        print(
            json.dumps(self.problem_report,
                       sort_keys=True,
                       indent=4,
                       ensure_ascii=False,
                       default=utils.datetime_convert))

    def get_report(self):
        """Retrieve the problem report."""
        return self.problem_report

    def construct_wd_item(self):
        """Initialise the empty wd_item holder dict."""
        self.wd_item = {}
        self.wd_item["upload"] = True
        self.wd_item["statements"] = []
        self.wd_item["labels"] = []
        self.wd_item["descriptions"] = []
        self.wd_item["wd-item"] = None
class RiksdagsBot(object):
    """Bot to enrich and add information on Wikidata based on Riksdag info."""

    EDIT_SUMMARY = 'RiksdagsBot'
    FUTURE_YEAR = 2016  # dates in this year or later are the future
    names = {  # a dict of found first/last_name_Q lookups
        u'lastName': {},
        u'firstName': {}}
    current_id = ''  # for debugging

    def __init__(self, dictGenerator, verbose=False):
        """Instantiate a RiksdagsBot object.

        param dictGenerator: A generator that yields Dict objects.
        param verbose: If Bot should operate in Verbose mode, default=False
        """
        self.generator = dictGenerator
        self.repo = pywikibot.Site().data_repository()
        self.cutoff = None
        self.verbose = verbose

        # load mappings
        self.mappings = helpers.load_json_file('mappings.json',
                                               force_path=__file__)
        # trigger wdq query
        self.itemIds = helpers.fill_cache(RIKSDAG_ID_P)

        # set up WikidataStuff object
        self.wd = WD(self.repo)

    def run(self):
        """Start the bot."""
        # run over all matches (up to cutoff)
        # for each fetch the riksdagsdata json
        # load the data then data = data['personlista']['person']
        # send the data for processing and receive claims
        # store the updates and continue with next
        raise NotImplementedError("Please Implement this method")

    def extractStatements(self, riksdagdata):
        """Extract possible statements from the riksdag data.

        param riksdagsdata: a dict
        return dict of properties and statements
        """
        riksdagId = riksdagdata['intressent_id']
        self.current_id = riksdagId

        # Handle statements
        protoclaims = {}
        protoclaims[GENDER_P] = self.matchGender(riksdagdata['kon'])
        protoclaims[PARTY_P] = self.matchParty(riksdagdata['parti'])
        protoclaims[LAST_NAME_P] = self.matchName(
            riksdagdata['efternamn'], 'lastName')
        protoclaims[FIRST_NAME_P] = self.matchName(
            riksdagdata['tilltalsnamn'], 'firstName')
        protoclaims[BIRTH_DATE_P] = self.matchBirth(riksdagdata['fodd_ar'])
        protoclaims[DEATH_DATE_P] = self.matchDeath(riksdagdata['status'])
        # position data is inconsistent as single entries are sometimes
        # not in a list. hence the listify
        protoclaims[POSITION_P] = self.handlePositions(
            helpers.listify(riksdagdata['personuppdrag']['uppdrag']))
        # valkrets
        # personuppgifter

        # Handle aliases
        # Note that this gives a mistake in names such as "A von B" since
        # the "von" is not part of the sort key.
        fullName = helpers.reorder_names(riksdagdata['sorteringsnamn'])
        iortAlias = self.makeIortAlias(riksdagdata['iort'], fullName)
        # NOTE(review): if reorder_names returns a str, set(fullName) yields
        # a set of single characters rather than {fullName} — verify the
        # return type of reorder_names
        names = set(fullName)
        if iortAlias:
            names.add(iortAlias)
        names = list(names)

        return protoclaims, names

    def matchName(self, value, nameType):
        """Match value of name against its wikidata entity.

        param value: str|unicode
        param nameType: str|unicode
        return: WD.Statement|None
        """
        item = helpers.match_name(value, nameType, self.wd)
        if item:
            return WD.Statement(item)
        return None

    def matchGender(self, value):
        """Match value of gender against known mappings.

        param value: str|unicode
        return: WD.Statement|None
        """
        if value in self.mappings['kon']['Q'].keys():
            item = self.wd.QtoItemPage(self.mappings['kon']['Q'][value])
            return WD.Statement(item)
        return None

    def matchParty(self, value):
        """Match value of political party against known mappings.

        param value: str|unicode
        return: WD.Statement|None
        """
        if value in self.mappings['parti']['Q'].keys():
            item = self.wd.QtoItemPage(self.mappings['parti']['Q'][value])
            return WD.Statement(item)
        elif value in self.mappings['parti']['skip'].keys() or value is None:
            return None
        else:
            # surface unmapped parties so the mapping file can be extended
            pywikibot.output(u'Encountered an unknown political party: %s (%s)'
                             % (value, self.current_id))
            return None

    def matchBirth(self, value):
        """Convert value of birth to statement.

        param value: str|unicode
        return: WD.Statement|None
        """
        if value is None or not value.strip():
            return None
        return WD.Statement(helpers.iso_to_WbTime(value))

    def matchDeath(self, value):
        """Extract death date from status.

        param value: str|unicode
        return: WD.Statement|None
        """
        # the status field encodes a death as 'Avliden <iso date>'
        if value and value.startswith('Avliden'):
            value = value[len('Avliden'):].strip()
            return WD.Statement(helpers.iso_to_WbTime(value))
        return None

    def makeIortAlias(self, iort, name):
        """Use iort info to create an alias.

        param iort: str|unicode|None
        param name: str|unicode
        return: WD.Statement|None
        """
        if iort is None or not iort.strip():
            return None
        elif name is None or not name.strip():
            return None
        alias = u'%s i %s' % (name, iort)
        return alias

    def handlePositions(self, uppdragList):
        """Construct qualified statements for held positions.

        param uppdragList: list of uppdrag dicts
        return: list of position statements
        """
        uppdragStatements = []
        for uppdrag in uppdragList:
            # dispatch on the type of position
            if uppdrag['typ'] == u'kammaruppdrag':
                uppdragStatements.append(self.handleChamberPosition(uppdrag))
            elif uppdrag['typ'] == u'partiuppdrag':
                uppdragStatements.append(self.handlePartyPosition(uppdrag))
            elif uppdrag['typ'] == u'Departement':
                uppdragStatements.append(self.handleMinistryPosition(uppdrag))
            elif uppdrag['typ'] == u'uppdrag':
                uppdragStatements.append(self.handleCommitteePosition(uppdrag))
            elif uppdrag['typ'] == u'talmansuppdrag':
                uppdragStatements.append(self.handleSpeakerPosition(uppdrag))
            elif uppdrag['typ'] in (u'Riksdagsorgan', u'Europaparlamentet'):
                # consider getting stadsråd-departement from Riksdagsorgan
                # Europaparlamentet is likely very different
                pass
            else:
                pywikibot.output('uppdrag-typ-roll: %s-%s-%s (%s)'
                                 % (uppdrag['typ'], uppdrag['roll_kod'],
                                    uppdrag['uppgift'], self.current_id))
                pass
        return uppdragStatements

    def handleChamberPosition(self, uppdrag):
        """Process positions as member of Parliament.

        param uppdrag: dict
        return WD.Statement|None
        """
        # only considered some positions
        roleMap = 'kammar_roll'
        roleCode = uppdrag['roll_kod']
        if not self.test_role_code(roleCode, roleMap, uppdrag):
            return None

        # only keep certain statuses
        if uppdrag['status'] not in self.mappings['kammar_status']['keep']:
            if uppdrag['status'] not in self.mappings['kammar_status']['skip']:
                pywikibot.output(u'Unknown status: %s (%s-%s)'
                                 % (uppdrag['status'], uppdrag['typ'],
                                    self.current_id))
            return None

        # expect uppgift = None but keep a note of any new ones
        badComments = self.mappings['kammar_uppgift']['skip']
        if uppdrag['uppgift'] is not None:
            if uppdrag['uppgift'] not in badComments:
                pywikibot.output(u'Non-None uppgift: %s (%s-%s)'
                                 % (uppdrag['uppgift'], uppdrag['typ'],
                                    self.current_id))
            return None

        # create statement based on role
        qNo = self.mappings[roleMap]['Q'][roleCode]
        statement = WD.Statement(self.wd.QtoItemPage(qNo))

        # add standard qualifiers
        helpers.add_start_end_qualifiers(
            statement, uppdrag['from'], self.notFuture(uppdrag['tom']))
        self.addOrdinal(uppdrag['ordningsnummer'], statement)
        return statement

    def handlePartyPosition(self, uppdrag):
        """Process positions within a party.

        param uppdrag: dict
        return WD.Statement|None
        """
        return self.unifiedPositionHandler(uppdrag, 'parti_roll', 'parti')

    def handleMinistryPosition(self, uppdrag):
        """Process positions within a Ministry.

        Skips adding the ministry since this is normally implicit in the
        title. This is not the case for some of the skipped roles.

        param uppdrag: dict
        return WD.Statement|None
        """
        return self.unifiedPositionHandler(uppdrag, 'departement_roll', None)

    def handleCommitteePosition(self, uppdrag):
        """Process positions within a Committee of the Riksdag.

        param uppdrag: dict
        return WD.Statement|None
        """
        return self.unifiedPositionHandler(uppdrag, 'utskott_roll', 'utskott')

    def handleSpeakerPosition(self, uppdrag):
        """Process positions as Speaker of the Riksdag.

        param uppdrag: dict
        return WD.Statement|None
        """
        return self.unifiedPositionHandler(uppdrag, 'talman_roll', None)

    def unifiedPositionHandler(self, uppdrag, roleMap, entityMap):
        """Process position based on known mappings.

        Process positions within a Committee of the Riksdag or a party.

        param uppdrag: dict
        param roleMap: str, key within mappings.json
        param entityMap: str|None, key within mappings.json, skip if None
        return WD.Statement|None
        """
        # only considered some positions
        roleCode = uppdrag['roll_kod']
        if not self.test_role_code(roleCode, roleMap, uppdrag):
            return None

        # expect status = None
        if uppdrag['status'] is not None:
            pywikibot.output('Non-none status: %s (%s-%s)'
                             % (uppdrag['status'], uppdrag['typ'],
                                self.current_id))
            return None

        # create statement based on role
        qNo = self.mappings[roleMap]['Q'][roleCode]
        statement = WD.Statement(self.wd.QtoItemPage(qNo))

        # identify entity
        if entityMap:
            entityCode = uppdrag['organ_kod'].upper()
            if entityCode in self.mappings[entityMap]['Q'].keys():
                qNo = self.mappings[entityMap]['Q'][entityCode]
                qual = WD.Qualifier(
                    P=OF_P,
                    itis=self.wd.QtoItemPage(qNo))
                statement.addQualifier(qual)
            else:
                pywikibot.output('Unknown entity: %s-%s (%s-%s)'
                                 % (entityCode, uppdrag['uppgift'],
                                    uppdrag['typ'], self.current_id))

        # add standard qualifiers
        helpers.add_start_end_qualifiers(
            statement, uppdrag['from'], self.notFuture(uppdrag['tom']))
        self.addOrdinal(uppdrag['ordningsnummer'], statement)
        return statement

    def test_role_code(self, role_code, role_map, uppdrag):
        """Test if a role_code is mapped and not marked for skipping.

        Outputs if an unmapped role was found.

        param role_code: str
        param role_map: str, key within mappings.json
        param uppdrag: dict
        return: bool
        """
        if role_code not in self.mappings[role_map]['Q'].keys():
            if role_code not in self.mappings[role_map]['skip'].keys():
                pywikibot.output('Unknown role: %s (%s-%s)'
                                 % (role_code, uppdrag['typ'],
                                    self.current_id))
            return False
        return True

    def notFuture(self, date):
        """Check that a date is not in the future.

        Checks if a date is in the future. If so returns None, else
        returns the string.

        TODO: Remake this using current date and comparing down to day

        param date: ISO date string
        return: str|None
        """
        if helpers.is_int(date[:4]) and int(date[:4]) < self.FUTURE_YEAR:
            return date
        return None

    def addOrdinal(self, value, statement):
        """Add an ordinal qualifier.

        Adds the ordinal as a qualifier if it is non-zero.

        param value: ordinal
        param statement: statement to add qualifier to
        """
        if value != '0':
            qual = WD.Qualifier(
                P=ORDINAL_P,
                itis=int(value))
            statement.addQualifier(qual)

    def testRun(self):
        """Run a test with hardcoded local files."""
        dataFiles = (
            # u'0574555227504.json',
            # u'0787533297400.json',  # a death
            # u'0643844865712.json',  # a von, ersättare, utskott
            # u'0108961111006.json',  # an iort
            # u'0284192765516.json',  # many different positions/roles
            # various issues (same position double ministries), one day
            # positions, overlaps etc.
            u'0956595590924.json',
        )
        for dataFile in dataFiles:
            data = helpers.load_json_file(dataFile)
            data = data['personlista']['person']
            self.extractStatements(data)

    def testRun2(self):
        """Run a test with a folder of local files."""
        dataFiles = helpers.find_files(u'persons', ('.json',), False)
        # print len(dataFiles)
        for dataFile in dataFiles:
            data = helpers.load_json_file(dataFile)
            try:
                data = data['person']
                self.extractStatements(data)
            except KeyError:
                pywikibot.output("%s contains no data" % dataFile)
class WikidataItem(object):
    """Data object for a single item to be uploaded to Wikidata."""

    def __init__(self, db_row_dict, repository, data_files, existing):
        """
        Initialize the data object.

        :param db_row_dict: raw data from the data source
        :param repository: data repository (Wikidata site)
        :param data_files: dict of mapping files; must contain "properties"
        :param existing: WD items that already have a unique id
        """
        self.repo = repository
        self.existing = existing
        self.wdstuff = WDS(self.repo)
        self.raw_data = db_row_dict
        self.props = data_files["properties"]
        self.construct_wd_item()
        self.problem_report = {}

    def make_q_item(self, qnumber):
        """
        Create a regular Wikidata ItemPage.

        :param qnumber: Q-item to get an ItemPage of
        :return: an ItemPage for pywikibot
        """
        return self.wdstuff.QtoItemPage(qnumber)

    def make_pywikibot_item(self, value):
        """
        Create a statement value in pywikibot-ready format.

        Handles Q-id strings, the special "novalue" string, quantity dicts
        (with optional unit), date dicts and plain values (passed through
        unchanged). A single-element list is unwrapped first.

        :param value: the raw content for the statement
        :return: a pywikibot object or the value itself
        """
        val_item = None
        if isinstance(value, list) and len(value) == 1:
            value = value[0]
        if utils.string_is_q_item(value):
            val_item = self.make_q_item(value)
        elif value == "novalue":
            # special marker; handled downstream by make_statement
            # (fix: a second, identical "novalue" branch further down was
            # unreachable and has been removed)
            val_item = value
        elif isinstance(value, dict) and 'quantity_value' in value:
            number = value['quantity_value']
            if 'unit' in value:
                unit = self.wdstuff.QtoItemPage(value["unit"])
            else:
                unit = None
            val_item = pywikibot.WbQuantity(amount=number,
                                            unit=unit,
                                            site=self.repo)
        elif isinstance(value, dict) and 'date_value' in value:
            date_dict = value["date_value"]
            val_item = pywikibot.WbTime(year=date_dict.get("year"),
                                        month=date_dict.get("month"),
                                        day=date_dict.get("day"))
        else:
            val_item = value
        return val_item

    def make_statement(self, value):
        """
        Create a WikidataStuff Statement.

        Supports the special values 'somevalue' and 'novalue'.

        :param value: the content of the statement
        :return: a wikidatastuff Statement
        """
        if value in ['somevalue', 'novalue']:
            special = True
        else:
            special = False
        return self.wdstuff.Statement(value, special=special)

    def make_qualifier_applies_to(self, value):
        """
        Create an 'applies to part' qualifier.

        :param value: Q-item that this applies to
        :return: a wikidatastuff Qualifier
        """
        prop_item = self.props["applies_to_part"]
        target_item = self.wdstuff.QtoItemPage(value)
        return self.wdstuff.Qualifier(prop_item, target_item)

    def add_statement(self, prop_name, value, quals=None, ref=None):
        """
        Add a statement to the data object.

        :param prop_name: name of the property (key in the props file)
        :param value: the statement content
        :param quals: optional qualifier(s)
        :param ref: optional reference
        """
        base = self.wd_item["statements"]
        prop = self.props[prop_name]
        if quals is None:
            quals = []
        wd_claim = self.make_pywikibot_item(value)
        statement = self.make_statement(wd_claim)
        for qual in helpers.listify(quals):
            statement.addQualifier(qual)
        base.append({"prop": prop, "value": statement, "ref": ref})

    def make_stated_in_ref(self, value, pub_date,
                           ref_url=None, retrieved_date=None):
        """
        Make a 'stated in' reference, optionally with url/retrieval date.

        :param value: Q-id of the source item
        :param pub_date: publication date (Y-m-d string)
        :param ref_url: reference url, if any
        :param retrieved_date: retrieval date (Y-m-d string), if any
        :return: a wikidatastuff Reference
        """
        item_prop = self.props["stated_in"]
        published_prop = self.props["publication_date"]
        pub_date = utils.date_to_dict(pub_date, "%Y-%m-%d")
        timestamp = self.make_pywikibot_item({"date_value": pub_date})
        published_claim = self.wdstuff.make_simple_claim(
            published_prop, timestamp)
        source_item = self.wdstuff.QtoItemPage(value)
        source_claim = self.wdstuff.make_simple_claim(item_prop, source_item)
        if ref_url and retrieved_date:
            ref_url_prop = self.props["reference_url"]
            retrieved_date_prop = self.props["retrieved"]
            retrieved_date = utils.date_to_dict(retrieved_date, "%Y-%m-%d")
            retrieved_date = self.make_pywikibot_item(
                {"date_value": retrieved_date})
            ref_url_claim = self.wdstuff.make_simple_claim(
                ref_url_prop, ref_url)
            retrieved_on_claim = self.wdstuff.make_simple_claim(
                retrieved_date_prop, retrieved_date)
            ref = self.wdstuff.Reference(
                source_test=[source_claim, ref_url_claim],
                source_notest=[published_claim, retrieved_on_claim])
        else:
            # NOTE(review): source_notest is a bare claim here but a list
            # above — presumably Reference listifies; verify
            ref = self.wdstuff.Reference(source_test=[source_claim],
                                         source_notest=published_claim)
        return ref

    def associate_wd_item(self, wd_item):
        """Associate the data object with an existing Wikidata item."""
        if wd_item is not None:
            self.wd_item["wd-item"] = wd_item

    def add_label(self, language, text):
        """Add a label in the given language."""
        base = self.wd_item["labels"]
        base.append({"language": language, "value": text})

    def add_description(self, language, text):
        """Add a description in the given language."""
        base = self.wd_item["descriptions"]
        base.append({"language": language, "value": text})

    def construct_wd_item(self):
        """Initialise the empty wd_item holder dict."""
        self.wd_item = {}
        self.wd_item["upload"] = True
        self.wd_item["statements"] = []
        self.wd_item["labels"] = []
        self.wd_item["descriptions"] = []
        self.wd_item["wd-item"] = None
class WikidataItem(object):
    """Basic data object for upload to Wikidata."""

    def __init__(self, db_row_dict, repository, data_files, existing):
        """
        Initialize the data object.

        :param db_row_dict: raw data from the data source
        :type db_row_dict: string
        :param repository: data repository (Wikidata site)
        :type repository: site instance
        :param data_files: dict of various mapping files
        :type data_files: dictionary
        :param existing: WD items that already have an unique id
        :type existing: dictionary
        """
        self.repo = repository
        self.existing = existing
        self.wdstuff = WDS(self.repo)
        self.raw_data = db_row_dict
        self.props = data_files["properties"]
        self.items = data_files["items"]
        self.construct_wd_item()
        self.problem_report = {}

    def make_q_item(self, qnumber):
        """
        Create a regular Wikidata ItemPage.

        :param qnumber: Q-item that we want to get an ItemPage of
        :type qnumber: string
        :return: an ItemPage for pywikibot
        """
        return self.wdstuff.QtoItemPage(qnumber)

    def make_pywikibot_item(self, value):
        """
        Create a statement in pywikibot-ready format.

        The statement can be either:
        * a string (value is string)
        * an item (value is Q-string)
        * the special string "novalue" (passed through unchanged)
        * an amount with or without unit (value is dict)
        * a date (value is dict; missing month/day are tolerated)

        :param value: the content of the item
        :type value: it can be a string or a dictionary, see above.
        :return: a pywikibot item of the type determined by the input
            data, either ItemPage or Quantity or string.
        """
        val_item = None
        if isinstance(value, list) and len(value) == 1:
            value = value[0]
        if utils.string_is_q_item(value):
            val_item = self.make_q_item(value)
        elif value == "novalue":
            # special Wikidata snak type; handled by make_statement()
            val_item = value
        elif isinstance(value, dict) and 'quantity_value' in value:
            number = value['quantity_value']
            if 'unit' in value:
                unit = self.wdstuff.QtoItemPage(value["unit"])
            else:
                unit = None
            val_item = pywikibot.WbQuantity(
                amount=number, unit=unit, site=self.repo)
        elif isinstance(value, dict) and 'date_value' in value:
            # use .get() so a year-only date does not raise KeyError
            date_dict = value["date_value"]
            val_item = pywikibot.WbTime(year=date_dict.get("year"),
                                        month=date_dict.get("month"),
                                        day=date_dict.get("day"))
        else:
            # NOTE: an unreachable duplicate `value == "novalue"` branch
            # (the earlier elif always catches it) has been removed here.
            val_item = value
        return val_item

    def make_statement(self, value):
        """
        Create a Wikidatastuff statement.

        Supports the special data types 'somevalue' and 'novalue'.

        :param value: the content of the statement
        :type value: pywikibot item
        :return: a wikidatastuff statement
        """
        special = value in ['somevalue', 'novalue']
        return self.wdstuff.Statement(value, special=special)

    def make_qualifier_applies_to(self, value):
        """
        Create a qualifier to a statement with type 'applies to part'.

        :param value: Q-item that this applies to
        :type value: string
        :return: a wikidatastuff Qualifier
        """
        prop_item = self.props["applies_to_part"]
        target_item = self.wdstuff.QtoItemPage(value)
        return self.wdstuff.Qualifier(prop_item, target_item)

    def add_statement(self, prop_name, value, quals=None, ref=None):
        """
        Add a statement to the data object.

        :param prop_name: P-item representing property
        :type prop_name: string
        :param value: content of the statement
        :type value: it can be a string representing a Q-item
            or a dictionary of an amount
        :param quals: possibly qualifier items
        :type quals: a wikidatastuff Qualifier item, or a list of them
        :param ref: reference item
        :type ref: a wikidatastuff Reference item
        """
        base = self.wd_item["statements"]
        prop = self.props[prop_name]
        if quals is None:
            quals = []
        wd_claim = self.make_pywikibot_item(value)
        statement = self.make_statement(wd_claim)
        for qual in helpers.listify(quals):
            statement.addQualifier(qual)
        base.append({"prop": prop, "value": statement, "ref": ref})

    def make_stated_in_ref(self, value, pub_date,
                           ref_url=None, retrieved_date=None):
        """
        Make a reference object of type 'stated in'.

        :param value: Q-item where sth is stated
        :type value: string
        :param pub_date: timestamp in format "1999-09-31"
        :type pub_date: string
        :param ref_url: optionally a reference url
        :type ref_url: string
        :param retrieved_date: timestamp in format "1999-09-31"
        :type retrieved_date: string
        :return: a wikidatastuff Reference item
        """
        item_prop = self.props["stated_in"]
        published_prop = self.props["publication_date"]
        pub_date = utils.date_to_dict(pub_date, "%Y-%m-%d")
        timestamp = self.make_pywikibot_item({"date_value": pub_date})
        published_claim = self.wdstuff.make_simple_claim(
            published_prop, timestamp)
        source_item = self.wdstuff.QtoItemPage(value)
        source_claim = self.wdstuff.make_simple_claim(item_prop, source_item)
        if ref_url and retrieved_date:
            ref_url_prop = self.props["reference_url"]
            retrieved_date_prop = self.props["retrieved"]
            retrieved_date = utils.date_to_dict(retrieved_date, "%Y-%m-%d")
            retrieved_date = self.make_pywikibot_item(
                {"date_value": retrieved_date})
            ref_url_claim = self.wdstuff.make_simple_claim(
                ref_url_prop, ref_url)
            retrieved_on_claim = self.wdstuff.make_simple_claim(
                retrieved_date_prop, retrieved_date)
            ref = self.wdstuff.Reference(
                source_test=[source_claim, ref_url_claim],
                source_notest=[published_claim, retrieved_on_claim])
        else:
            ref = self.wdstuff.Reference(
                source_test=[source_claim],
                source_notest=published_claim
            )
        return ref

    def associate_wd_item(self, wd_item):
        """
        Associate the data object with a Wikidata item.

        :param wd_item: Q-item that shall be assigned to the data object.
        :type wd_item: string
        """
        if wd_item is not None:
            self.wd_item["wd-item"] = wd_item
            print("Associated WD item: ", wd_item)

    def add_label(self, language, text):
        """
        Add a label in a specific language.

        :param language: code of language, e.g. "fi"
        :type language: string
        :param text: content of the label
        :type text: string
        """
        base = self.wd_item["labels"]
        base.append({"language": language, "value": text})

    def add_description(self, language, text):
        """
        Add a description in a specific language.

        :param language: code of language, e.g. "fi"
        :type language: string
        :param text: content of the description
        :type text: string
        """
        base = self.wd_item["descriptions"]
        base.append({"language": language, "value": text})

    def construct_wd_item(self):
        """
        Create the empty structure of the data object.

        This creates self.wd_item -- a dict container of all the
        data content of the item.
        """
        self.wd_item = {}
        self.wd_item["upload"] = True
        self.wd_item["statements"] = []
        self.wd_item["labels"] = []
        self.wd_item["descriptions"] = []
        self.wd_item["wd-item"] = None
class PaintingsImageBot:
    """Bot to enrich, and create, for items about paintings on Wikidata."""

    def __init__(self, dict_generator, people_items):
        """Initialise the bot.

        @param dict_generator: generator yielding
            (lido_data, qid, commons_file) tuples -- see run()
        @param people_items: mapping of external person ids to Q-ids,
            used by add_depicted_claim()
        """
        self.people_items = people_items
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.wd = WD(self.repo, edit_summary=EDIT_SUMMARY)

        # Set log file (appended next to this script)
        out_dir = path.join(path.split(__file__)[0])
        log_filename = path.join(out_dir, u'PaintingsImageBot.log')
        self.log = codecs.open(log_filename, 'a', 'utf-8')

    def run(self):
        """Start the robot."""
        self.creators = {}
        for painting_data in self.generator:
            # isolate ids
            lido_data, qid, commons_file = painting_data
            painting_item = self.wd.QtoItemPage(qid)
            self.process_painting(painting_item, lido_data, commons_file)

    def process_painting(self, item, lido_data, commons_file):
        """Process a single painting.

        Adds labels, image, depicted, inception and dimension claims,
        each referenced to the object's collection page.
        """
        item.exists()  # load the item
        obj_id_ref = self.make_obj_id_ref(lido_data.get('obj_id'))
        # lido_ref = self.make_lido_ref(lido_data)  # make a reference object
        self.check_and_add_labels(item, lido_data)
        self.add_image_claim(item, commons_file, obj_id_ref)
        self.add_depicted_claim(item, lido_data, obj_id_ref)
        self.add_date_claim(item, lido_data, obj_id_ref)
        self.add_dimension_claims(item, lido_data, obj_id_ref)

    def add_dimension_claims(self, item, lido_data, ref):
        """
        Add height/P2048 and width/P2049 claims.

        Only add non-framed measurements with just height and width.
        """
        height_p = u'P2048'
        width_p = u'P2049'
        # diameter_p = u'P2386'
        # thickness_p = u'P2610'
        dimensions = lido_data.get('measurements').get('_')  # non-framed

        # guard clauses: require unit + exactly height and width,
        # with a unit we know how to map to a Q-item
        if not dimensions or not dimensions.get('unit'):
            return None
        elif not dimensions.get('width') or not dimensions.get('height') \
                or dimensions.get('depth'):
            # skip complicated cases for now
            return None
        elif not helpers.get_unit_q(dimensions.get('unit')):
            pywikibot.output(u'"%s" is an unmapped unit' %
                             dimensions.get('unit'))
            return None

        # prepare all parts before adding claims
        unit = helpers.get_unit_q(dimensions.get('unit'))
        # unit = self.wd.QtoItemPage(unit)
        # NOTE(review): entity= with a url instead of unit= looks like a
        # workaround for the pywikibot version in use -- confirm before
        # changing
        unit = entity_url_hack(unit)
        height = pywikibot.WbQuantity(
            dimensions.get('height'),
            # unit=unit,
            entity=unit,
            site=self.wd.repo)
        width = pywikibot.WbQuantity(
            dimensions.get('width'),
            # unit=unit,
            entity=unit,
            site=self.wd.repo)

        # make claims
        self.wd.addNewClaim(height_p, WD.Statement(height), item, ref)
        self.wd.addNewClaim(width_p, WD.Statement(width), item, ref)

    def add_date_claim(self, item, lido_data, ref):
        """
        Add an inception/P571 claim.

        Only adds the claim if it's an exact year
        (earliest == latest in the source data).
        """
        prop = u'P571'
        creation_date = lido_data.get('creation_date')
        wb_date = None
        if not creation_date:
            return None

        # exact date
        if creation_date.get('earliest') and \
                creation_date.get('earliest') == creation_date.get('latest'):
            wb_date = helpers.iso_to_WbTime(creation_date.get('earliest'))

        # make claim
        if wb_date:
            self.wd.addNewClaim(prop, WD.Statement(wb_date), item, ref)

    def add_depicted_claim(self, item, lido_data, ref):
        """Add a depicted/P180.

        One claim per subject whose id is known in self.people_items.
        """
        prop = u'P180'
        if not lido_data.get('subjects'):
            return None
        for subject in lido_data.get('subjects'):
            nsid = subject.get(u'other_id')
            if nsid in self.people_items:
                person_item = self.wd.QtoItemPage(self.people_items[nsid])
                self.wd.addNewClaim(prop, WD.Statement(person_item),
                                    item, ref)

    def add_image_claim(self, item, commons_file, ref):
        """
        Add a image/P18 claim.

        Only adds it if there is None already.
        If one exists output to log.
        """
        prop = u'P18'
        if not commons_file:
            return
        file_page = pywikibot.FilePage(pywikibot.Site('commons', 'commons'),
                                       commons_file)

        # check if another image is already used;
        # only log a conflict when the existing claim differs
        if prop in item.claims and \
                not self.wd.has_claim(prop, file_page, item):
            self.log.write(
                u"%s already contains image claim: %s -> %s\n" %
                (item.title(),
                 item.claims.get(prop)[0].getTarget().title(),
                 file_page.title()))
        else:
            self.wd.addNewClaim(prop, WD.Statement(file_page), item, ref)

    def check_and_add_labels(self, item, lido_data):
        """Process the title field add to the item if needed.

        API errors (e.g. label conflicts) are logged, not raised.
        """
        if not lido_data.get('title'):
            return

        for lang, value in lido_data.get('title').iteritems():
            if lang == '_':  # '_' holds non-language data in this dataset
                continue
            try:
                self.wd.addLabelOrAlias(lang, value, item,
                                        caseSensitive=False)
            except pywikibot.data.api.APIError as e:
                self.log.write(u"%s: had an error: %s\n" %
                               (item.title(), e))

    def make_obj_id_ref(self, obj_id):
        """Make a reference object pointing to the objects collection page.

        @param obj_id: internal id of the object at nationalmuseum.se
        """
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?' \
              u'service=ExternalInterface&module=collection&' \
              u'objectId=%s&viewType=detailView' % obj_id
        return self.make_url_reference(uri)

    def make_url_reference(self, uri):
        """
        Make a Reference object with a retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        date = helpers.today_as_WbTime()
        ref = WD.Reference(source_test=self.wd.make_simple_claim(u'P854',
                                                                 uri),
                           source_notest=self.wd.make_simple_claim(
                               u'P813', date))
        return ref

    # Not implemented due to uncertainty on referencing individual xml files
    def make_lido_ref(self, lido_data):
        """
        Make a Reference object for the dataset.

        Contains 4 parts:
        * P248: Stated in <the Nationalmuseum dataset>
        * P577: Publication date <from creation date of the document>
        * P854: Reference url <using the input url>
        * P813: Retrieval date <current date>
        """
        # deliberately disabled; aborts the whole process if ever called
        exit()
        # P248: Nationalmuseum dataset
        xml_file = lido_data.get('source_file')
        date = helpers.today_as_WbTime()
        pub_date = helpers.iso_to_WbTime(u'2016-09-30')
        zip_url = u'https://github.com/NationalmuseumSWE/WikidataCollection/' \
                  u'blob/master/valid_items_transform_1677.tgz'
        ref = WD.Reference(source_test=[
            self.wd.make_simple_claim(u'P854', zip_url),
            self.wd.make_simple_claim(u'P577', pub_date),
            self.wd.make_simple_claim(u'P?', xml_file),
        ],
            source_notest=self.wd.make_simple_claim(
                u'P813', date))
        return ref
class PaintingsBot:
    """Bot to enrich, and create, for items about paintings on Wikidata."""

    def __init__(self, dict_generator, painting_id_prop, cache_max_age=0):
        """Initiate the bot, loading files and querying WDQ.

        @param dict_generator: The generator for the Europeana painting
            objects
        @type dict_generator: generator (that yields Dict objects).
        @param painting_id_prop: the P-id of the painting-id property
        @type painting_id_prop: str
        @param cache_max_age: Max age of local wdq cache, defaults to 0
        @type cache_max_age: int
        """
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.commons = pywikibot.Site(u'commons', u'commons')
        self.wd = WD(self.repo)
        self.add_new = False  # If new objects should be created
        self.skip_miniatures = True  # If (new) miniatures should be skipped

        # Load prefixes and find allowed collections
        collections = set([INSTITUTION_Q])
        self.mappings = helpers.load_json_file('mappings.json',
                                               force_path=__file__)
        self.prefix_map = self.mappings['prefix_map']
        self.bad_prefix = self.mappings['bad_prefix']
        for p, k in self.prefix_map.iteritems():
            if k['subcol'] is not None:
                collections.add(k['subcol'].strip('Q'))
        self.collections = list(collections)

        # Set log file
        self.log = codecs.open(u'nationalmuseumSE.log', 'a', 'utf-8')

        # Load creator dump file
        self.creator_dump = helpers.load_json_file('Oku_NM_arbetskopia.json',
                                                   force_path=__file__)

        # hard-coded anons e.g. "unknown swedish 17th century"
        anons = helpers.load_json_file('anons.json', force_path=__file__)

        # prepare WDQ painting query
        query = u'CLAIM[195:%s] AND CLAIM[%s]' % \
            (',195:'.join(self.collections), painting_id_prop)
        self.painting_ids = helpers.fill_cache(painting_id_prop,
                                               queryoverride=query,
                                               cache_max_age=cache_max_age)

        # prepare WDQ artist query (nat_mus_id - Q_id pairs)
        self.artist_ids = helpers.fill_cache('P2538',
                                             cache_max_age=cache_max_age)
        # add anons
        for a in anons:
            self.artist_ids[a] = ANON_Q

        self.painting_id_prop = 'P%s' % painting_id_prop

    def run(self):
        """Start the robot."""
        self.creators = {}

        for painting in self.generator:
            # isolate ids
            ids = painting['object']['proxies'][0]['dcIdentifier']['def']
            painting_id = ids[0].replace('Inv Nr.:', '').strip('( )')
            obj_id = ids[1]

            # Museum contains several sub-collections. Only handle mapped ones
            if painting_id.split(' ')[0] in self.prefix_map.keys():
                self.process_painting(painting, painting_id, obj_id)
            elif painting_id.split(' ')[0] not in self.bad_prefix:
                pywikibot.output(u'Skipped due to unknown collection: %s' %
                                 painting_id)

    def process_painting(self, painting, painting_id, obj_id):
        """Process a single painting.

        This will also create it if self.add_new is True.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param obj_id: the internal id of the painting in the Nationalmuseum
            database.
        @type obj_id: str
        """
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?service=' \
              u'ExternalInterface&module=collection&objectId=%s&viewType=' \
              u'detailView' % obj_id
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
            painting['object']['about']

        painting_item = None
        # newclaims = []
        if painting_id in self.painting_ids:
            painting_item = self.create_existing_painting(
                painting, painting_id)
        elif self.add_new and not (self.skip_miniatures and
                                   PaintingsBot.is_miniature(painting)):
            # if objection collection is allowed and
            # unless it is a miniature and we are skipping those
            painting_item = self.create_new_painting(painting, painting_id,
                                                     europeana_url, uri)

        # add new claims
        if painting_item and painting_item.exists():
            data = painting_item.get(force=True)
            claims = data.get('claims')

            # add natmus id claim
            self.add_natmus_id(painting_item, obj_id, uri)

            # add inventory number with collection
            self.add_inventory_and_collection_claim(painting_item,
                                                    painting_id,
                                                    painting, uri)

            # Instance_of
            if u'P31' not in claims:
                self.add_instanceof_claim(painting_item, painting_id,
                                          painting)

            # title (as claim)
            # commented out as the titles in Europeana are not reliable
            # if u'P1476' not in claims:
            #     self.add_title_claim(painting_item, painting)

            # Europeana_ID
            self.add_europeana_claim(painting_item, painting)

            # Check for potential images to add, if none is present
            if u'P18' not in claims:
                self.add_image_claim(painting_item, uri)

            # creator through Nat_mus_database dump
            self.add_natmus_creators(painting_item, obj_id, uri)
            # creator IFF through dbpedia
            # if u'P170' not in claims:
            #     self.add_dbpedia_creator(painting_item, painting)

    def add_title_claim(self, painting_item, painting):
        """Add a title/P1476 claim based on dcTitle.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        dc_title = painting['object']['proxies'][0]['dcTitle']
        titles = []
        for lang, title in dc_title.iteritems():
            titles.append(pywikibot.WbMonolingualText(title[0], lang))
        for title in titles:
            self.wd.addNewClaim(u'P1476', WD.Statement(title),
                                painting_item,
                                self.make_europeana_reference(painting))

    def add_locatedin_claim(self, painting_item, painting_id, painting):
        """Add a located_in/P276 claim based on sub-collection.

        No longer used as sub-collection does not match actual placing.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        place = self.prefix_map[painting_id.split(' ')[0]]['place']
        place_item = self.wd.QtoItemPage(place)
        self.wd.addNewClaim(u'P276', WD.Statement(place_item),
                            painting_item,
                            self.make_europeana_reference(painting))

    def add_dbpedia_creator(self, painting_item, painting):
        """Add a Creator/P170 claim through a dbpedia look-up.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        creator_id = None
        try:
            db_creator = painting['object']['proxies'][1]['dcCreator']['def']
            if len(db_creator) == 1:
                # skip anything more complex than one creator
                db_creator = db_creator[0].strip()
                if db_creator.startswith('http://dbpedia.org/resource/'):
                    # memoize dbpedia -> wikidata lookups in self.creators
                    if db_creator not in self.creators.keys():
                        self.creators[db_creator] = \
                            helpers.dbpedia_2_wikidata(db_creator)
                    creator_id = self.creators[db_creator]
        except KeyError:
            return

        if creator_id:
            self.set_creator(painting_item,
                             self.make_europeana_reference(painting),
                             creator_q=creator_id)

    def add_image_claim(self, painting_item, uri):
        """Add a image/P18 claim if exactly one image is found on Commons.

        Uses the nationalmuseum.se uri to search for matches on Commons.
        Adds a claim only if a unique hit is found.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        images = self.file_from_external_link(uri)
        if len(images) > 1:  # for now don't want to choose the appropriate one
            pywikibot.output('Found multiple matching images for %s' %
                             painting_item)
            for image in images:
                pywikibot.output(u'\t%s' % image)
        elif len(images) == 1:
            self.wd.addNewClaim(u'P18',
                                WD.Statement(images[0]),
                                painting_item,
                                self.make_commons_reference())

    def add_europeana_claim(self, painting_item, painting):
        """Add a Europeana ID/P727 claim.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        europeana_prop = u'P727'
        europeana_id = painting['object']['about'].lstrip('/')

        # abort if conflicting info
        if europeana_prop in painting_item.claims and \
                not self.wd.has_claim(europeana_prop, europeana_id,
                                      painting_item):
            pywikibot.output(u'%s has conflicting %s. Expected %s' %
                             (painting_item, europeana_prop, europeana_id))
            return

        self.wd.addNewClaim(europeana_prop,
                            WD.Statement(europeana_id),
                            painting_item,
                            self.make_europeana_reference(painting))

    def add_instanceof_claim(self, painting_item, painting_id, painting):
        """Add an instance_of/P31 claim.

        Instance_of is always painting or icon while working on the
        paintings collection.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        dcformat_item = self.wd.QtoItemPage(PAINTING_Q)  # painting
        if painting_id.split(' ')[0] == 'NMI':
            dcformat_item = self.wd.QtoItemPage(ICON_Q)  # icon

        self.wd.addNewClaim(u'P31',
                            WD.Statement(dcformat_item),
                            painting_item,
                            self.make_europeana_reference(painting))

    @staticmethod
    def is_miniature(painting):
        """Determine if the painting is a miniature.

        @param painting: information object for the painting
        @type painting: dict
        @rtype bool
        """
        for concept in painting['object']['concepts']:
            if concept[u'about'] == MINIATURE_URL:
                # pywikibot.output(u'Skipping miniature')
                return True
        return False

    def create_existing_painting(self, painting, painting_id):
        """Add base info to an existing paining.

        Adds the same info as would have been added had it been created
        with create_new_painting()

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @return: the created painting item
        @rtype: pywikibot.ItemPage
        """
        painting_item = self.wd.QtoItemPage(self.painting_ids.get(painting_id))

        # check label
        data = painting_item.get()
        labels = make_labels(painting)
        new_labels = find_new_values(data, labels, 'labels')
        if new_labels:
            pywikibot.output('Adding label to %s' % painting_item.title())
            painting_item.editLabels(new_labels)

        # check description
        descriptions = make_descriptions(painting)
        if descriptions:
            new_descr = find_new_values(data, descriptions, 'descriptions')
            if new_descr:
                pywikibot.output('Adding description to %s' %
                                 painting_item.title())
                painting_item.editDescriptions(new_descr)

        return painting_item

    def create_new_painting(self, painting, painting_id, europeana_url, uri):
        """Create a new painting item and return it.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param europeana_url: reference url for Europeana
        @type europeana_url: str
        @param uri: reference uri at nationalmuseum.se
        @type uri: str
        @return: the created painting item
        @rtype: pywikibot.ItemPage
        """
        data = {'labels': {},
                'descriptions': {}}

        data['labels'] = make_labels(painting)
        data['descriptions'] = make_descriptions(painting)
        if not data['descriptions']:
            return

        # print data
        # create new empty item and request Q-number
        summary = u'%s: Creating new item with data from %s' % (EDIT_SUMMARY,
                                                                europeana_url)
        painting_item = None
        try:
            painting_item = self.wd.make_new_item(data, summary)
        except pywikibot.data.api.APIError as e:
            if e.code == u'modification-failed':
                # disambiguate and try again;
                # label/description clash with an existing item
                for lang, content in data['descriptions'].iteritems():
                    disambiguation = content['value'] + u' (%s)' % painting_id
                    data['descriptions'][lang]['value'] = disambiguation
                try:
                    painting_item = self.wd.make_new_item(data, summary)
                except pywikibot.data.api.APIError as e:
                    if e.code == u'modification-failed':
                        pywikibot.output(u'modification-failed error: '
                                         u'skipping %s' % uri)
                        return
                    else:
                        raise pywikibot.Error(u'Error during item creation: '
                                              u'%s' % e)
            else:
                raise pywikibot.Error(u'Error during item creation: %s' % e)

        return painting_item

    def add_natmus_id(self, painting_item, obj_id, uri):
        """Add a natmus_painting_id/P2539 claim.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        self.wd.addNewClaim(u'P2539',
                            WD.Statement(obj_id),
                            painting_item,
                            self.make_url_reference(uri))

    def add_natmus_creators(self, painting_item, obj_id, uri):
        """Add creator/P170 claim(s) based on the database dump info.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        if obj_id not in self.creator_dump.keys():
            return

        # each artwork may have multiple artists,
        # which must all be on wikidata
        for artist_id in self.creator_dump[obj_id].keys():
            if artist_id not in self.artist_ids.keys():
                self.logger('Artist not found on wikidata: %s' % artist_id)
                return

        dump_entry = self.creator_dump[obj_id]
        if len(dump_entry) == 1:
            artist_entry = dump_entry.iteritems().next()
            self.add_singel_natmus_creator(painting_item, artist_entry, uri)
        elif len(dump_entry) == 2:
            # self.add_double_natmus_creator(painting_item, dump_entry, uri)
            # skipping until duplication issue has been solved
            pass
        else:
            # for now avoid any entries with more creators
            return

    # NOTE: "singel" misspelling kept -- renaming would break callers
    def add_singel_natmus_creator(self, painting_item, artist, uri):
        u"""Add a simple creator/P170 claim based on the database dump info.

        Handles cases with only a single identified creator. Either
        * Known creator
        * Unknown/uncertain creator somehow related to a known person
        where creator is someone whose function is in artist_labels.

        For Forgery/After work by the bot needs to be aware of both parties,
        and both must exist on Wikidata

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param artist: the dump entry for the artist
        @type artist: tuple (artist_id, artist_info)
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        # Swedish role label -> qualifier property for anonymous relations
        anonymous_combos = {
            u'Tillskriven': 'P1773',
            u'Hennes ateljé': 'P1774',
            u'Hans ateljé': 'P1774',
            u'Hennes skola': 'P1780',
            u'Hans skola': 'P1780',
            u'Hennes art': 'P1777',
            u'Hans art': 'P1777',
        }
        artist_labels = (u'Konstnär', u'Mästare', u'Utförd av')

        artist_id, artist_info = artist
        artist_q = self.artist_ids[artist_id]

        if artist_info.get('OkuBeschreibungS') or \
                artist_info.get('OkuValidierungS'):
            # this always indicates some special case which we cannot handle
            # for now
            return

        if artist_info.get('OkuFunktionS') and \
                artist_info.get('OkuFunktionS') in artist_labels:
            if len(artist_info.keys()) == 1:
                # i.e. all other are empty
                self.set_creator(painting_item,
                                 self.make_url_reference(uri),
                                 creator_q=artist_q)
            elif artist_info.get('OkuArtS') in anonymous_combos.keys() and \
                    len(artist_info.keys()) == 2:
                # anonymous but attributed to the artist
                related_info = {
                    'P': anonymous_combos[artist_info.get('OkuArtS')],
                    'itis': self.wd.QtoItemPage(artist_q)
                }
                self.set_creator(painting_item,
                                 self.make_url_reference(uri),
                                 related_info=related_info)
        elif not artist_info.get('OkuFunktionS') and artist_id == '1':
            # this is the special case of a completly unknown creator
            self.set_creator(painting_item, self.make_url_reference(uri))

    def add_double_natmus_creator(self, painting_item, artists, uri):
        u"""Add a comlex creator/P170 claim based on the database dump info.

        Handles cases with two identified creators in a relation along
        the lines of "Painting/Forgery by X after a work by Y"

        The logic is:
        OkuFunktionS in derived_combos -> OkuKueID = creator of original
        OkuFunktionS in artist_labels -> OkuKueID = creator of derivative

        @param artists: the dump entries for the artists
        @type artists: dict of {artist_id: artist_info}
        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        # Swedish relation label -> property linking to the original work
        derived_combos = {
            u'Kopia efter': 'P1877',
            u'Efter': 'P1877',
            u'Förfalskning efter': 'P1778',
        }
        artist_labels = (u'Konstnär', u'Utförd av')

        # set up targets
        original = None
        derivative = None
        relation = None

        for artist in artists.iteritems():
            artist_id, artist_info = artist
            if artist_info.get('OkuBeschreibungS') or \
                    artist_info.get('OkuValidierungS'):
                # this indicates some special case which we cannot handle
                # for now
                return
            if artist_info.get('OkuFunktionS') and \
                    len(artist_info.keys()) == 1:
                # cannot deal with OkuArtS
                if artist_info.get('OkuFunktionS') in artist_labels:
                    derivative = artist
                elif artist_info.get('OkuFunktionS') in derived_combos.keys():
                    original = artist
                    relation = derived_combos[artist_info.get('OkuFunktionS')]

        # verify that both roles were filled
        if any(creator is None for creator in (original, derivative)):
            return

        # construct info and set
        original_q = self.artist_ids[original[0]]
        derivative_q = self.artist_ids[derivative[0]]
        related_info = {'P': relation,
                        'itis': self.wd.QtoItemPage(original_q)}
        self.set_creator(painting_item,
                         self.make_url_reference(uri),
                         creator_q=derivative_q,
                         related_info=related_info)

    def set_creator(self, target_item, reference,
                    creator_q=None, related_info=None):
        """Set a creator/P170 claim for a creator or creator combo.

        Allows for simple claims as well as more complex
        "in the manner of" etc.

        @param target_item: item to which claim is added
        @type target_item: pywikibot.ItemPage
        @param reference: the reference for the statment
        @type reference: WD.Reference
        @param related_info: related info as a dict with P/itis pairs
        @type related_info: dict
        @param creator_q: the Q-id of the creator; defaults to the
            anonymous creator item
        @type creator_q: str
        """
        creator_q = creator_q or ANON_Q
        creator_statement = WD.Statement(self.wd.QtoItemPage(creator_q))

        # set any related qualifiers
        if related_info:
            creator_statement.addQualifier(
                WD.Qualifier(
                    P=related_info['P'],
                    itis=related_info['itis']))

        # set claim
        self.wd.addNewClaim(u'P170', creator_statement,
                            target_item, reference)

    def add_inventory_and_collection_claim(self, painting_item, painting_id,
                                           painting, uri):
        """Add an inventory_no, with qualifier, and a collection/P195 claim.

        This will add the collection qualifier to any matching claim
        missing it.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        nationalmuseum_item = self.wd.QtoItemPage(INSTITUTION_Q)
        collection_p = u'P195'

        # abort if conflicting info
        if self.painting_id_prop in painting_item.claims and \
                not self.wd.has_claim(self.painting_id_prop, painting_id,
                                      painting_item):
            pywikibot.output(
                u'%s has conflicting inv. no (%s). Expected %s' %
                (painting_item, self.painting_id_prop, painting_id))
            return

        # add inventory number with collection
        self.wd.addNewClaim(
            self.painting_id_prop,
            WD.Statement(painting_id).addQualifier(
                WD.Qualifier(
                    P=collection_p,
                    itis=nationalmuseum_item),
                force=True),
            painting_item,
            self.make_url_reference(uri))

        # add collection (or subcollection)
        subcol = self.prefix_map[painting_id.split(' ')[0]]['subcol']
        collection_item = nationalmuseum_item
        if subcol is not None:
            collection_item = self.wd.QtoItemPage(subcol)

        self.wd.addNewClaim(
            collection_p,
            WD.Statement(collection_item),
            painting_item,
            self.make_europeana_reference(painting))

    def make_europeana_reference(self, painting):
        """Make a Reference object with a Europeana retrieval url and
        today's date.

        @param painting: information object for the painting
        @type painting: dict
        @rtype: WD.Reference
        """
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
            painting['object']['about']
        return self.make_url_reference(europeana_url)

    def make_url_reference(self, uri):
        """Make a Reference object with a retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        date = helpers.today_as_WbTime()
        ref = WD.Reference(source_test=self.wd.make_simple_claim(u'P854',
                                                                 uri),
                           source_notest=self.wd.make_simple_claim(
                               u'P813', date))
        return ref

    def make_commons_reference(self):
        """Make a Reference object saying imported from Wikimedia Commons."""
        commons_item = self.wd.QtoItemPage(COMMONS_Q)
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(
                u'P143', commons_item))  # imported from
        return ref

    def file_from_external_link(self, uri):
        """Identify files from a Nationalmuseum uri.

        Hits are any files containing a link to the eMuseumPlus uri.

        @param uri: reference url on nationalmuseum.se
        @type uri: str
        @return: matching images
        @rtype: list
        """
        images = []
        uri = uri.split('://')[1]
        objgen = pagegenerators.LinksearchPageGenerator(uri, namespaces=[6],
                                                        site=self.commons)
        for page in objgen:
            images.append(pywikibot.FilePage(self.commons, page.title()))

        # I have no clue how the above results in duplicates, but it does so...
        images = list(set(images))

        return images

    def most_missed_creators(self, cache_max_age=0):
        """Produce list of most frequent, but unlinked, creators.

        Query WDQ for all objects in the collection missing an artist
        then put together a top-list for most desired creator
        """
        expected_items = []
        query = u'CLAIM[195:%s] AND NOCLAIM[170]' % \
            ',195:'.join(self.collections)  # collection
        wd_queryset = wdquery.QuerySet(query)

        wd_query = wdquery.WikidataQuery(cacheMaxAge=cache_max_age)
        data = wd_query.query(wd_queryset)

        # WDQ reports success as status.error == 'OK'
        if data.get('status').get('error') == 'OK':
            expected_items = data.get('items')

        creator_dict = {}
        counter = 0
        for q_val in expected_items:
            q_item = self.wd.QtoItemPage(q_val)
            data = q_item.get()
            claims = data.get('claims')
            if u'P170' in claims:
                continue
            descr = data.get('descriptions').get('en')
            if descr and descr.startswith(u'painting by '):
                creator = descr[len(u'painting by '):]
                if '(' in creator:  # to get rid of disambiguation addition
                    creator = creator[:creator.find('(')].strip()
                if creator in creator_dict.keys():
                    creator_dict[creator] += 1
                else:
                    creator_dict[creator] = 1
                counter += 1

        pywikibot.output(u'Found %d mentions of %d creators' %
                         (counter, len(creator_dict)))
        # output
        f = codecs.open(u'creatorHitlist.csv', 'w', 'utf-8')
        for k, v in creator_dict.iteritems():
            f.write(u'%d|%s\n' % (v, k))
        f.close()

    def logger(self, text):
        """Append text to logfile.

        @param text: text to output
        @type text: str
        """
        self.log.write(u'%s\n' % text)
        self.log.flush()  # because shit tends to crash