class PaintingsImageBot:
    """Bot to enrich, and create, for items about paintings on Wikidata."""

    def __init__(self, dict_generator, people_items):
        """Initialise the bot.

        @param dict_generator: generator yielding
            (lido_data, qid, commons_file) tuples
        @param people_items: dict mapping external person ids to Wikidata
            Q-ids, used by add_depicted_claim()
        """
        self.people_items = people_items
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.wd = WD(self.repo, edit_summary=EDIT_SUMMARY)

        # Set log file (appended to, next to this script)
        out_dir = path.join(path.split(__file__)[0])
        log_filename = path.join(out_dir, u'PaintingsImageBot.log')
        self.log = codecs.open(log_filename, 'a', 'utf-8')

    def run(self):
        """Start the robot."""
        # NOTE(review): self.creators is initialised here but never read in
        # this class — presumably a leftover; confirm before removing.
        self.creators = {}

        for painting_data in self.generator:
            # isolate ids
            lido_data, qid, commons_file = painting_data
            painting_item = self.wd.QtoItemPage(qid)
            self.process_painting(painting_item, lido_data, commons_file)

    def process_painting(self, item, lido_data, commons_file):
        """Process a single painting.

        Adds labels plus image, depicted, inception and dimension claims,
        each sourced with a reference to the object's collection page.
        """
        item.exists()  # load the item
        obj_id_ref = self.make_obj_id_ref(lido_data.get('obj_id'))
        # lido_ref = self.make_lido_ref(lido_data)  # make a reference object
        self.check_and_add_labels(item, lido_data)
        self.add_image_claim(item, commons_file, obj_id_ref)
        self.add_depicted_claim(item, lido_data, obj_id_ref)
        self.add_date_claim(item, lido_data, obj_id_ref)
        self.add_dimension_claims(item, lido_data, obj_id_ref)

    def add_dimension_claims(self, item, lido_data, ref):
        """
        Add height/P2048 and width/P2049 claims.

        Only add non-framed measurements with just height and width.

        @param item: the item to add the claims to
        @param lido_data: the parsed LIDO data for the painting
        @param ref: WD.Reference to source the claims with
        """
        height_p = u'P2048'
        width_p = u'P2049'
        # diameter_p = u'P2386'
        # thickness_p = u'P2610'
        dimensions = lido_data.get('measurements').get('_')  # non-framed
        if not dimensions or not dimensions.get('unit'):
            return None
        elif not dimensions.get('width') or not dimensions.get('height') \
                or dimensions.get('depth'):
            # skip complicated cases for now
            return None
        elif not helpers.get_unit_q(dimensions.get('unit')):
            pywikibot.output(u'"%s" is an unmapped unit'
                             % dimensions.get('unit'))
            return None

        # prepare all parts before adding claims
        unit = helpers.get_unit_q(dimensions.get('unit'))
        # unit = self.wd.QtoItemPage(unit)
        unit = entity_url_hack(unit)
        height = pywikibot.WbQuantity(
            dimensions.get('height'),
            # unit=unit,
            entity=unit,
            site=self.wd.repo)
        width = pywikibot.WbQuantity(
            dimensions.get('width'),
            # unit=unit,
            entity=unit,
            site=self.wd.repo)

        # make claims
        self.wd.addNewClaim(height_p, WD.Statement(height), item, ref)
        self.wd.addNewClaim(width_p, WD.Statement(width), item, ref)

    def add_date_claim(self, item, lido_data, ref):
        """
        Add an inception/P571 claim.

        Only adds the claim if it's an exact year.

        @param item: the item to add the claim to
        @param lido_data: the parsed LIDO data for the painting
        @param ref: WD.Reference to source the claim with
        """
        prop = u'P571'
        creation_date = lido_data.get('creation_date')
        wb_date = None
        if not creation_date:
            return None

        # exact date: only when the earliest and latest bounds coincide
        if creation_date.get('earliest') and \
                creation_date.get('earliest') == creation_date.get('latest'):
            wb_date = helpers.iso_to_WbTime(creation_date.get('earliest'))

        # make claim
        if wb_date:
            self.wd.addNewClaim(prop, WD.Statement(wb_date), item, ref)

    def add_depicted_claim(self, item, lido_data, ref):
        """Add a depicted/P180.

        One claim per subject whose external id is known in
        self.people_items.
        """
        prop = u'P180'
        if not lido_data.get('subjects'):
            return None
        for subject in lido_data.get('subjects'):
            nsid = subject.get(u'other_id')
            if nsid in self.people_items:
                person_item = self.wd.QtoItemPage(self.people_items[nsid])
                self.wd.addNewClaim(prop, WD.Statement(person_item),
                                    item, ref)

    def add_image_claim(self, item, commons_file, ref):
        """
        Add a image/P18 claim.

        Only adds it if there is None already.
        If one exists output to log.

        @param item: the item to add the claim to
        @param commons_file: filename of the image on Wikimedia Commons
        @param ref: WD.Reference to source the claim with
        """
        prop = u'P18'
        if not commons_file:
            return
        file_page = pywikibot.FilePage(pywikibot.Site('commons', 'commons'),
                                       commons_file)
        # check if another image is already used; if so only log the clash
        if prop in item.claims and \
                not self.wd.has_claim(prop, file_page, item):
            self.log.write(
                u"%s already contains image claim: %s -> %s\n"
                % (item.title(),
                   item.claims.get(prop)[0].getTarget().title(),
                   file_page.title()))
        else:
            # no image yet, or the same image (addNewClaim handles the
            # duplicate case)
            self.wd.addNewClaim(prop, WD.Statement(file_page), item, ref)

    def check_and_add_labels(self, item, lido_data):
        """Process the title field add to the item if needed.

        Titles keyed '_' (no language) are skipped. API errors are logged
        rather than raised so one bad label does not abort the run.
        """
        if not lido_data.get('title'):
            return
        for lang, value in lido_data.get('title').iteritems():
            if lang == '_':
                continue
            try:
                self.wd.addLabelOrAlias(lang, value, item,
                                        caseSensitive=False)
            except pywikibot.data.api.APIError as e:
                self.log.write(u"%s: had an error: %s\n" % (item.title(), e))

    def make_obj_id_ref(self, obj_id):
        """Make a reference object pointing to the objects collection page.

        @param obj_id: the eMuseumPlus object id
        @rtype: WD.Reference
        """
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?' \
              u'service=ExternalInterface&module=collection&' \
              u'objectId=%s&viewType=detailView' % obj_id
        return self.make_url_reference(uri)

    def make_url_reference(self, uri):
        """
        Make a Reference object with a retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        date = helpers.today_as_WbTime()
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(u'P854', uri),
            source_notest=self.wd.make_simple_claim(
                u'P813', date))
        return ref

    # Not implemented due to uncertainty on referencing individual xml files
    def make_lido_ref(self, lido_data):
        """
        Make a Reference object for the dataset.

        Contains 4 parts:
        * P248: Stated in <the Nationalmuseum dataset>
        * P577: Publication date <from creation date of the document>
        * P854: Reference url <using the input url>
        * P813: Retrieval date <current date>
        """
        # deliberately disabled (see comment above): bail out before doing
        # anything if this is ever called
        exit()
        # P248: Nationalmuseum dataset
        xml_file = lido_data.get('source_file')
        date = helpers.today_as_WbTime()
        pub_date = helpers.iso_to_WbTime(u'2016-09-30')
        zip_url = u'https://github.com/NationalmuseumSWE/WikidataCollection/' \
                  u'blob/master/valid_items_transform_1677.tgz'
        # NOTE(review): u'P?' is a placeholder property id — must be decided
        # before this method can be enabled.
        ref = WD.Reference(
            source_test=[
                self.wd.make_simple_claim(u'P854', zip_url),
                self.wd.make_simple_claim(u'P577', pub_date),
                self.wd.make_simple_claim(u'P?', xml_file),
            ],
            source_notest=self.wd.make_simple_claim(
                u'P813', date))
        return ref
class KulturnavBot(object):
    """Bot to enrich and create information on Wikidata from KulturNav info."""

    EDIT_SUMMARY = 'import using #Kulturnav data'
    # Property/Q-ids below are stored WITHOUT the P/Q prefix
    KULTURNAV_ID_P = '1248'
    GEONAMES_ID_P = '1566'
    SWE_KOMMUNKOD_P = '525'
    SWE_COUNTYKOD_P = '507'
    PLACE_P = '276'
    TIME_P = '585'  # date
    DATASET_Q = None
    DISAMBIG_Q = '4167410'
    IS_A_P = '31'
    CATALOG_P = '972'
    DATASET_ID = None
    ENTITY_TYPE = None
    MAP_TAG = None
    COUNTRIES = []  # a list of country Q's
    ADMIN_UNITS = []  # a list of municipality+county Q's
    locations = {}  # a dict of uuid to wikidata location matches
    current_uuid = ''  # for debugging

    def __init__(self, dictGenerator, cache_max_age, verbose=False):
        """
        Initialise the bot.

        Arguments:
        * generator - A generator that yields Dict objects.
        """
        self.generator = dictGenerator
        self.repo = pywikibot.Site().data_repository()
        self.cutoff = None
        self.verbose = verbose
        self.require_wikidata = True
        self.cache_max_age = cache_max_age
        # trigger wdq query
        self.itemIds = helpers.fill_cache(self.KULTURNAV_ID_P,
                                          cache_max_age=cache_max_age)

        # set up WikidataStuff instance
        self.wd = WD(self.repo, self.EDIT_SUMMARY)

        # load lists
        self.COUNTRIES = wdqsLookup.wdq_to_wdqs(u'TREE[6256][][31]')
        self.ADMIN_UNITS = wdqsLookup.wdq_to_wdqs(u'TREE[15284][][31]')

    @classmethod
    def set_variables(cls, dataset_q=None, dataset_id=None, entity_type=None,
                      map_tag=None, edit_summary=None):
        """Override any class variables.

        Used when command line arguments affect which type of run to do.

        @param dataset_q: the Q-id of the dataset
        @type dataset_q: str
        @param dataset_id: the uuid of the dataset
        @type dataset_id: str
        @param entity_type: the entity type to provide for the search API
        @type entity_type: str
        @param map_tag: the map_tag to use in the search API to find
            wikidata matches
        @type map_tag: str
        @param edit_summary: the edit_summary to use
        @type edit_summary: str
        """
        cls.DATASET_Q = dataset_q or cls.DATASET_Q
        cls.DATASET_ID = dataset_id or cls.DATASET_ID
        cls.ENTITY_TYPE = entity_type or cls.ENTITY_TYPE
        cls.MAP_TAG = map_tag or cls.MAP_TAG
        cls.EDIT_SUMMARY = edit_summary or cls.EDIT_SUMMARY

    def run(self):
        """Start the robot."""
        # subclasses implement this, typically by calling runLayout()
        raise NotImplementedError("run() is not implemented in the base bot.")

    def runLayout(self, datasetRules, datasetProtoclaims, datasetSanityTest,
                  label, shuffle):
        """
        Execute the basic layout of a run.

        It should be called for a dataset-specific run which sets the
        parameters.

        param datasetRules: a dict of additional Rules or values to look for
        param datasetProtoclaims: a function for populating protoclaims
        param datasetSanityTest: a function which must return true for
            results to be written to Wikidata
        param label: the key in values to be used for label/alias.
            set to None to skip addNames()
        param shuffle: whether name/label/alias is shuffled or not
            i.e. if name = last, first
        """
        count = 0
        for hit in self.generator:
            # print count, self.cutoff
            if self.cutoff and count >= self.cutoff:
                break
            # some type of feedback
            if count % 100 == 0 and count > 0:
                pywikibot.output('%d entries handled...'
                                 % count)
            # Required rules/values to search for
            rules = {
                u'identifier': None,
                u'modified': None,
                u'seeAlso': None,
                u'sameAs': None,
                u'exactMatch': None,
                # not expected
                u'wikidata': None,
                u'libris-id': None,
                u'viaf-id': None,
                u'getty_aat': None,
                u'ulan': None
            }
            rules.update(datasetRules)

            # put together empty dict of values then populate
            values = {}
            for k in rules.keys():
                values[k] = None

            if not self.populateValues(values, rules, hit):
                # continue with next hit if problem was encountered
                continue

            # find the matching wikidata item
            hitItem = self.wikidataMatch(values)
            self.current_uuid = values['identifier']
            # @todo: self.current_protoclaims  # allows these to
            #        be accessed more easily

            # convert values to potential claims
            protoclaims = datasetProtoclaims(self, values)
            self.make_base_protoclaims(values, protoclaims)

            # output info for testing
            if self.verbose:
                pywikibot.output(values)
                pywikibot.output(protoclaims)
                pywikibot.output(hitItem)

            # Add information if a match was found
            if hitItem and hitItem.exists():
                # if redirect then get target instead

                # make sure it passes the sanityTests
                if not self.sanityTest(hitItem):
                    continue
                if not datasetSanityTest(self, hitItem):
                    continue

                # add name as label/alias
                if label is not None:
                    self.addNames(values[label], hitItem, shuffle=shuffle)

                # get the "last modified" timestamp and construct a Reference
                date = helpers.iso_to_WbTime(values[u'modified'])
                ref = self.make_ref(date)

                # add each property (if new) and source it
                self.addProperties(protoclaims, hitItem, ref)

            # allow for limited runs
            count += 1

        # done
        pywikibot.output(u'Handled %d entries' % count)

    def populateValues(self, values, rules, hit):
        """
        Populate values and check results given a hit.

        Given a list of values and a kulturnav hit, populate the values
        and check if result is problem free.

        @todo: raise Error instead of using problemFree solution

        param values: dict with keys and every value as None
        param rules: a dict with keys and values either:
            None: the exact key is present in hit and its value is wanted
            a Rule: according to the class above
        param hit: a kulturnav entry
        return bool problemFree
        """
        ids = {}
        problemFree = True
        # first pass: index @graph entries by @id so Rules can follow
        # via-id references
        for entries in hit[u'@graph']:
            # populate ids for viaId rules
            if '@id' in entries.keys():
                if entries['@id'] in ids.keys():
                    pywikibot.output('Non-unique viaID key: \n%s\n%s'
                                     % (entries, ids[entries['@id']]))
                ids[entries['@id']] = entries

        for entries in hit[u'@graph']:
            # handle rules
            for key, rule in rules.iteritems():
                val = None
                if rule is None:
                    if key in entries.keys():
                        val = entries[key]
                elif isinstance(rule, Rule):
                    val = rule.resolve(entries, ids)

                # test and register found value
                if val is not None:
                    if values[key] is None:
                        values[key] = val
                    else:
                        pywikibot.output(u'duplicate entries for %s' % key)
                        problemFree = False

        # the minimum which must have been identified
        if values[u'identifier'] is None:
            raise pywikibot.Error(u'Could not isolate the identifier from '
                                  u'the KulturNav object! JSON layout must '
                                  u'have changed. Crashing!')

        # dig into sameAs/exactMatch and seeAlso
        KulturnavBot.set_sameas_values(values)

        # only look at seeAlso if we found no Wikidata link and require one
        if self.require_wikidata and \
                (not values[u'wikidata'] and values[u'seeAlso']):
            values[u'seeAlso'] = helpers.listify(values[u'seeAlso'])
            for sa in values[u'seeAlso']:
                if u'wikipedia' in sa:
                    pywikibot.output(u'Found a Wikipedia link but no '
                                     u'Wikidata link: %s %s' %
                                     (sa, values[u'identifier']))
            # missing required wikidata link is always a problem here
            problemFree = False

        if not problemFree:
            pywikibot.output(u'Found an issue with %s (%s), skipping'
                             % (values['identifier'], values['wikidata']))
        return problemFree

    def sanityTest(self, hitItem):
        """
        Execute generic sanitytest which should be run independent on dataset.

        return bool
        """
        # reject disambiguation pages
        return self.withoutClaimTest(hitItem,
                                     self.IS_A_P,
                                     self.DISAMBIG_Q,
                                     u'disambiguation page')

    def withoutClaimTest(self, hitItem, P, Q, descr):
        """
        Execute base test that an item does not contain a particular claim.

        param hitItem: item to check
        param P: the property to look for
        param Q: the Q claim to look for
        param descr: a descriptive text
        return bool
        """
        # normalise to a P-prefixed property id
        P = u'P%s' % P.lstrip('P')
        testItem = self.wd.QtoItemPage(Q)
        if self.wd.has_claim(P, testItem, hitItem):
            pywikibot.output(u'%s is matched to %s, '
                             u'FIXIT' % (hitItem.title(), descr))
            return False
        else:
            return True

    def withClaimTest(self, hitItem, P, Q, descr, orNone=True):
        """
        Execute base test that an item contains a certain claim.

        param hitItem: item to check
        param P: the property to look for
        param Q: (list) of Q claim to look for
        param descr: a descriptive text
        param orNone: if complete absence of the Property is also ok
        return bool
        """
        # normalise to a P-prefixed property id
        P = u'P%s' % P.lstrip('P')
        Q = helpers.listify(Q)
        testItems = []
        for q in Q:
            testItems.append(self.wd.QtoItemPage(q))
        # check claims
        if P in hitItem.claims.keys():
            # for/else: the else branch runs only when no testItem matched
            for testItem in testItems:
                if self.wd.has_claim(P, testItem, hitItem):
                    return True
            else:
                pywikibot.output(u'%s is identified as something other '
                                 u'than a %s. Check!'
                                 % (hitItem.title(), descr))
                return False
        elif orNone:  # no P claim
            return True

    @staticmethod
    def set_sameas_values(values):
        """Isolate external identifiers through sameAs and exactMatch.

        @param values: All extracted values
        @type values: dict
        """
        # merge sameAs and exactMatch
        match = helpers.bundle_values(
            [values[u'sameAs'], values[u'exactMatch']]) or []

        # dig into sameAs/exactMatch and seeAlso
        # (the id keys are only set if the caller asked for them via rules)
        for sa in match:
            if u'wikidata' in sa:
                values[u'wikidata'] = sa.split('/')[-1]
            elif u'libris-id' in values.keys() and \
                    u'libris.kb.se/auth/' in sa:
                values[u'libris-id'] = sa.split('/')[-1]
            elif u'viaf-id' in values.keys() and \
                    u'viaf.org/viaf/' in sa:
                values[u'viaf-id'] = sa.split('/')[-1]
            elif u'getty_aat' in values.keys() and \
                    u'vocab.getty.edu/aat/' in sa:
                values[u'getty_aat'] = sa.split('/')[-1]
            elif u'ulan' in values.keys() and \
                    u'vocab.getty.edu/ulan/' in sa:
                values[u'ulan'] = sa.split('/')[-1]

    def make_base_protoclaims(self, values, protoclaims):
        """Construct the protoclaims common for all KulturnavBots.

        Adds the claim to the protoclaims dict.

        @param values: the values extracted using the rules
        @type values: dict
        @param protoclaims: the dict of claims to add
        @type protoclaims: dict
        """
        # kulturnav protoclaim incl. qualifier
        protoclaims[u'P%s' % self.KULTURNAV_ID_P] = \
            WD.Statement(values[u'identifier']).addQualifier(
                WD.Qualifier(
                    P=self.CATALOG_P,
                    itis=self.wd.QtoItemPage(self.DATASET_Q)),
                force=True)

        # authority control protoclaims
        if values.get(u'libris-id'):
            protoclaims[u'P906'] = WD.Statement(values[u'libris-id'])
        if values.get(u'viaf-id'):
            protoclaims[u'P214'] = WD.Statement(values[u'viaf-id'])
        if values.get(u'getty_aat'):
            protoclaims[u'P1014'] = WD.Statement(values[u'getty_aat'])
        if values.get(u'ulan'):
            protoclaims[u'P245'] = WD.Statement(values[u'ulan'])

    def wikidataMatch(self, values):
        """
        Find the matching wikidata item.

        Checks Wikidata first, then kulturNav.

        return ItemPage|None the matching item
        """
        if values[u'identifier'] in self.itemIds:
            hitItemTitle = u'Q%s' % \
                self.itemIds.get(values[u'identifier'])

            if not values[u'wikidata'] and not self.require_wikidata:
                # i.e. uuid has been supplied manually and exists on wikidata
                pass
            elif values[u'wikidata'] != hitItemTitle:
                # this may be caused by either being a redirect
                wd = self.wd.QtoItemPage(values[u'wikidata'])
                wi = self.wd.QtoItemPage(hitItemTitle)
                if wd.isRedirectPage() and wd.getRedirectTarget() == wi:
                    pass
                elif wi.isRedirectPage() and wi.getRedirectTarget() == wd:
                    pass
                else:
                    pywikibot.output(u'Identifier missmatch (skipping): '
                                     u'%s, %s, %s' % (values[u'identifier'],
                                                      values[u'wikidata'],
                                                      hitItemTitle))
                    return None
        elif values[u'wikidata']:
            hitItemTitle = values[u'wikidata']
        else:
            # no match found
            return None

        # create ItemPage, bypassing any redirect
        hitItem = self.wd.bypassRedirect(
            self.wd.QtoItemPage(hitItemTitle))
        # in case of redirect
        values[u'wikidata'] = hitItem.title()

        return hitItem

    def addNames(self, names, hitItem, shuffle=False):
        """
        Prepare a nameObj or a list of such for add_label_or_alias().

        param shuffle: bool if name order is last, first
            then this creates a local rearranged copy
        """
        if names:
            if shuffle:
                namelist = []
                if isinstance(names, dict):
                    s = KulturnavBot.shuffle_names(names)
                    if s is not None:
                        namelist.append(s)
                elif isinstance(names, list):
                    for n in names:
                        s = KulturnavBot.shuffle_names(n)
                        if s is not None:
                            namelist.append(s)
                else:
                    pywikibot.output(u'unexpectedly formatted name'
                                     u'object: %s' % names)
                if namelist:
                    self.add_label_or_alias(namelist, hitItem)
            else:
                self.add_label_or_alias(names, hitItem)

    def addProperties(self, protoclaims, hitItem, ref):
        """
        Add each property (if new) and source it.

        param protoclaims: a dict of claims with a
            key: Prop number
            val: Statement|list of Statements
        param hititem: the target entity
        param ref: WD.Reference
        """
        for pcprop, pcvalue in protoclaims.iteritems():
            if pcvalue:
                if isinstance(pcvalue, list):
                    pcvalue = set(pcvalue)  # eliminate potential duplicates
                    for val in pcvalue:
                        # check if None or a Statement(None)
                        if (val is not None) and (not val.isNone()):
                            self.wd.addNewClaim(pcprop, val, hitItem, ref)
                            # reload item so that next call is aware of changes
                            hitItem = self.wd.QtoItemPage(hitItem.title())
                            hitItem.exists()
                elif not pcvalue.isNone():
                    self.wd.addNewClaim(pcprop, pcvalue, hitItem, ref)

    # KulturNav specific functions
    def dbpedia2Wikidata(self, item):
        """
        Convert dbpedia reference to the equivalent Wikidata item, if present.

        param item: dict with @language, @value keys
        return pywikibot.ItemPage|None
        """
        # NOTE(review): foobar() looks like a debugging hook defined
        # elsewhere in this file — confirm its purpose.
        if KulturnavBot.foobar(item):
            return
        if not all(x in item.keys() for x in (u'@value', u'@language')):
            pywikibot.output(u'invalid dbpedia entry: %s' % item)
            exit(1)

        # any site will work, this is just an example
        site = pywikibot.Site(item[u'@language'], 'wikipedia')
        page = pywikibot.Page(site, item[u'@value'])
        if page.properties().get(u'wikibase_item'):
            qNo = page.properties()[u'wikibase_item']
            return self.wd.QtoItemPage(qNo)

    def db_gender(self, value):
        """Match gender values to items.

        Note that this returns a Statement unlike most other functions

        @param value: The gender value
        @type value: str
        @return: The gender item as a statement
        @rtype: WD.Statement or None
        """
        known = {
            u'male': u'Q6581097',
            u'female': u'Q6581072',
            u'unknown': u'somevalue'  # a special case
        }
        if value not in known.keys():
            pywikibot.output(u'invalid gender entry: %s' % value)
            return

        if known[value] in (u'somevalue', u'novalue'):
            # snak-type statement rather than an item claim
            return WD.Statement(
                known[value],
                special=True)
        else:
            return WD.Statement(
                self.wd.QtoItemPage(known[value]))

    def db_name(self, name_obj, typ, limit=75):
        """Check if there is an item matching the name.

        A wrapper for helpers.match_name() to send it the relevant part
        of a nameObj.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
        @type name_obj: dict
        @param typ: The name type (either 'lastName' or 'firstName')
        @type typ: str
        @param limit: Number of hits before skipping (defaults to 75,
            ignored if onLabs)
        @type limit: int
        @return: A matching item, if any
        @rtype: pywikibot.ItemPage, or None
        """
        return helpers.match_name(
            name_obj['@value'], typ, self.wd, limit=limit)

    def location2Wikidata(self, uuid):
        """
        Get location from kulturNav uuid.

        Given a kulturNav uuid or url this checks if that contains a
        GeoNames url and, if so, connects that to a Wikidata object
        using the GEONAMES_ID_P property (if any).

        NOTE that the WDQ results may be outdated
        return pywikibot.ItemPage|None
        """
        # Check if uuid
        if not self.is_uuid(uuid):
            return None
        # Convert url to uuid
        if uuid.startswith(u'http://kulturnav.org'):
            uuid = uuid.split('/')[-1]
        # Check if already stored
        if uuid in self.locations.keys():
            if self.locations[uuid] is None:
                return None
            else:
                qNo = u'Q%d' % self.locations[uuid]
                return self.wd.QtoItemPage(qNo)

        # retrieve various sources
        # @todo: this can be more streamlined by including wdq query for
        #        geonames in that method. Possibly sharing the same
        #        "look-up and filter" mechanism for both.
        #        and then using self.locations[uuid] = self.extract... (which
        #        returns qid or None) then (after both have been processed)
        #        checking self.locations.get(uuid) before
        #        making an ItemPage
        #
        # @todo: change self.locations and self.ADMIN_UNITS to include
        #        Q prefix (and thus have the methods return that)
        geo_sources = self.get_geo_sources(uuid)
        kulturarvsdata = self.extract_kulturarvsdata_location(geo_sources)
        if kulturarvsdata:
            self.locations[uuid] = kulturarvsdata
            qNo = u'Q%d' % self.locations[uuid]
            return self.wd.QtoItemPage(qNo)

        # retrieve hit through geonames-lookup
        geonames = KulturnavBot.extract_geonames(geo_sources)
        if geonames:
            # store as a resolved hit, in case wdq yields nothing
            self.locations[uuid] = None
            wdqQuery = u'STRING[%s:"%s"]' % (self.GEONAMES_ID_P, geonames)
            wdqResult = wdqsLookup.wdq_to_wdqs(wdqQuery)
            if wdqResult and len(wdqResult) == 1:
                self.locations[uuid] = wdqResult[0]
                qNo = u'Q%d' % self.locations[uuid]
                return self.wd.QtoItemPage(qNo)

            # else:
            # go to geonames and find wikidata from there
            # add to self.locations[uuid]
            # add GEONAMES_ID_P to the identified wikidata

        # no (clean) hits
        return None

    def get_geo_sources(self, uuid):
        """Extract any geosources from a kulturNav uuid.

        Given a kulturNav uuid return the corresponding properties of
        that target which are likely to contain geosources.

        @param uuid: uuid to check
        @type uuid: str
        @return: the matching properties
        @rtype: list of dicts
        """
        # debugging
        if not self.is_uuid(uuid):
            return []

        query_url = 'http://kulturnav.org/api/%s'
        json_data = json.load(urllib2.urlopen(query_url % uuid))
        sources = []
        if json_data.get(u'properties'):
            same_as = json_data.get('properties').get('entity.sameAs')
            if same_as:
                sources += same_as
            source_uri = json_data.get('properties') \
                                  .get('superconcept.sourceUri')
            if source_uri:
                sources += source_uri
        return sources

    @staticmethod
    def extract_geonames(sources):
        """Return any geonames ID given a list of get_geo_sources().

        @param sources: output of get_geo_sources()
        @type sources: list of dicts
        @return: geonames id
        @rtype: str or None
        """
        needle = 'http://sws.geonames.org/'
        for s in sources:
            if s.get('value') and s.get('value').startswith(needle):
                return s.get('value').split('/')[-1]
        return None

    def extract_kulturarvsdata_location(self, sources):
        """Return any qids matching kulturarvsdata geo authorities.

        @param sources: output of get_geo_sources()
        @type sources: list of dicts
        @return: the matching qid (without Q-prefix)
        @rtype: str or None
        @raises pywikibot.Error
        """
        needle = u'http://kulturarvsdata.se/resurser/aukt/geo/'
        for s in sources:
            if s.get('value') and s.get('value').startswith(needle):
                s = s.get('value').split('/')[-1]
                wdq_query = None
                if s.startswith('municipality#'):
                    code = s.split('#')[-1]
                    wdq_query = u'STRING[%s:"%s"]' % (self.SWE_KOMMUNKOD_P,
                                                      code)
                elif s.startswith('county#'):
                    code = s.split('#')[-1]
                    wdq_query = u'STRING[%s:"%s"]' % (self.SWE_COUNTYKOD_P,
                                                      code)
                elif s.startswith('country#'):
                    pass  # handle via geonames instead
                elif s.startswith('parish#'):
                    pass  # no id's in wikidata
                else:
                    raise pywikibot.Error(u'Unhandled KulturarvsdataLocation '
                                          u'prefix: %s' % s)

                if wdq_query:
                    # only here if a municipality or county was found
                    wdq_result = wdqsLookup.wdq_to_wdqs(wdq_query)
                    if wdq_result and len(wdq_result) == 1:
                        self.ADMIN_UNITS.append(wdq_result[0])
                        return wdq_result[0]
        return None

    def getLocationProperty(self, item, strict=True):
        """
        Return appropriate location property for an item.

        Given an ItemPage this returns the suitable property which
        should be used to indicate its location.
        P17  - land
        P131 - within administrative unit
        P276 - place

        param item: pywikibot.ItemPage|None
        param strict: bool whether place should be returned if no land
            or admin_unit hit
        return string|None
        """
        if item is not None:
            q = int(item.title()[1:])
            if q in self.COUNTRIES:
                return u'P17'
            elif q in self.ADMIN_UNITS:
                return u'P131'
            elif not strict:
                return u'P%s' % self.PLACE_P
            elif self.verbose:
                item.exists()  # load labels before reporting
                pywikibot.output(u'Could not set location property for: '
                                 u'%s (%s)' % (item.title(),
                                               item.labels.get('sv')))
        return None

    def kulturnav2Wikidata(self, uuid):
        """Return Wikidata entity connected to a kulturNav uid or url.

        Relies on the KULTURNAV_ID_P property (if any) to get the
        connection.

        NOTE that the WDQ results may be outdated

        @param uuid: a kulturNav uuid or url
        @type uuid: str
        @return: the matching Wikidata item page
        @rtype: pywikibot.ItemPage or None
        """
        # debugging
        if not self.is_uuid(uuid):
            return None
        # Convert url to uuid
        if uuid.startswith(u'http://kulturnav.org'):
            uuid = uuid.split('/')[-1]

        if uuid in self.itemIds.keys():
            qNo = u'Q%d' % self.itemIds[uuid]
            return self.wd.QtoItemPage(qNo)
        else:
            return None

    def is_uuid(self, uuid):
        """Test if a string really is a uuid.

        @param uuid: uuid to test
        @type uuid: str
        @return: whether the test passed
        @rtype: bool
        """
        if not helpers.is_str(uuid):
            pywikibot.output(u'Not an uuid in %s: %s' %
                             (self.current_uuid, uuid))
            return False

        uuid = uuid.split('/')[-1]  # in case of url
        pattern = r'[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}' \
                  r'\-[0-9a-f]{4}\-[0-9a-f]{12}'
        m = re.search(pattern, uuid)
        if not m or m.group(0) != uuid:
            pywikibot.output(u'Not an uuid in %s: %s' %
                             (self.current_uuid, uuid))
            return False

        return True

    @staticmethod
    def shuffle_names(name_obj):
        """Detect a "Last, First" string and return as "First Last".

        A wrapper for helpers.reorder_names() to send it the relevant part
        of a name_obj.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
        @type name_obj: dict
        @return: the reordered name_obj or None if reorder_names failed
        @rtype: dict or None
        """
        name = helpers.reorder_names(name_obj['@value'])
        if name is None:
            return None
        # copy so the caller's object is not mutated
        name_obj = name_obj.copy()
        name_obj['@value'] = name
        return name_obj

    def make_ref(self, date):
        """Make a correctly formatted ref object for claims.

        Contains 4 parts:
        * P248: Stated in <the kulturnav dataset>
        * P577: Publication date <from the document>
        * P854: Reference url <using the current uuid>
        * P813: Retrieval date <current date>

        P854
        Should be in source_test (after retroactively fixing older
        references) but by being in source_notest we ensure that
        duplicate uuids don't source the statement twice.

        @param date: The "last modified" time of the document
        @type date: pywikibot.WbTime
        @return: the formatted reference
        @rtype: WD.Reference
        """
        reference_url = 'http://kulturnav.org/%s' % self.current_uuid
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(
                'P248',
                self.wd.QtoItemPage(self.DATASET_Q)),
            source_notest=[
                self.wd.make_simple_claim(
                    'P577',
                    date),
                self.wd.make_simple_claim(
                    'P854',
                    reference_url),
                self.wd.make_simple_claim(
                    'P813',
                    helpers.today_as_WbTime())
            ])
        return ref

    def add_label_or_alias(self, name_obj, item, case_sensitive=False):
        """Add a name as either a label (if none already) or an alias.

        Essentially a filter for the more generic method in WikidatStuff.

        @param name_obj: {'@language': 'xx', '@value': 'xxx'}
            or a list of such
        @type name_obj: dict or list of dict
        @param item: the item to which the label/alias should be added
        @type item: pywikibot.ItemPage
        @param case_sensitive: whether the comparison is case sensitive
        @type case_sensitive: bool
        """
        # for a list of entries
        if isinstance(name_obj, list):
            for n in name_obj:
                self.add_label_or_alias(n, item,
                                        case_sensitive=case_sensitive)
                # reload item so that next call is aware of any changes
                item = self.wd.QtoItemPage(item.title())
                item.exists()
            return

        # for a single entry
        self.wd.addLabelOrAlias(name_obj['@language'], name_obj['@value'],
                                item, caseSensitive=case_sensitive)

    @staticmethod
    def get_kulturnav_generator(uuids, delay=0):
        """Generate KulturNav items from a list of uuids.

        @param uuids: uuids to request items for
        @type uuids: list of str
        @param delay: delay in seconds between each kulturnav request
        @type delay: int
        @yield: dict
        """
        for uuid in uuids:
            time.sleep(delay)
            try:
                json_data = KulturnavBot.get_single_entry(uuid)
            except pywikibot.Error as e:
                # log and skip, keep the generator going
                pywikibot.output(e)
            else:
                yield json_data

    @classmethod
    def get_search_results(cls, max_hits=250, require_wikidata=True):
        """Make a KulturNav search for all items of a given type in a dataset.

        @param max_hits: the maximum number of results to request at once
        @type max_hits: int
        @param require_wikidata: whether to filter results on having a
            wikidata url in sameAs
        @type require_wikidata: bool
        @return: the resulting uuids
        @rtype: list of str
        """
        search_url = 'http://kulturnav.org/api/search/' + \
                     'entityType:%s,' % cls.ENTITY_TYPE + \
                     'entity.dataset_r:%s' % cls.DATASET_ID
        q = None  # the map_tag query

        # only filter on MAP_TAG if filtering on wikidata
        if require_wikidata:
            search_url += ',%s' % cls.MAP_TAG + ':%s/%d/%d'
            # url-encoded '*//www.wikidata.org/entity/Q*'
            q = '*%2F%2Fwww.wikidata.org%2Fentity%2FQ*'
        else:
            search_url += '/%d/%d'

        # start search, paging through results max_hits at a time
        results = []
        offset = 0
        overview_page = KulturnavBot.get_single_search_results(
            search_url, q, offset, max_hits)
        while overview_page:
            for item in overview_page:
                uuid = item[u'uuid']
                if not require_wikidata or \
                        KulturnavBot.has_wikidata_in_sameas(item,
                                                            cls.MAP_TAG):
                    results.append(uuid)

            # continue
            offset += max_hits
            overview_page = KulturnavBot.get_single_search_results(
                search_url, q, offset, max_hits)

        # some feedback
        pywikibot.output(u'Found %d matching entries in Kulturnav'
                         % len(results))
        return results

    @staticmethod
    def has_wikidata_in_sameas(item, map_tag):
        """Check if a wikidata url is present in the sameAs property.

        @param item: the search item to check
        @type item: dict
        @param map_tag: the tag to use (concepts don't use sameAs)
        @type map_tag: str
        @rtype: bool
        """
        # The patterns used if we filter on wikidata
        patterns = (u'http://www.wikidata.org/entity/',
                    u'https://www.wikidata.org/entity/')

        # strip the trailing '_r'-style suffix from the tag to get the
        # property name
        same_as = item[u'properties'][map_tag[:map_tag.rfind('_')]]
        for s in same_as:
            if s[u'value'].startswith(patterns):
                return True
        return False

    @staticmethod
    def get_single_search_results(search_url, q, offset, max_hits):
        """Retrieve the results from a single API search.

        @param search_url: basic url from which to build search
        @type search_url: str
        @param q: the map_tag query, if any
        @type q: str or None
        @param offset: the offset in search results
        @type offset: int
        @param max_hits: the maximum number of results to request at once
        @type max_hits: int
        @return: the search result object
        @rtype: dict
        """
        actual_url = ''
        if q is None:
            actual_url = search_url % (offset, max_hits)
        else:
            actual_url = search_url % (q, offset, max_hits)

        search_page = urllib2.urlopen(actual_url)
        return json.loads(search_page.read())

    @staticmethod
    def get_single_entry(uuid):
        """Retrieve the data on a single kulturnav entry.

        Raises an pywikibot.Error if:
        * @graph is not a key in the json response
        * a non-json response is received

        @param uuid: the uuid for the target item
        @type uuid: str
        @return: the entry object
        @rtype: dict
        @raise: pywikibot.Error
        """
        query_url = 'http://kulturnav.org/%s?format=application/ld%%2Bjson'
        item_url = query_url % uuid
        try:
            record_page = urllib2.urlopen(item_url)
            json_data = json.loads(record_page.read())
        except ValueError as e:
            raise pywikibot.Error('Error loading KulturNav item at '
                                  '%s with error %s' % (item_url, e))
        if json_data.get(u'@graph'):
            return json_data
        else:
            raise pywikibot.Error('No @graph in KulturNav reply at '
                                  '%s\n data: %s' % (item_url, json_data))

    @classmethod
    def main(cls, *args):
        """Start the bot from the command line."""
        options = cls.handle_args(args)

        search_results = cls.get_search_results(
            max_hits=options['max_hits'],
            require_wikidata=options['require_wikidata'])
        kulturnav_generator = cls.get_kulturnav_generator(
            search_results, delay=options['delay'])

        kulturnav_bot = cls(kulturnav_generator, options['cache_max_age'])
        kulturnav_bot.cutoff = options['cutoff']
        kulturnav_bot.require_wikidata = options['require_wikidata']
        kulturnav_bot.run()

    @classmethod
    def run_from_list(cls, uuids, *args):
        """Start the bot with a list of uuids."""
        options = cls.handle_args(args)

        kulturnav_generator = cls.get_kulturnav_generator(
uuids, delay=options['delay']) kulturnav_bot = cls(kulturnav_generator, options['cache_max_age']) kulturnav_bot.cutoff = options['cutoff'] kulturnav_bot.require_wikidata = False kulturnav_bot.run() @staticmethod def handle_args(args): """Parse and load all of the basic arguments. Also passes any needed arguments on to pywikibot and sets any defaults. @param args: arguments to be handled @type args: list of strings @return: list of options @rtype: dict """ options = { 'cutoff': None, 'max_hits': 250, 'delay': 0, 'require_wikidata': True, 'cache_max_age': 0, } for arg in pywikibot.handle_args(args): option, sep, value = arg.partition(':') if option == '-cutoff': options['cutoff'] = int(value) elif option == '-max_hits': options['max_hits'] = int(value) elif option == '-delay': options['delay'] = int(value) elif option == '-any_item': options['require_wikidata'] = False elif option == '-wdq_cache': options['cache_max_age'] = int(value) return options @staticmethod def foobar(item): """Badly named escape mechanism for list results.""" if isinstance(item, list): pywikibot.output(FOO_BAR) return True return False
class PaintingsBot:
    """Bot to enrich, and create, items about paintings on Wikidata."""

    def __init__(self, dict_generator, painting_id_prop, cache_max_age=0):
        """Initiate the bot, loading files and querying WDQ.

        @param dict_generator: The generator for the Europeana painting
            objects
        @type dict_generator: generator (that yields Dict objects).
        @param painting_id_prop: the P-id of the painting-id property
        @type painting_id_prop: str
        @param cache_max_age: Max age of local wdq cache, defaults to 0
        @type cache_max_age: int
        """
        self.generator = dict_generator
        self.repo = pywikibot.Site().data_repository()
        self.commons = pywikibot.Site(u'commons', u'commons')
        self.wd = WD(self.repo)
        self.add_new = False  # If new objects should be created
        self.skip_miniatures = True  # If (new) miniatures should be skipped

        # Load prefixes and find allowed collections
        collections = set([INSTITUTION_Q])
        self.mappings = helpers.load_json_file('mappings.json',
                                               force_path=__file__)
        self.prefix_map = self.mappings['prefix_map']
        self.bad_prefix = self.mappings['bad_prefix']
        for p, k in self.prefix_map.iteritems():
            if k['subcol'] is not None:
                # store the numeric part of the sub-collection Q-id
                collections.add(k['subcol'].strip('Q'))
        self.collections = list(collections)

        # Set log file
        self.log = codecs.open(u'nationalmuseumSE.log', 'a', 'utf-8')

        # Load creator dump file
        self.creator_dump = helpers.load_json_file('Oku_NM_arbetskopia.json',
                                                   force_path=__file__)

        # hard-coded anons e.g. "unknown swedish 17th century"
        anons = helpers.load_json_file('anons.json', force_path=__file__)

        # prepare WDQ painting query
        query = u'CLAIM[195:%s] AND CLAIM[%s]' % \
            (',195:'.join(self.collections), painting_id_prop)
        self.painting_ids = helpers.fill_cache(painting_id_prop,
                                               queryoverride=query,
                                               cache_max_age=cache_max_age)

        # prepare WDQ artist query (nat_mus_id - Q_id pairs)
        self.artist_ids = helpers.fill_cache('P2538',
                                             cache_max_age=cache_max_age)
        # add anons
        for a in anons:
            self.artist_ids[a] = ANON_Q

        self.painting_id_prop = 'P%s' % painting_id_prop

    def run(self):
        """Start the robot."""
        self.creators = {}

        for painting in self.generator:
            # isolate ids
            ids = painting['object']['proxies'][0]['dcIdentifier']['def']
            painting_id = ids[0].replace('Inv Nr.:', '').strip('( )')
            obj_id = ids[1]

            # Museum contains several sub-collections. Only handle mapped ones
            if painting_id.split(' ')[0] in self.prefix_map.keys():
                self.process_painting(painting, painting_id, obj_id)
            elif painting_id.split(' ')[0] not in self.bad_prefix:
                pywikibot.output(u'Skipped due to unknown collection: %s' %
                                 painting_id)

    def process_painting(self, painting, painting_id, obj_id):
        """Process a single painting.

        This will also create it if self.add_new is True.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param obj_id: the internal id of the painting in the
            Nationalmuseum database.
        @type obj_id: str
        """
        uri = u'http://collection.nationalmuseum.se/eMuseumPlus?service=' \
              u'ExternalInterface&module=collection&objectId=%s&viewType=' \
              u'detailView' % obj_id
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
            painting['object']['about']

        painting_item = None
        # newclaims = []
        if painting_id in self.painting_ids:
            painting_item = self.create_existing_painting(painting,
                                                          painting_id)
        elif self.add_new and not (self.skip_miniatures and
                                   PaintingsBot.is_miniature(painting)):
            # only create when creation is enabled, and not for a
            # miniature if those are being skipped
            painting_item = self.create_new_painting(painting, painting_id,
                                                     europeana_url, uri)

        # add new claims
        if painting_item and painting_item.exists():
            data = painting_item.get(force=True)
            claims = data.get('claims')

            # add natmus id claim
            self.add_natmus_id(painting_item, obj_id, uri)

            # add inventory number with collection
            self.add_inventory_and_collection_claim(painting_item,
                                                    painting_id, painting,
                                                    uri)

            # Instance_of
            if u'P31' not in claims:
                self.add_instanceof_claim(painting_item, painting_id,
                                          painting)

            # title (as claim)
            # commented out as the titles in Europeana are not reliable
            # if u'P1476' not in claims:
            #     self.add_title_claim(painting_item, painting)

            # Europeana_ID
            self.add_europeana_claim(painting_item, painting)

            # Check for potential images to add, if none is present
            if u'P18' not in claims:
                self.add_image_claim(painting_item, uri)

            # creator through Nat_mus_database dump
            self.add_natmus_creators(painting_item, obj_id, uri)

            # creator IFF through dbpedia
            # if u'P170' not in claims:
            #     self.add_dbpedia_creator(painting_item, painting)

    def add_title_claim(self, painting_item, painting):
        """Add a title/P1476 claim based on dcTitle.

        One claim is added per language found in dcTitle.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        dc_title = painting['object']['proxies'][0]['dcTitle']
        titles = []
        for lang, title in dc_title.iteritems():
            titles.append(pywikibot.WbMonolingualText(title[0], lang))
        for title in titles:
            self.wd.addNewClaim(u'P1476', WD.Statement(title),
                                painting_item,
                                self.make_europeana_reference(painting))

    def add_locatedin_claim(self, painting_item, painting_id, painting):
        """Add a located_in/P276 claim based on sub-collection.

        No longer used as sub-collection does not match actual placing.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        place = self.prefix_map[painting_id.split(' ')[0]]['place']
        place_item = self.wd.QtoItemPage(place)
        self.wd.addNewClaim(u'P276', WD.Statement(place_item),
                            painting_item,
                            self.make_europeana_reference(painting))

    def add_dbpedia_creator(self, painting_item, painting):
        """Add a Creator/P170 claim through a dbpedia look-up.

        Results of the dbpedia-to-wikidata look-up are memoised in
        self.creators.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        creator_id = None
        try:
            db_creator = painting['object']['proxies'][1]['dcCreator']['def']
            if len(db_creator) == 1:
                # skip anything more complex than one creator
                db_creator = db_creator[0].strip()
                if db_creator.startswith('http://dbpedia.org/resource/'):
                    if db_creator not in self.creators.keys():
                        self.creators[db_creator] = \
                            helpers.dbpedia_2_wikidata(db_creator)
                    creator_id = self.creators[db_creator]
        except KeyError:
            # the expected keys are not always present
            return

        if creator_id:
            self.set_creator(painting_item,
                             self.make_europeana_reference(painting),
                             creator_q=creator_id)

    def add_image_claim(self, painting_item, uri):
        """Add a image/P18 claim if exactly one image is found on Commons.

        Uses the nationalmuseum.se uri to search for matches on Commons.
        Adds a claim only if a unique hit is found.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        images = self.file_from_external_link(uri)
        if len(images) > 1:
            # for now don't want to choose the appropriate one
            pywikibot.output('Found multiple matching images for %s' %
                             painting_item)
            for image in images:
                pywikibot.output(u'\t%s' % image)
        elif len(images) == 1:
            self.wd.addNewClaim(u'P18', WD.Statement(images[0]),
                                painting_item,
                                self.make_commons_reference())

    def add_europeana_claim(self, painting_item, painting):
        """Add a Europeana ID/P727 claim.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting: information object for the painting
        @type painting: dict
        """
        europeana_prop = u'P727'
        europeana_id = painting['object']['about'].lstrip('/')

        # abort if conflicting info
        if europeana_prop in painting_item.claims and \
                not self.wd.has_claim(europeana_prop, europeana_id,
                                      painting_item):
            pywikibot.output(
                u'%s has conflicting %s. Expected %s' %
                (painting_item, europeana_prop, europeana_id))
            return

        self.wd.addNewClaim(europeana_prop, WD.Statement(europeana_id),
                            painting_item,
                            self.make_europeana_reference(painting))

    def add_instanceof_claim(self, painting_item, painting_id, painting):
        """Add an instance_of/P31 claim.

        Instance_of is always painting or icon while working on the
        paintings collection.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        """
        dcformat_item = self.wd.QtoItemPage(PAINTING_Q)  # painting
        if painting_id.split(' ')[0] == 'NMI':
            dcformat_item = self.wd.QtoItemPage(ICON_Q)  # icon

        self.wd.addNewClaim(u'P31', WD.Statement(dcformat_item),
                            painting_item,
                            self.make_europeana_reference(painting))

    @staticmethod
    def is_miniature(painting):
        """Determine if the painting is a miniature.

        @param painting: information object for the painting
        @type painting: dict
        @rtype bool
        """
        for concept in painting['object']['concepts']:
            if concept[u'about'] == MINIATURE_URL:
                # pywikibot.output(u'Skipping miniature')
                return True
        return False

    def create_existing_painting(self, painting, painting_id):
        """Add base info to an existing painting.

        Adds the same info as would have been added had it been created
        with create_new_painting()

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @return: the created painting item
        @rtype: pywikibot.ItemPage
        """
        painting_item = self.wd.QtoItemPage(
            self.painting_ids.get(painting_id))

        # check label
        data = painting_item.get()
        labels = make_labels(painting)
        new_labels = find_new_values(data, labels, 'labels')
        if new_labels:
            pywikibot.output('Adding label to %s' % painting_item.title())
            painting_item.editLabels(new_labels)

        # check description
        descriptions = make_descriptions(painting)
        if descriptions:
            new_descr = find_new_values(data, descriptions, 'descriptions')
            if new_descr:
                pywikibot.output('Adding description to %s' %
                                 painting_item.title())
                painting_item.editDescriptions(new_descr)

        return painting_item

    def create_new_painting(self, painting, painting_id, europeana_url, uri):
        """Create a new painting item and return it.

        On a 'modification-failed' API error (label/description clash)
        the descriptions are disambiguated with the painting_id and
        creation is retried once.

        @param painting: information object for the painting
        @type painting: dict
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param europeana_url: reference url for Europeana
        @type europeana_url: str
        @param uri: reference uri at nationalmuseum.se
        @type uri: str
        @return: the created painting item
        @rtype: pywikibot.ItemPage
        """
        data = {'labels': {}, 'descriptions': {}}
        data['labels'] = make_labels(painting)
        data['descriptions'] = make_descriptions(painting)
        if not data['descriptions']:
            return

        # print data
        # create new empty item and request Q-number
        summary = u'%s: Creating new item with data from %s' % \
            (EDIT_SUMMARY, europeana_url)
        painting_item = None
        try:
            painting_item = self.wd.make_new_item(data, summary)
        except pywikibot.data.api.APIError as e:
            if e.code == u'modification-failed':
                # disambiguate and try again
                for lang, content in data['descriptions'].iteritems():
                    disambiguation = content['value'] + u' (%s)' % painting_id
                    data['descriptions'][lang]['value'] = disambiguation
                try:
                    painting_item = self.wd.make_new_item(data, summary)
                except pywikibot.data.api.APIError as e:
                    if e.code == u'modification-failed':
                        pywikibot.output(u'modification-failed error: '
                                         u'skipping %s' % uri)
                        return
                    else:
                        raise pywikibot.Error(u'Error during item creation: '
                                              u'%s' % e)
            else:
                raise pywikibot.Error(u'Error during item creation: %s' % e)

        return painting_item

    def add_natmus_id(self, painting_item, obj_id, uri):
        """Add a natmus_painting_id/P2539 claim.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        self.wd.addNewClaim(u'P2539', WD.Statement(obj_id),
                            painting_item,
                            self.make_url_reference(uri))

    def add_natmus_creators(self, painting_item, obj_id, uri):
        """Add creator/P170 claim(s) based on the database dump info.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param obj_id: the nationalmuseum database id
        @type obj_id: str
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        if obj_id not in self.creator_dump.keys():
            return

        # each artwork may have multiple artists,
        # which must all be on wikidata
        for artist_id in self.creator_dump[obj_id].keys():
            if artist_id not in self.artist_ids.keys():
                self.logger('Artist not found on wikidata: %s' % artist_id)
                return

        dump_entry = self.creator_dump[obj_id]
        if len(dump_entry) == 1:
            artist_entry = dump_entry.iteritems().next()
            self.add_singel_natmus_creator(painting_item, artist_entry, uri)
        elif len(dump_entry) == 2:
            # self.add_double_natmus_creator(painting_item, dump_entry, uri)
            # skipping until duplication issue has been solved
            pass
        else:
            # for now avoid any entries with more creators
            return

    def add_singel_natmus_creator(self, painting_item, artist, uri):
        u"""Add a simple creator/P170 claim based on the database dump info.

        Handles cases with only a single identified creator. Either
        * Known creator
        * Unknown/uncertain creator somehow related to a known person
        where creator is someone whose function is in artist_labels.

        For Forgery/After work by, the bot needs to be aware of both
        parties, and both must exist on Wikidata

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param artist: the dump entry for the artist
        @type artist: tuple (artist_id, artist_info)
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        # Swedish relation labels mapped to Wikidata qualifier P-ids
        # (attributed to, workshop of, school of, manner of)
        anonymous_combos = {
            u'Tillskriven': 'P1773',
            u'Hennes ateljé': 'P1774',
            u'Hans ateljé': 'P1774',
            u'Hennes skola': 'P1780',
            u'Hans skola': 'P1780',
            u'Hennes art': 'P1777',
            u'Hans art': 'P1777',
        }
        artist_labels = (u'Konstnär', u'Mästare', u'Utförd av')

        artist_id, artist_info = artist
        artist_q = self.artist_ids[artist_id]
        # NOTE: Oku* keys come from the Nationalmuseum database dump —
        # assumed schema, confirm against Oku_NM_arbetskopia.json
        if artist_info.get('OkuBeschreibungS') or \
                artist_info.get('OkuValidierungS'):
            # this always indicates some special case which we cannot handle
            # for now
            return

        if artist_info.get('OkuFunktionS') and \
                artist_info.get('OkuFunktionS') in artist_labels:
            if len(artist_info.keys()) == 1:
                # i.e. all other are empty
                self.set_creator(painting_item,
                                 self.make_url_reference(uri),
                                 creator_q=artist_q)
            elif artist_info.get('OkuArtS') in anonymous_combos.keys() and \
                    len(artist_info.keys()) == 2:
                # anonymous but attributed to the artist
                related_info = {
                    'P': anonymous_combos[artist_info.get('OkuArtS')],
                    'itis': self.wd.QtoItemPage(artist_q)
                }
                self.set_creator(painting_item,
                                 self.make_url_reference(uri),
                                 related_info=related_info)
        elif not artist_info.get('OkuFunktionS') and artist_id == '1':
            # this is the special case of a completely unknown creator
            self.set_creator(painting_item, self.make_url_reference(uri))

    def add_double_natmus_creator(self, painting_item, artists, uri):
        u"""Add a complex creator/P170 claim based on the database dump info.

        Handles cases with two identified creators in a relation along
        the lines of "Painting/Forgery by X after a work by Y"

        The logic is:
        OkuFunktionS in derived_combos -> OkuKueID = creator of original
        OkuFunktionS in artist_labels -> OkuKueID = creator of derivative

        @param artists: the dump entries for the artists
        @type artists: dict of {artist_id: artist_info}
        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        # Swedish relation labels mapped to qualifier P-ids
        # (copy after / forgery after)
        derived_combos = {
            u'Kopia efter': 'P1877',
            u'Efter': 'P1877',
            u'Förfalskning efter': 'P1778',
        }
        artist_labels = (u'Konstnär', u'Utförd av')

        # set up targets
        original = None
        derivative = None
        relation = None
        for artist in artists.iteritems():
            artist_id, artist_info = artist
            if artist_info.get('OkuBeschreibungS') or \
                    artist_info.get('OkuValidierungS'):
                # this indicates some special case which we cannot handle
                # for now
                return
            if artist_info.get('OkuFunktionS') and \
                    len(artist_info.keys()) == 1:
                # cannot deal with OkuArtS
                if artist_info.get('OkuFunktionS') in artist_labels:
                    derivative = artist
                elif artist_info.get('OkuFunktionS') in derived_combos.keys():
                    original = artist
                    relation = derived_combos[artist_info.get('OkuFunktionS')]

        # verify that both roles were filled
        if any(creator is None for creator in (original, derivative)):
            return

        # construct info and set
        original_q = self.artist_ids[original[0]]
        derivative_q = self.artist_ids[derivative[0]]
        related_info = {'P': relation,
                        'itis': self.wd.QtoItemPage(original_q)}
        self.set_creator(painting_item, self.make_url_reference(uri),
                         creator_q=derivative_q, related_info=related_info)

    def set_creator(self, target_item, reference, creator_q=None,
                    related_info=None):
        """Set a creator/P170 claim for a creator or creator combo.

        Allows for simple claims as well as more complex
        "in the manner of" etc.

        @param target_item: item to which claim is added
        @type target_item: pywikibot.ItemPage
        @param reference: the reference for the statement
        @type reference: WD.Reference
        @param related_info: related info as a dict with P/itis pairs
        @type related_info: dict
        @param creator_q: the Q-id of the creator; defaults to the
            anonymous creator item
        @type creator_q: str
        """
        creator_q = creator_q or ANON_Q
        creator_statement = WD.Statement(self.wd.QtoItemPage(creator_q))

        # set any related qualifiers
        if related_info:
            creator_statement.addQualifier(
                WD.Qualifier(P=related_info['P'],
                             itis=related_info['itis']))

        # set claim
        self.wd.addNewClaim(u'P170', creator_statement, target_item,
                            reference)

    def add_inventory_and_collection_claim(self, painting_item, painting_id,
                                           painting, uri):
        """Add an inventory_no, with qualifier, and a collection/P195 claim.

        This will add the collection qualifier to any matching claim
        missing it.

        @param painting_item: item to which claim is added
        @type painting_item: pywikibot.ItemPage
        @param painting_id: the common (older) id of the painting in the
            Nationalmuseum collection
        @type painting_id: str
        @param painting: information object for the painting
        @type painting: dict
        @param uri: reference url on nationalmuseum.se
        @type uri: str
        """
        nationalmuseum_item = self.wd.QtoItemPage(INSTITUTION_Q)
        collection_p = u'P195'

        # abort if conflicting info
        if self.painting_id_prop in painting_item.claims and \
                not self.wd.has_claim(self.painting_id_prop, painting_id,
                                      painting_item):
            pywikibot.output(
                u'%s has conflicting inv. no (%s). Expected %s' %
                (painting_item, self.painting_id_prop, painting_id))
            return

        # add inventory number with collection
        self.wd.addNewClaim(
            self.painting_id_prop,
            WD.Statement(painting_id).addQualifier(
                WD.Qualifier(P=collection_p, itis=nationalmuseum_item),
                force=True),
            painting_item,
            self.make_url_reference(uri))

        # add collection (or subcollection)
        subcol = self.prefix_map[painting_id.split(' ')[0]]['subcol']
        collection_item = nationalmuseum_item
        if subcol is not None:
            collection_item = self.wd.QtoItemPage(subcol)

        self.wd.addNewClaim(collection_p, WD.Statement(collection_item),
                            painting_item,
                            self.make_europeana_reference(painting))

    def make_europeana_reference(self, painting):
        """Make a Reference object with a Europeana url and today's date.

        @param painting: information object for the painting
        @type painting: dict
        @rtype: WD.Reference
        """
        europeana_url = u'http://europeana.eu/portal/record%s.html' % \
            painting['object']['about']
        return self.make_url_reference(europeana_url)

    def make_url_reference(self, uri):
        """Make a Reference object with a retrieval url and today's date.

        @param uri: retrieval uri/url
        @type uri: str
        @rtype: WD.Reference
        """
        date = helpers.today_as_WbTime()
        # P854: reference url, P813: retrieval date
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(u'P854', uri),
            source_notest=self.wd.make_simple_claim(u'P813', date))
        return ref

    def make_commons_reference(self):
        """Make a Reference object saying imported from Wikimedia Commons."""
        commons_item = self.wd.QtoItemPage(COMMONS_Q)
        ref = WD.Reference(
            source_test=self.wd.make_simple_claim(
                u'P143', commons_item))  # imported from
        return ref

    def file_from_external_link(self, uri):
        """Identify files from a Nationalmuseum uri.

        Hits are any files containing a link to the eMuseumPlus uri.

        @param uri: reference url on nationalmuseum.se
        @type uri: str
        @return: matching images
        @rtype: list
        """
        images = []
        # linksearch wants the url without its protocol prefix
        uri = uri.split('://')[1]
        objgen = pagegenerators.LinksearchPageGenerator(uri, namespaces=[6],
                                                        site=self.commons)
        for page in objgen:
            images.append(pywikibot.FilePage(self.commons, page.title()))

        # I have no clue how the above results in duplicates, but it does so...
        images = list(set(images))
        return images

    def most_missed_creators(self, cache_max_age=0):
        """Produce list of most frequent, but unlinked, creators.

        Query WDQ for all objects in the collection missing an artist
        then put together a top-list for most desired creator. The list
        is written to creatorHitlist.csv as value|creator lines.

        @param cache_max_age: Max age of local wdq cache, defaults to 0
        @type cache_max_age: int
        """
        expected_items = []
        query = u'CLAIM[195:%s] AND NOCLAIM[170]' % \
            ',195:'.join(self.collections)  # collection
        wd_queryset = wdquery.QuerySet(query)

        wd_query = wdquery.WikidataQuery(cacheMaxAge=cache_max_age)
        data = wd_query.query(wd_queryset)

        # WDQ reports status.error == 'OK' on success
        if data.get('status').get('error') == 'OK':
            expected_items = data.get('items')

        creator_dict = {}
        counter = 0
        for q_val in expected_items:
            q_item = self.wd.QtoItemPage(q_val)
            data = q_item.get()
            claims = data.get('claims')
            if u'P170' in claims:
                continue
            descr = data.get('descriptions').get('en')
            if descr and descr.startswith(u'painting by '):
                creator = descr[len(u'painting by '):]
                if '(' in creator:  # to get rid of disambiguation addition
                    creator = creator[:creator.find('(')].strip()
                if creator in creator_dict.keys():
                    creator_dict[creator] += 1
                else:
                    creator_dict[creator] = 1
                counter += 1

        # output
        pywikibot.output(u'Found %d mentions of %d creators' %
                         (counter, len(creator_dict)))
        f = codecs.open(u'creatorHitlist.csv', 'w', 'utf-8')
        for k, v in creator_dict.iteritems():
            f.write(u'%d|%s\n' % (v, k))
        f.close()

    def logger(self, text):
        """Append text to logfile.

        @param text: text to output
        @type text: str
        """
        self.log.write(u'%s\n' % text)
        self.log.flush()  # flush immediately so the log survives crashes