def _same_tree(cls, prop, data1, data2):
    """Ask SPARQL whether the targets of one claim set reach the other.

    Builds two ASK queries (one per direction) testing whether any
    target in one set is connected to any target in the other via a
    ``prop``-path (``wdt:prop*``). Returns True as soon as either
    direction holds, False otherwise.
    """
    sparql = SparqlQuery()  # fixme: dependencies
    template = ('ASK { VALUES ?x1 { wd:%s } . VALUES ?x2 { wd:%s } . '
                '?x1 wdt:%s* ?x2 }')
    first_ids = ' wd:'.join(map(attrgetter('target.id'), data1))
    second_ids = ' wd:'.join(map(attrgetter('target.id'), data2))
    # retry budget shared across both direction queries
    budget = 3
    queries = (template % (first_ids, second_ids, prop),
               template % (second_ids, first_ids, prop))
    for query in queries:
        answer = False
        while True:
            try:
                answer = sparql.ask(query)
            except requests.exceptions.ConnectionError:
                budget -= 1
                if budget == 0:
                    # out of retries: propagate the connection failure
                    raise
                time.sleep(1)
            else:
                break
        if answer:
            return True
    return False
class ExternalIdSlicingBot(WikidataEntityBot):

    """Bot that fixes external-id claims whose value is a full URL.

    Items are found via SPARQL; for each external-id claim whose target
    starts with 'http', the bare identifier is extracted using the
    property's formatter URL (P1630) and optionally validated against
    its format regex (P1793), then saved in place of the URL.
    Items where extraction fails are reported on a user subpage on exit.
    """

    # properties never touched by this bot
    blacklist = {'P2013'}
    use_from_page = False

    def __init__(self, **options):
        """Initializer.

        Supported options: 'step' (SPARQL page size) and
        'offset' (starting offset for the paged query).
        """
        self.available_options.update({
            'step': 10,
            'offset': 0,
        })
        super().__init__(**options)
        # prop id -> (formatter, regex), filled lazily
        self.cache = {}
        # prop id -> set of items where the value could not be extracted
        self.failed = {}
        self.sparql = SparqlQuery(repo=self.repo)
        self.store = QueryStore()

    @property
    def generator(self):
        """Yield items with sliceable external ids, page by page.

        Each round first runs a cheap ASK query; when it returns False
        there are no more results and iteration stops.
        """
        step = self.opt['step']
        opts = {
            # fixme: don't use this word
            'blacklist': ' wd:'.join(self.blacklist),
            'limit': step,
        }
        offset = self.opt['offset']
        while True:
            pywikibot.output('\nLoading items (offset %i)...' % offset)
            opts['offset'] = offset
            ask = self.store.build_query('ask_externalid_props', **opts)
            if not self.sparql.ask(ask):
                break
            query = self.store.build_query('external-ids', **opts)
            gen = PreloadingEntityGenerator(
                WikidataSPARQLPageGenerator(query, site=self.repo))
            yield from gen
            offset += step

    def treat_page_and_item(self, page, item):
        """Replace URL-valued external-id claims with bare identifiers."""
        for prop, claims in item.claims.items():
            if prop in self.blacklist:
                continue
            # all claims of a property share one type; check the first
            if claims[0].type != 'external-id':
                continue
            for cl in claims:
                # only values that look like URLs need slicing
                if not cl.target or not cl.target.startswith('http'):
                    continue
                formatter, regex = self.get_formatter_and_regex(prop)
                if not formatter:
                    pywikibot.output("%s doesn't have a formatter" % prop)
                    # no formatter -> no other claim of this prop can be fixed
                    break
                value = self.find_value(cl.target, formatter)
                if not value:
                    pywikibot.output(
                        'Value not found in "%s" for property %s'
                        % (cl.target, prop))
                    self.failed.setdefault(prop, set()).add(item)
                    continue
                if regex:
                    try:
                        # wrap in a group so match.group() is the full match
                        match = re.match('(%s)' % regex, value)
                    except re.error:
                        pywikibot.output(
                            'Couldn\'t apply regex "%s"' % regex)
                        break
                    if not match:
                        pywikibot.output('Value "%s" not matched by regex '
                                         '"%s"' % (value, regex))
                        self.failed.setdefault(prop, set()).add(item)
                        continue
                    value = match.group()
                summary = ('harvested the identifier based on '
                           '[[Property:P1630]]')
                if regex:
                    summary += ' and [[Property:P1793]]'
                cl.changeTarget(value, summary=summary)

    def get_formatter_and_regex(self, prop):
        """Return (formatter URL, format regex) for a property, cached.

        With multiple P1630/P1793 claims, a value is only used when
        exactly one of them has preferred rank; otherwise it stays None.
        """
        if prop not in self.cache:
            formatter = regex = None
            ppage = pywikibot.PropertyPage(self.repo, prop)
            if 'P1630' in ppage.claims:
                if len(ppage.claims['P1630']) > 1:
                    preferred = [
                        cl for cl in ppage.claims['P1630']
                        if cl.rank == 'preferred'
                    ]
                    if len(preferred) == 1:
                        formatter = preferred[0].target
                else:
                    formatter = ppage.claims['P1630'][0].target
            if 'P1793' in ppage.claims:
                if len(ppage.claims['P1793']) > 1:
                    preferred = [
                        cl for cl in ppage.claims['P1793']
                        if cl.rank == 'preferred'
                    ]
                    if len(preferred) == 1:
                        regex = preferred[0].target
                else:
                    regex = ppage.claims['P1793'][0].target
            self.cache[prop] = (formatter, regex)
        return self.cache[prop]

    def strip_init_stuff(self, string):
        """Strip the scheme ('http(s)://') and a leading 'www.' prefix."""
        if string.startswith(('http://', 'https://')):
            string = string.partition('//')[2]
        if string.startswith('www.'):
            string = string[4:]
        return string

    def find_value(self, url, formatter):
        """Extract the identifier from *url* using the formatter URL.

        Both strings are normalized (scheme/'www.' stripped) first.
        Returns None when *url* does not fit the formatter pattern.
        NOTE(review): assumes the formatter contains '$1' — raises
        IndexError otherwise; confirm P1630 values always do.
        """
        url = self.strip_init_stuff(url)
        formatter = self.strip_init_stuff(formatter)
        value = pywikibot.page.url2unicode(url)
        split = formatter.split('$1')
        if not value.startswith(split[0]):
            return None
        if not split[1]:
            # formatter ends with '$1': the id is everything after the prefix
            return value[len(split[0]):].rstrip('/')
        # drop the formatter's trailing part, then locate the id's end
        value = value[:-len(split[-1])]
        try:
            index = value.index(split[1], len(split[0]))
        except ValueError:
            return None
        else:
            return value[len(split[0]):index].rstrip('/')

    def exit(self):
        """Write the collected failures to a user subpage, then exit."""
        if self.failed:
            text = ''
            for prop in sorted(self.failed):
                text += '* [[Property:%s]]:\n' % prop
                for item in sorted(self.failed[prop]):
                    text += '** [[%s]]\n' % item.title()
            page = pywikibot.Page(
                self.repo,
                'User:%s/Wrong external ids' % self.repo.username())
            page.put(text, summary='update')
        super().exit()
class DuosManagingBot(WikidataEntityBot):

    """Bot that splits items about duos into two separate person items.

    For an item whose labels look like "A <and> B", two new items are
    created (P31:Q5), linked to the original via P527/P361, optionally
    linked to each other (e.g. P3373/P26), and selected person-related
    claims are moved from the duo item to both new items.
    """

    # language code -> conjunction used to split a two-person label
    conj = {
        'af': ' en ', 'az': ' və ', 'bg': ' и ', 'br': ' ha ',
        'ca': ' i ', 'cs': ' a ', 'cy': ' a ', 'da': ' og ',
        'de': ' und ', 'el': ' και ', 'en': ' and ', 'en-gb': ' and ',
        'eo': ' kaj ', 'es': ' y ', 'et': ' ja ', 'eu': ' eta ',
        'fi': ' ja ', 'fr': ' et ', 'fy': ' en ', 'gl': ' e ',
        'hr': ' i ', 'hu': ' és ', 'id': ' dan ', 'it': ' e ',
        'ka': ' და ', 'la': ' et ', 'lt': ' ir ', 'lv': ' un ',
        'ms': ' dan ', 'nb': ' og ', 'nl': ' en ', 'nn': ' og ',
        'oc': ' e ', 'pl': ' i ', 'pt': ' e ', 'ro': ' și ',
        'ru': ' и ', 'sk': ' a ', 'sl': ' in ', 'sr': ' и ',
        'sv': ' och ', 'tr': ' ve ', 'uk': ' і ', 'vi': ' và ',
        'war': ' ngan ',
    }
    # claims of these properties are moved from the duo to both new items
    distribute_properties = [
        'P21', 'P22', 'P25', 'P27', 'P40', 'P53', 'P106', 'P1412',
    ]
    # (class QID, relation label) probed via SPARQL, in this order
    class_to_relation = [
        ('Q106925878', 'father-son'),
        ('Q14756018', 'twin'),
        ('Q14073567', 'sibling'),
        ('Q3046146', 'spouse'),
        # TODO: ('Q1141470', 'comedians'), not a "relation by blood"
    ]
    # relation label -> property linking the two new items to each other
    relation_map = {
        'sibling': 'P3373',
        'spouse': 'P26',
        'twin': 'P3373',
        # TODO: 'partner': 'P451',
        #'father-son': '', we don't know who is who
        #'comedians': 'P1327',
    }
    use_from_page = False

    def __init__(self, generator, **kwargs):
        """Initializer.

        Supported options: 'always' (no confirmation prompts),
        'class' (root class QID for the default query),
        'min_labels' (minimum label pairs required to split).
        """
        self.available_options.update({
            'always': True,
            'class': 'Q10648343',
            'min_labels': 1,
        })
        super().__init__(**kwargs)
        self.store = QueryStore()
        self.sparql = SparqlQuery(repo=self.repo)
        # fall back to the SPARQL-based generator when none is given
        self._generator = generator or self.custom_generator()

    def skip_page(self, item):
        """Skip items without P31 or already split (P527 present)."""
        if super().skip_page(item):
            return True
        if 'P31' not in item.claims:
            pywikibot.output('%s is missing P31 property' % item)
            return True
        if 'P527' in item.claims:
            pywikibot.output('%s already has P527 property' % item)
            return True
        return False

    def custom_generator(self):
        """Yield duo items found by the stored 'duos' SPARQL query."""
        kwargs = {'class': self.opt['class']}
        query = self.store.build_query('duos', **kwargs)
        return pagegenerators.WikidataSPARQLPageGenerator(query,
                                                          site=self.repo)

    @property
    def generator(self):
        """Wrap the inner generator so entities arrive preloaded."""
        return pagegenerators.PreloadingEntityGenerator(self._generator)

    def get_relation(self, item):
        """Return the relation label for the item's class, or None.

        Probes class_to_relation in order with an ASK query over
        P31/P279*.
        """
        ask_pattern = 'ASK { wd:%s wdt:P31/wdt:P279* wd:%%s }' % item.id
        for key, rel in self.class_to_relation:
            if self.sparql.ask(ask_pattern % key):
                return rel
        return None

    def get_labels(self, item, relation):
        """Split the item's labels into two per-person label dicts.

        Returns [labels_first, labels_second], each mapping a language
        code to one person's name. Only languages with a known
        conjunction (or ' & ') are considered; labels containing ', '
        or not splitting into exactly two parts are skipped.
        """
        labels = [{}, {}]
        for lang in item.labels.keys() & self.conj.keys():
            for conj in (self.conj[lang], ' & '):
                # drop a trailing parenthesized disambiguator
                label = item.labels[lang].partition(' (')[0]
                if ', ' in label:
                    continue
                split = label.split(conj)
                if len(split) != 2:
                    continue
                split0 = split[0].split()
                split1 = split[1].split()
                # a lowercase word after the conjunction means it was
                # not really separating two names
                if split1[0].islower():
                    continue
                # TODO: if len(split1) > 1 and split1[0][-1] == '.':
                if len(split1) > len(split0):
                    # join a lowercase name particle (e.g. "van") with
                    # the surname that follows it
                    if len(split1) > 2 and split1[-2].islower():
                        split1[-2:] = [' '.join(split1[-2:])]
                    if len(split1) - len(split0) == 1:
                        # if items are in a relation, then
                        # they probably share their surname
                        if relation:
                            split[0] += ' %s' % split1[-1]
                            split0.append(split1[-1])
                if len(split0) > 1 or len(split1) == 1:
                    labels[0][lang] = split[0]
                    labels[1][lang] = split[1]
                    break
        return labels

    def treat_page_and_item(self, page, item):
        """Create the two person items and rewire claims accordingly."""
        relation = self.get_relation(item)
        labels = self.get_labels(item, relation)
        count = max(map(len, labels))
        if count == 0:
            pywikibot.output('No labels, skipping...')
            return
        if count < self.opt['min_labels']:
            pywikibot.output('Too few labels (%i), skipping...' % count)
            return
        # collect claims to copy to the new items and remove here
        to_add = []
        to_remove = []
        for prop in self.distribute_properties:
            for claim in item.claims.get(prop, []):
                if claim.getTarget():
                    to_remove.append(claim)
                    json = claim.toJSON()
                    # drop the claim id so it is added as a new claim
                    json.pop('id')
                    to_add.append(json)
        items = [self.create_item(item, data, relation, to_add)
                 for data in labels]
        if self.relation_map.get(relation):
            # link the two new items to each other (zip with reversed
            # pairs each item with the other one)
            for it, target in zip(items, reversed(items)):
                claim = pywikibot.Claim(self.repo,
                                        self.relation_map[relation])
                claim.setTarget(target)
                self.user_add_claim(it, claim)
        for it in items:
            claim = pywikibot.Claim(self.repo, 'P527')
            claim.setTarget(it)
            self.user_add_claim(item, claim)
        for claim in to_remove:
            pywikibot.output('Removing %s --> %s' % (
                claim.id, claim.getTarget()))
            json = claim.toJSON()
            # the API removes a claim when the JSON carries 'remove'
            json['remove'] = ''
            summary = 'moved [[Property:{}]] to {} & {}'.format(
                claim.id,
                items[0].title(as_link=True, insite=self.repo),
                items[1].title(as_link=True, insite=self.repo)
            )
            self.user_edit_entity(item, {'claims': [json]},
                                  summary=summary)

    def create_item(self, item, labels, relation, to_add):
        """Create one person item with the given labels and moved claims."""
        pywikibot.output('Creating item (relation "%s")...' % relation)
        new_item = pywikibot.ItemPage(self.repo)
        # synchronous so the new item has an id for the claims below
        self.user_edit_entity(
            new_item, {'labels': labels},
            asynchronous=False,
            summary='based on data in %s' % item.title(
                as_link=True, insite=self.repo))
        claim = pywikibot.Claim(self.repo, 'P31')
        claim.setTarget(pywikibot.ItemPage(self.repo, 'Q5'))
        self.user_add_claim(new_item, claim)
        claim = pywikibot.Claim(self.repo, 'P361')
        claim.setTarget(item)
        self.user_add_claim(new_item, claim)
        for json in to_add:
            # only instantiated to produce a readable log line
            temp_claim = pywikibot.Claim.fromJSON(self.repo, json)
            pywikibot.output('Adding %s --> %s' % (
                temp_claim.id, temp_claim.getTarget()))
            self.user_edit_entity(
                new_item, {'claims': [json]},
                summary='moving [[Property:%s]] from %s' % (
                    temp_claim.id,
                    item.title(as_link=True, insite=self.repo)))
        return new_item
class DuosManagingBot(WikidataEntityBot):

    """Bot that splits items about duos into two separate person items.

    For an item whose labels look like "A <and> B", two new items are
    created (P31:Q5), linked to the original via P527/P361, optionally
    linked to each other (e.g. P3373/P26), and selected person-related
    claims are moved from the duo item to both new items.

    Fixed: this copy of the class used the option API that Pywikibot
    deprecated in 6.0 and later removed (``availableOptions`` /
    ``getOption``) plus Python 2-style ``super(DuosManagingBot, self)``
    calls; it now uses ``available_options`` / ``self.opt`` and bare
    ``super()``, consistent with the rest of this file.
    """

    # language code -> conjunction used to split a two-person label
    conj = {
        'af': ' en ', 'az': ' və ', 'bg': ' и ', 'br': ' ha ',
        'ca': ' i ', 'cs': ' a ', 'cy': ' a ', 'da': ' og ',
        'de': ' und ', 'el': ' και ', 'en': ' and ', 'en-gb': ' and ',
        'eo': ' kaj ', 'es': ' y ', 'et': ' ja ', 'eu': ' eta ',
        'fi': ' ja ', 'fr': ' et ', 'fy': ' en ', 'gl': ' e ',
        'hr': ' i ', 'hu': ' és ', 'id': ' dan ', 'it': ' e ',
        'ka': ' და ', 'la': ' et ', 'lt': ' ir ', 'lv': ' un ',
        'ms': ' dan ', 'nb': ' og ', 'nl': ' en ', 'nn': ' og ',
        'oc': ' e ', 'pl': ' i ', 'pt': ' e ', 'ro': ' și ',
        'ru': ' и ', 'sk': ' a ', 'sl': ' in ', 'sr': ' и ',
        'sv': ' och ', 'tr': ' ve ', 'uk': ' і ', 'vi': ' và ',
        'war': ' ngan ',
    }
    # claims of these properties are moved from the duo to both new items
    distribute_properties = {
        'P21', 'P22', 'P25', 'P27', 'P40', 'P53', 'P106', 'P1412',
    }
    # (class QID, relation label) probed via SPARQL, in this order
    class_to_relation = [
        ('Q14756018', 'twin'),
        ('Q14073567', 'sibling'),
        ('Q3046146', 'spouse'),
    ]
    # relation label -> property linking the two new items to each other
    relation_map = {
        #'partner': 'P451', todo
        'sibling': 'P3373',
        'spouse': 'P26',
        'twin': 'P3373',
    }
    use_from_page = False

    def __init__(self, generator, **kwargs):
        """Initializer.

        Supported options: 'always' (no confirmation prompts),
        'class' (root class QID for the default query),
        'min_labels' (minimum label pairs required to split).
        """
        self.available_options.update({
            'always': True,
            'class': 'Q15618652',
            'min_labels': 1,
        })
        super().__init__(**kwargs)
        self.store = QueryStore()
        self.sparql = SparqlQuery(repo=self.repo)
        # fall back to the SPARQL-based generator when none is given
        self._generator = generator or self.custom_generator()

    def skip_page(self, item):
        """Skip items without P31 or already split (P527 present)."""
        if super().skip_page(item):
            return True
        if 'P31' not in item.claims:
            pywikibot.output('%s is missing P31 property' % item)
            return True
        if 'P527' in item.claims:
            pywikibot.output('%s already has P527 property' % item)
            return True
        return False

    def custom_generator(self):
        """Yield duo items found by the stored 'duos' SPARQL query."""
        kwargs = {'class': self.opt['class']}
        query = self.store.build_query('duos', **kwargs)
        return pagegenerators.WikidataSPARQLPageGenerator(query,
                                                          site=self.repo)

    @property
    def generator(self):
        """Wrap the inner generator so entities arrive preloaded."""
        return pagegenerators.PreloadingEntityGenerator(self._generator)

    def get_relation(self, item):
        """Return the relation label for the item's class, or None.

        Probes class_to_relation in order with an ASK query over
        P31/P279*.
        """
        ask_pattern = 'ASK { wd:%s wdt:P31/wdt:P279* wd:%%s }' % item.id
        for key, rel in self.class_to_relation:
            if self.sparql.ask(ask_pattern % key):
                return rel
        return None

    def get_labels(self, item, relation):
        """Split the item's labels into two per-person label dicts.

        Returns [labels_first, labels_second], each mapping a language
        code to one person's name. Only languages with a known
        conjunction (or ' & ') are considered; labels containing ', '
        or not splitting into exactly two parts are skipped.
        """
        labels = [{}, {}]
        for lang in set(item.labels.keys()) & set(self.conj.keys()):
            for conj in (self.conj[lang], ' & '):
                # drop a trailing parenthesized disambiguator
                label = item.labels[lang].partition(' (')[0]
                if ', ' in label:
                    continue
                split = label.split(conj)
                if len(split) != 2:
                    continue
                split0 = split[0].split()
                split1 = split[1].split()
                # a lowercase word after the conjunction means it was
                # not really separating two names
                if split1[0].islower():
                    continue
                if len(split1) > len(split0):
                    # join a lowercase name particle (e.g. "van") with
                    # the surname that follows it
                    if len(split1) > 2 and split1[-2].islower():
                        split1[-2:] = [' '.join(split1[-2:])]
                    if len(split1) - len(split0) == 1:
                        # if items are in a relation, then they probably
                        # share their surname
                        if relation:
                            split[0] += ' %s' % split1[-1]
                            split0.append(split1[-1])
                if len(split0) > 1 or len(split1) == 1:
                    for i in [0, 1]:
                        labels[i][lang] = split[i]
                    break
        return labels

    def treat_page_and_item(self, page, item):
        """Create the two person items and rewire claims accordingly."""
        relation = self.get_relation(item)
        labels = self.get_labels(item, relation)
        count = max(map(len, labels))
        if count == 0:
            pywikibot.output('No labels, skipping...')
            return
        if count < self.opt['min_labels']:
            pywikibot.output('Too few labels (%i), skipping...' % count)
            return
        # collect claims to copy to the new items and remove here
        to_add = []
        to_remove = []
        for prop in self.distribute_properties:
            for claim in item.claims.get(prop, []):
                if claim.getTarget():
                    to_remove.append(claim)
                    json = claim.toJSON()
                    # drop the claim id so it is added as a new claim
                    json.pop('id')
                    to_add.append(json)
        items = [self.create_item(item, data, relation, to_add)
                 for data in labels]
        if self.relation_map.get(relation):
            # link the two new items to each other (zip with reversed
            # pairs each item with the other one)
            for it, target in zip(items, reversed(items)):
                claim = pywikibot.Claim(self.repo,
                                        self.relation_map[relation])
                claim.setTarget(target)
                self.user_add_claim(it, claim)
        for it in items:
            claim = pywikibot.Claim(self.repo, 'P527')
            claim.setTarget(it)
            self.user_add_claim(item, claim)
        for claim in to_remove:
            pywikibot.output('Removing %s --> %s' % (
                claim.id, claim.getTarget()))
            json = claim.toJSON()
            # the API removes a claim when the JSON carries 'remove'
            json['remove'] = ''
            self.user_edit_entity(
                item, {'claims': [json]},
                summary='moved [[Property:%s]] to %s' % (
                    claim.id, ' & '.join(map(methodcaller(
                        'title', as_link=True, insite=self.repo), items))))

    def create_item(self, item, labels, relation, to_add):
        """Create one person item with the given labels and moved claims."""
        pywikibot.output('Creating item (relation "%s")...' % relation)
        new_item = pywikibot.ItemPage(self.repo)
        data = {'labels': labels}
        # synchronous so the new item has an id for the claims below
        self.user_edit_entity(
            new_item, data,
            summary='based on data in %s' % item.title(
                as_link=True, insite=self.repo),
            asynchronous=False)
        claim = pywikibot.Claim(self.repo, 'P31')
        claim.setTarget(pywikibot.ItemPage(self.repo, 'Q5'))
        self.user_add_claim(new_item, claim)
        claim = pywikibot.Claim(self.repo, 'P361')
        claim.setTarget(item)
        self.user_add_claim(new_item, claim)
        for json in to_add:
            # only instantiated to produce a readable log line
            temp_claim = pywikibot.Claim.fromJSON(self.repo, json)
            pywikibot.output('Adding %s --> %s' % (
                temp_claim.id, temp_claim.getTarget()))
            self.user_edit_entity(
                new_item, {'claims': [json]},
                summary='moving [[Property:%s]] from %s' % (
                    temp_claim.id,
                    item.title(as_link=True, insite=self.repo)))
        return new_item