Exemple #1
0
    def _same_tree(cls, prop, data1, data2):
        """Tell whether targets of two claim groups are linked through *prop*.

        Two SPARQL ``ASK`` queries are sent, one per direction.  Each asks
        whether any target listed for one group reaches any target listed
        for the other group via zero or more ``wdt:<prop>`` steps.

        :param prop: property ID interpolated after ``wdt:`` in the query
        :param data1: iterable of objects exposing ``target.id``
        :param data2: iterable of objects exposing ``target.id``
        :return: True as soon as one direction is answered truthy,
            otherwise False
        :raises requests.exceptions.ConnectionError: on the third connection
            failure; the failure counter is shared by both queries
        """
        endpoint = SparqlQuery()  # fixme: dependencies
        target_id = attrgetter('target.id')
        first = ' wd:'.join(target_id(entry) for entry in data1)
        second = ' wd:'.join(target_id(entry) for entry in data2)
        template = ('ASK { VALUES ?x1 { wd:%s } . VALUES ?x2 { wd:%s } . '
                    '?x1 wdt:%s* ?x2 }')
        queries = (
            template % (first, second, prop),
            template % (second, first, prop),
        )
        attempts_left = 3
        for query in queries:
            answer = False
            while True:
                try:
                    answer = endpoint.ask(query)
                except requests.exceptions.ConnectionError:
                    attempts_left -= 1
                    if attempts_left == 0:
                        # out of attempts: let the caller see the error
                        raise
                    # short pause before asking the same query again
                    time.sleep(1)
                else:
                    break
            if answer:
                return True

        return False
    def _same_tree(cls, prop, data1, data2):
        """Check for a ``wdt:<prop>*`` path between two groups of targets.

        The IDs of all targets in *data1* and *data2* are put into SPARQL
        ``VALUES`` lists and an ``ASK`` query is run in each direction
        (group 1 to group 2, then group 2 to group 1).

        :param prop: property ID used after the ``wdt:`` prefix
        :param data1: iterable whose elements have a ``target`` with an ``id``
        :param data2: iterable whose elements have a ``target`` with an ``id``
        :return: True if either query is answered truthy, False otherwise
        :raises requests.exceptions.ConnectionError: when the connection has
            failed three times in total across both queries
        """
        sparql = SparqlQuery()  # fixme: dependencies
        ids1 = ' wd:'.join([entry.target.id for entry in data1])
        ids2 = ' wd:'.join([entry.target.id for entry in data2])
        retries_left = 3
        for subjects, objects in ((ids1, ids2), (ids2, ids1)):
            query = (
                'ASK { VALUES ?x1 { wd:%s } . VALUES ?x2 { wd:%s } . '
                '?x1 wdt:%s* ?x2 }' % (subjects, objects, prop))
            found = False
            pending = True
            while pending:
                try:
                    found = sparql.ask(query)
                except requests.exceptions.ConnectionError:
                    retries_left -= 1
                    if not retries_left:
                        raise
                    # wait a moment, then repeat the same query
                    time.sleep(1)
                else:
                    pending = False
            if found:
                return True

        return False
class ExternalIdSlicingBot(WikidataEntityBot):

    """Bot that cuts identifiers out of URLs stored in external-id claims.

    For every external-id claim whose value starts with ``http``, the
    property's P1630 value is used as a URL template (``$1`` marks the
    identifier) to extract the identifier from the URL.  When the property
    also has a usable P1793 value, it is applied as a regular expression to
    the extracted text.  Items whose values cannot be extracted are
    collected in ``self.failed`` and reported on a user subpage by
    :meth:`exit`.
    """

    # properties this bot never processes
    blacklist = {'P2013'}
    use_from_page = False

    def __init__(self, **options):
        """Register the bot options and set up caches and query helpers.

        :keyword step: number of items requested per query (default 10)
        :keyword offset: offset of the first query (default 0)
        """
        self.available_options.update({
            'step': 10,
            'offset': 0,
        })
        super().__init__(**options)
        # property ID -> (formatter, regex), see get_formatter_and_regex
        self.cache = {}
        # property ID -> set of items whose value could not be extracted
        self.failed = {}
        self.sparql = SparqlQuery(repo=self.repo)
        self.store = QueryStore()

    @property
    def generator(self):
        """Yield preloaded entities, ``step`` results per query.

        Before each batch an ASK query ('ask_externalid_props') is run with
        the current offset; iteration stops as soon as it is answered
        falsy.
        """
        step = self.opt['step']
        opts = {
            # fixme: don't use this word
            'blacklist': ' wd:'.join(self.blacklist),
            'limit': step,
        }
        offset = self.opt['offset']
        while True:
            pywikibot.output('\nLoading items (offset %i)...' % offset)
            opts['offset'] = offset
            ask = self.store.build_query('ask_externalid_props', **opts)
            if not self.sparql.ask(ask):
                break
            query = self.store.build_query('external-ids', **opts)
            gen = PreloadingEntityGenerator(
                WikidataSPARQLPageGenerator(query, site=self.repo))
            yield from gen
            offset += step

    def treat_page_and_item(self, page, item):
        """Replace URL values of the item's external-id claims by identifiers.

        :param page: not used by this method
        :param item: entity whose claims are inspected and changed
        """
        for prop, claims in item.claims.items():
            if prop in self.blacklist:
                continue
            # only the first claim's datatype is inspected
            if claims[0].type != 'external-id':
                continue
            for cl in claims:
                if not cl.target or not cl.target.startswith('http'):
                    continue
                # looked up lazily so that properties without any URL value
                # never cause their property page to be loaded
                formatter, regex = self.get_formatter_and_regex(prop)
                if not formatter:
                    pywikibot.output("%s doesn't have a formatter" % prop)
                    # no other claim of this property can be handled either
                    break
                value = self.find_value(cl.target, formatter)
                if not value:
                    pywikibot.output(
                        'Value not found in "%s" for property %s' %
                        (cl.target, prop))
                    self.failed.setdefault(prop, set()).add(item)
                    continue
                if regex:
                    try:
                        match = re.match('(%s)' % regex, value)
                    except re.error:
                        pywikibot.output('Couldn\'t apply regex "%s"' % regex)
                        # the same regex would fail for every other claim
                        break
                    if not match:
                        pywikibot.output('Value "%s" not matched by regex '
                                         '"%s"' % (value, regex))
                        self.failed.setdefault(prop, set()).add(item)
                        continue
                    # keep only the part matched at the start of the value
                    value = match.group()
                summary = 'harvested the identifier based on [[Property:P1630]]'
                if regex:
                    summary += ' and [[Property:P1793]]'
                cl.changeTarget(value, summary=summary)

    @staticmethod
    def _unambiguous_target(claims):
        """Return the target of the single usable claim in *claims*.

        A lone claim is always used.  Among several claims, the target is
        returned only if exactly one of them has preferred rank; otherwise
        None is returned.
        """
        if len(claims) > 1:
            preferred = [cl for cl in claims if cl.rank == 'preferred']
            if len(preferred) == 1:
                return preferred[0].target
            return None
        return claims[0].target

    def get_formatter_and_regex(self, prop):
        """Return the ``(formatter, regex)`` pair for a property.

        The values are read from the P1630 and P1793 claims of the property
        page and memoized in ``self.cache``.  Either element is None when
        the property has no such claim or no unambiguous one.

        :param prop: property ID
        :return: tuple ``(formatter, regex)``
        """
        if prop not in self.cache:
            formatter = regex = None
            ppage = pywikibot.PropertyPage(self.repo, prop)
            if 'P1630' in ppage.claims:
                formatter = self._unambiguous_target(ppage.claims['P1630'])

            if 'P1793' in ppage.claims:
                regex = self._unambiguous_target(ppage.claims['P1793'])

            self.cache[prop] = (formatter, regex)

        return self.cache[prop]

    def strip_init_stuff(self, string):
        """Return *string* without a leading URL scheme and ``www.``.

        :param string: URL or URL template
        :return: the text after ``http://`` / ``https://`` and after a
            following ``www.``, if present
        """
        if string.startswith(('http://', 'https://')):
            string = string.partition('//')[2]
        if string.startswith('www.'):
            string = string[4:]
        return string

    def find_value(self, url, formatter):
        """Extract the identifier from *url* using the template *formatter*.

        Scheme and ``www.`` are ignored on both sides.  The identifier is
        the text following the part of the template before the first
        ``$1``, up to the first occurrence of the template text that
        follows that ``$1`` (or up to the end of the URL when nothing
        follows it).  Trailing slashes are removed from the result.

        :param url: URL found as the claim value
        :param formatter: URL template containing ``$1``
        :return: the extracted text (possibly empty), or None when the
            template has no ``$1`` or the URL does not fit the template
        """
        url = self.strip_init_stuff(url)
        formatter = self.strip_init_stuff(formatter)
        value = pywikibot.page.url2unicode(url)
        split = formatter.split('$1')
        if len(split) < 2:
            # template without a placeholder: nothing can be extracted
            return None

        prefix, delimiter = split[0], split[1]
        if not value.startswith(prefix):
            return None

        start = len(prefix)
        if not delimiter:
            # nothing follows the first placeholder
            return value[start:].rstrip('/')

        # The delimiter is searched for in the untouched URL.  Cutting the
        # trailing template text off beforehand would also remove the
        # delimiter of single-placeholder templates, and would empty the
        # whole string when the template ends with a placeholder.
        index = value.find(delimiter, start)
        if index == -1:
            return None
        return value[start:index].rstrip('/')

    def exit(self):
        """Write the list of failed items to a user subpage, then exit.

        The report is saved to ``User:<username>/Wrong external ids`` and
        only when at least one failure was recorded.
        """
        if self.failed:
            lines = []
            for prop in sorted(self.failed):
                lines.append('* [[Property:%s]]:\n' % prop)
                lines.extend('** [[%s]]\n' % item.title()
                             for item in sorted(self.failed[prop]))
            page = pywikibot.Page(
                self.repo, 'User:%s/Wrong external ids' % self.repo.username())
            page.put(''.join(lines), summary='update')
        super().exit()
class DuosManagingBot(WikidataEntityBot):

    """Bot splitting an item about a pair of people into two new items.

    The labels of the processed item are split at a conjunction (for
    example "X and Y").  For each half a new item is created with
    P31 (instance of) -> Q5 (human) and P361 (part of) -> the original item;
    the claims of :attr:`distribute_properties` are copied to both new items
    and removed from the original one, which in turn receives
    P527 (has part) claims pointing to the new items.
    """

    # language code -> conjunction (with surrounding spaces) joining two names
    conj = {
        'af': ' en ',
        'az': ' və ',
        'bg': ' и ',
        'br': ' ha ',
        'ca': ' i ',
        'cs': ' a ',
        'cy': ' a ',
        'da': ' og ',
        'de': ' und ',
        'el': ' και ',
        'en': ' and ',
        'en-gb': ' and ',
        'eo': ' kaj ',
        'es': ' y ',
        'et': ' ja ',
        'eu': ' eta ',
        'fi': ' ja ',
        'fr': ' et ',
        'fy': ' en ',
        'gl': ' e ',
        'hr': ' i ',
        'hu': ' és ',
        'id': ' dan ',
        'it': ' e ',
        'ka': ' და ',
        'la': ' et ',
        'lt': ' ir ',
        'lv': ' un ',
        'ms': ' dan ',
        'nb': ' og ',
        'nl': ' en ',
        'nn': ' og ',
        'oc': ' e ',
        'pl': ' i ',
        'pt': ' e ',
        'ro': ' și ',
        'ru': ' и ',
        'sk': ' a ',
        'sl': ' in ',
        'sr': ' и ',
        'sv': ' och ',
        'tr': ' ve ',
        'uk': ' і ',
        'vi': ' và ',
        'war': ' ngan ',
    }
    # properties whose claims are moved from the original item to both
    # newly created items
    distribute_properties = [
        'P21', 'P22', 'P25', 'P27', 'P40', 'P53', 'P106', 'P1412',
    ]
    # (class item, relation name); checked in this order, first hit wins
    class_to_relation = [
        ('Q106925878', 'father-son'),
        ('Q14756018', 'twin'),
        ('Q14073567', 'sibling'),
        ('Q3046146', 'spouse'),
        # TODO: ('Q1141470', 'comedians'), not a "relation by blood"
    ]
    # relation name -> property linking the two new items to each other
    relation_map = {
        'sibling': 'P3373',
        'spouse': 'P26',
        'twin': 'P3373',
        # TODO: 'partner': 'P451',
        #'father-son': '', we don't know who is who
        #'comedians': 'P1327',
    }
    use_from_page = False

    def __init__(self, generator, **kwargs):
        """Register the bot options and set up the item generator.

        :param generator: iterable of items to process; when falsy, the
            result of :meth:`custom_generator` is used instead
        :keyword always: default True
        :keyword class: class item passed to the 'duos' query
            (default 'Q10648343')
        :keyword min_labels: minimum number of split labels needed to
            process an item (default 1)
        """
        self.available_options.update({
            'always': True,
            'class': 'Q10648343',
            'min_labels': 1,
        })
        super().__init__(**kwargs)
        self.store = QueryStore()
        self.sparql = SparqlQuery(repo=self.repo)
        self._generator = generator or self.custom_generator()

    def skip_page(self, item):
        """Skip items without P31 or already having P527.

        :param item: item to check
        :return: True if the item is to be skipped
        """
        if super().skip_page(item):
            return True
        if 'P31' not in item.claims:
            pywikibot.output('%s is missing P31 property' % item)
            return True
        if 'P527' in item.claims:
            pywikibot.output('%s already has P527 property' % item)
            return True
        return False

    def custom_generator(self):
        """Return a SPARQL generator built from the stored 'duos' query."""
        kwargs = {'class': self.opt['class']}
        query = self.store.build_query('duos', **kwargs)
        return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo)

    @property
    def generator(self):
        """Return the configured generator wrapped in an entity preloader."""
        return pagegenerators.PreloadingEntityGenerator(self._generator)

    def get_relation(self, item):
        """Return the name of the relation between the members of *item*.

        For each entry of :attr:`class_to_relation`, an ASK query checks
        whether the item's P31 value is the class or reaches it through
        P279 steps.

        :param item: item to classify
        :return: the relation name of the first matching class, or None
        """
        ask_pattern = 'ASK { wd:%s wdt:P31/wdt:P279* wd:%%s }' % item.id
        for key, rel in self.class_to_relation:
            if self.sparql.ask(ask_pattern % key):
                return rel
        return None

    def get_labels(self, item, relation):
        """Split the labels of *item* into labels for its two members.

        Only languages listed in :attr:`conj` are considered.  A label is
        split at the language's conjunction or, failing that, at ' & '.
        Text from the first ' (' on is ignored and labels containing ', '
        are skipped.

        :param item: item whose labels are split
        :param relation: relation name or None; when truthy and the second
            name has exactly one word more than the first, the last word of
            the second name is appended to the first one
        :return: list of two dicts mapping language codes to labels, for
            the first and the second member respectively
        """
        labels = [{}, {}]
        for lang in item.labels.keys() & self.conj.keys():
            # does not depend on the conjunction, so computed once per language
            label = item.labels[lang].partition(' (')[0]
            if ', ' in label:
                continue
            for conj in (self.conj[lang], ' & '):
                split = label.split(conj)
                if len(split) != 2:
                    continue
                split0 = split[0].split()
                split1 = split[1].split()
                if not split0 or not split1:
                    # one side of the conjunction holds no word at all;
                    # indexing it below would fail and an empty label is
                    # useless anyway
                    continue
                if split1[0].islower():
                    continue
                # TODO: if len(split1) > 1 and split1[0][-1] == '.':
                if len(split1) > len(split0):
                    if len(split1) > 2 and split1[-2].islower():
                        # treat a lowercase word and the word after it as a
                        # single trailing unit
                        split1[-2:] = [' '.join(split1[-2:])]
                    if len(split1) - len(split0) == 1:
                        # if items are in a relation, then
                        # they probably share their surname
                        if relation:
                            split[0] += ' %s' % split1[-1]
                            split0.append(split1[-1])
                if len(split0) > 1 or len(split1) == 1:
                    labels[0][lang] = split[0]
                    labels[1][lang] = split[1]
                    break

        return labels

    def treat_page_and_item(self, page, item):
        """Create the two member items and move the shared claims to them.

        :param page: not used by this method
        :param item: item describing the pair
        """
        relation = self.get_relation(item)
        labels = self.get_labels(item, relation)
        count = max(map(len, labels))
        if count == 0:
            pywikibot.output('No labels, skipping...')
            return

        if count < self.opt['min_labels']:
            pywikibot.output('Too few labels (%i), skipping...' % count)
            return

        to_add = []
        to_remove = []
        for prop in self.distribute_properties:
            for claim in item.claims.get(prop, []):
                if claim.getTarget():
                    to_remove.append(claim)
                    data = claim.toJSON()
                    # without the ID the serialized claim is saved as a
                    # new claim on the target item
                    data.pop('id')
                    to_add.append(data)

        items = [self.create_item(item, data, relation, to_add)
                 for data in labels]
        if self.relation_map.get(relation):
            # link each new item to the other one
            for it, target in zip(items, reversed(items)):
                claim = pywikibot.Claim(self.repo, self.relation_map[relation])
                claim.setTarget(target)
                self.user_add_claim(it, claim)

        for it in items:
            claim = pywikibot.Claim(self.repo, 'P527')
            claim.setTarget(it)
            self.user_add_claim(item, claim)

        for claim in to_remove:
            pywikibot.output('Removing %s --> %s' % (
                claim.id, claim.getTarget()))
            data = claim.toJSON()
            # marks the claim for removal in the edit
            data['remove'] = ''
            summary = 'moved [[Property:{}]] to {} & {}'.format(
                claim.id,
                items[0].title(as_link=True, insite=self.repo),
                items[1].title(as_link=True, insite=self.repo)
            )
            self.user_edit_entity(item, {'claims': [data]}, summary=summary)

    def create_item(self, item, labels, relation, to_add):
        """Create a new item for one member of the pair.

        The new item gets the given labels, P31 -> Q5, P361 -> *item* and
        one claim per entry of *to_add*, each saved in a separate edit.

        :param item: original item describing the pair
        :param labels: dict mapping language codes to labels
        :param relation: relation name, only used in the log message
        :param to_add: list of serialized claims to add to the new item
        :return: the new item
        """
        pywikibot.output('Creating item (relation "%s")...' % relation)
        new_item = pywikibot.ItemPage(self.repo)
        self.user_edit_entity(
            new_item,
            {'labels': labels},
            asynchronous=False,
            summary='based on data in %s' % item.title(
                as_link=True, insite=self.repo))

        claim = pywikibot.Claim(self.repo, 'P31')
        claim.setTarget(pywikibot.ItemPage(self.repo, 'Q5'))
        self.user_add_claim(new_item, claim)
        claim = pywikibot.Claim(self.repo, 'P361')
        claim.setTarget(item)
        self.user_add_claim(new_item, claim)
        for claim_data in to_add:
            # parsed only to obtain readable values for the messages
            temp_claim = pywikibot.Claim.fromJSON(self.repo, claim_data)
            pywikibot.output('Adding %s --> %s' % (
                temp_claim.id, temp_claim.getTarget()))
            self.user_edit_entity(
                new_item, {'claims': [claim_data]},
                summary='moving [[Property:%s]] from %s' % (
                    temp_claim.id,
                    item.title(as_link=True, insite=self.repo)))
        return new_item
class DuosManagingBot(WikidataEntityBot):

    """Bot splitting an item about a pair of people into two new items.

    The labels of the processed item are split at a conjunction (for
    example "X and Y").  For each half a new item is created with
    P31 (instance of) -> Q5 (human) and P361 (part of) -> the original item;
    the claims of :attr:`distribute_properties` are copied to both new items
    and removed from the original one, which in turn receives
    P527 (has part) claims pointing to the new items.
    """

    # language code -> conjunction (with surrounding spaces) joining two names
    conj = {
        'af': ' en ',
        'az': ' və ',
        'bg': ' и ',
        'br': ' ha ',
        'ca': ' i ',
        'cs': ' a ',
        'cy': ' a ',
        'da': ' og ',
        'de': ' und ',
        'el': ' και ',
        'en': ' and ',
        'en-gb': ' and ',
        'eo': ' kaj ',
        'es': ' y ',
        'et': ' ja ',
        'eu': ' eta ',
        'fi': ' ja ',
        'fr': ' et ',
        'fy': ' en ',
        'gl': ' e ',
        'hr': ' i ',
        'hu': ' és ',
        'id': ' dan ',
        'it': ' e ',
        'ka': ' და ',
        'la': ' et ',
        'lt': ' ir ',
        'lv': ' un ',
        'ms': ' dan ',
        'nb': ' og ',
        'nl': ' en ',
        'nn': ' og ',
        'oc': ' e ',
        'pl': ' i ',
        'pt': ' e ',
        'ro': ' și ',
        'ru': ' и ',
        'sk': ' a ',
        'sl': ' in ',
        'sr': ' и ',
        'sv': ' och ',
        'tr': ' ve ',
        'uk': ' і ',
        'vi': ' và ',
        'war': ' ngan ',
    }
    # properties whose claims are moved from the original item to both
    # newly created items (a set, so the processing order is arbitrary)
    distribute_properties = {
        'P21', 'P22', 'P25', 'P27', 'P40', 'P53', 'P106', 'P1412',
    }
    # (class item, relation name); checked in this order, first hit wins
    class_to_relation = [
        ('Q14756018', 'twin'),
        ('Q14073567', 'sibling'),
        ('Q3046146', 'spouse'),
    ]
    # relation name -> property linking the two new items to each other
    relation_map = {
        #'partner': 'P451', todo
        'sibling': 'P3373',
        'spouse': 'P26',
        'twin': 'P3373',
    }
    use_from_page = False

    def __init__(self, generator, **kwargs):
        """Register the bot options and set up the item generator.

        :param generator: iterable of items to process; when falsy, the
            result of :meth:`custom_generator` is used instead
        :keyword always: default True
        :keyword class: class item passed to the 'duos' query
            (default 'Q15618652')
        :keyword min_labels: minimum number of split labels needed to
            process an item (default 1)
        """
        self.availableOptions.update({
            'always': True,
            'class': 'Q15618652',
            'min_labels': 1,
        })
        super(DuosManagingBot, self).__init__(**kwargs)
        self.store = QueryStore()
        self.sparql = SparqlQuery(repo=self.repo)
        self._generator = generator or self.custom_generator()

    def skip_page(self, item):
        """Skip items without P31 or already having P527.

        :param item: item to check
        :return: True if the item is to be skipped
        """
        if super(DuosManagingBot, self).skip_page(item):
            return True
        if 'P31' not in item.claims:
            pywikibot.output('%s is missing P31 property' % item)
            return True
        if 'P527' in item.claims:
            pywikibot.output('%s already has P527 property' % item)
            return True
        return False

    def custom_generator(self):
        """Return a SPARQL generator built from the stored 'duos' query."""
        kwargs = {'class': self.getOption('class')}
        query = self.store.build_query('duos', **kwargs)
        return pagegenerators.WikidataSPARQLPageGenerator(query, site=self.repo)

    @property
    def generator(self):
        """Return the configured generator wrapped in an entity preloader."""
        return pagegenerators.PreloadingEntityGenerator(self._generator)

    def get_relation(self, item):
        """Return the name of the relation between the members of *item*.

        For each entry of :attr:`class_to_relation`, an ASK query checks
        whether the item's P31 value is the class or reaches it through
        P279 steps.

        :param item: item to classify
        :return: the relation name of the first matching class, or None
        """
        ask_pattern = 'ASK { wd:%s wdt:P31/wdt:P279* wd:%%s }' % item.id
        for key, rel in self.class_to_relation:
            if self.sparql.ask(ask_pattern % key):
                return rel
        return None

    def get_labels(self, item, relation):
        """Split the labels of *item* into labels for its two members.

        Only languages listed in :attr:`conj` are considered.  A label is
        split at the language's conjunction or, failing that, at ' & '.
        Text from the first ' (' on is ignored and labels containing ', '
        are skipped.

        :param item: item whose labels are split
        :param relation: relation name or None; when truthy and the second
            name has exactly one word more than the first, the last word of
            the second name is appended to the first one
        :return: list of two dicts mapping language codes to labels, for
            the first and the second member respectively
        """
        labels = [{}, {}]
        for lang in set(item.labels.keys()) & set(self.conj.keys()):
            # does not depend on the conjunction, so computed once per language
            label = item.labels[lang].partition(' (')[0]
            if ', ' in label:
                continue
            for conj in (self.conj[lang], ' & '):
                split = label.split(conj)
                if len(split) != 2:
                    continue
                split0 = split[0].split()
                split1 = split[1].split()
                if not split0 or not split1:
                    # one side of the conjunction holds no word at all;
                    # indexing it below would fail and an empty label is
                    # useless anyway
                    continue
                if split1[0].islower():
                    continue
                if len(split1) > len(split0):
                    if len(split1) > 2 and split1[-2].islower():
                        # treat a lowercase word and the word after it as a
                        # single trailing unit
                        split1[-2:] = [' '.join(split1[-2:])]
                    if len(split1) - len(split0) == 1:
                        # if items are in a relation, then they probably share
                        # their surname
                        if relation:
                            split[0] += ' %s' % split1[-1]
                            split0.append(split1[-1])
                if len(split0) > 1 or len(split1) == 1:
                    labels[0][lang] = split[0]
                    labels[1][lang] = split[1]
                    break

        return labels

    def treat_page_and_item(self, page, item):
        """Create the two member items and move the shared claims to them.

        :param page: not used by this method
        :param item: item describing the pair
        """
        relation = self.get_relation(item)
        labels = self.get_labels(item, relation)
        count = max(map(len, labels))
        if count == 0:
            pywikibot.output('No labels, skipping...')
            return

        if count < self.getOption('min_labels'):
            pywikibot.output('Too few labels (%i), skipping...' % count)
            return

        to_add = []
        to_remove = []
        for prop in self.distribute_properties:
            for claim in item.claims.get(prop, []):
                if claim.getTarget():
                    to_remove.append(claim)
                    data = claim.toJSON()
                    # without the ID the serialized claim is saved as a
                    # new claim on the target item
                    data.pop('id')
                    to_add.append(data)

        items = [self.create_item(item, data, relation, to_add)
                 for data in labels]
        if self.relation_map.get(relation):
            # link each new item to the other one
            for it, target in zip(items, reversed(items)):
                claim = pywikibot.Claim(self.repo, self.relation_map[relation])
                claim.setTarget(target)
                self.user_add_claim(it, claim)

        for it in items:
            claim = pywikibot.Claim(self.repo, 'P527')
            claim.setTarget(it)
            self.user_add_claim(item, claim)

        for claim in to_remove:
            pywikibot.output('Removing %s --> %s' % (
                claim.id, claim.getTarget()))
            data = claim.toJSON()
            # marks the claim for removal in the edit
            data['remove'] = ''
            links = ' & '.join(map(methodcaller(
                'title', as_link=True, insite=self.repo), items))
            self.user_edit_entity(
                item, {'claims': [data]},
                summary='moved [[Property:%s]] to %s' % (claim.id, links))

    def create_item(self, item, labels, relation, to_add):
        """Create a new item for one member of the pair.

        The new item gets the given labels, P31 -> Q5, P361 -> *item* and
        one claim per entry of *to_add*, each saved in a separate edit.

        :param item: original item describing the pair
        :param labels: dict mapping language codes to labels
        :param relation: relation name, only used in the log message
        :param to_add: list of serialized claims to add to the new item
        :return: the new item
        """
        pywikibot.output('Creating item (relation "%s")...' % relation)
        new_item = pywikibot.ItemPage(self.repo)
        data = {'labels': labels}
        self.user_edit_entity(
            new_item, data, summary='based on data in %s' % item.title(
                as_link=True, insite=self.repo), asynchronous=False)

        claim = pywikibot.Claim(self.repo, 'P31')
        claim.setTarget(pywikibot.ItemPage(self.repo, 'Q5'))
        self.user_add_claim(new_item, claim)
        # NOTE: an additional P31 -> Q159979 claim for the 'twin' relation
        # used to be added here; that code was disabled.

        claim = pywikibot.Claim(self.repo, 'P361')
        claim.setTarget(item)
        self.user_add_claim(new_item, claim)
        for claim_data in to_add:
            # parsed only to obtain readable values for the messages
            temp_claim = pywikibot.Claim.fromJSON(self.repo, claim_data)
            pywikibot.output('Adding %s --> %s' % (
                temp_claim.id, temp_claim.getTarget()))
            self.user_edit_entity(
                new_item, {'claims': [claim_data]},
                summary='moving [[Property:%s]] from %s' % (
                    temp_claim.id,
                    item.title(as_link=True, insite=self.repo)))
        return new_item