# -*- coding: utf-8 -*-
#import pwb #only needed if you haven't installed the framework as side-package
import pywikibot
import sys
import os
import re
import json
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(PROJECT_DIR, 'pywikibot'))
import pywikibot.catlib as catlib
import dateutil.parser as parser  # only needed by the disabled date handling below


class HarvestRobot:
    """
    A bot to add Wikidata claims harvested from template fields.
    """

    def __init__(self, site, templateTitle, fields, valuesConstraints):
        """
        Arguments:
        * site - The wiki site to harvest from
        * templateTitle - The template to work on
        * fields - A dictionary mapping template fields to properties
        * valuesConstraints - A dictionary mapping properties to the set of
          item IDs accepted as their values
        """
        self.templateTitle = templateTitle.replace(u'_', u' ')
        self.fields = fields
        self.valuesConstraints = valuesConstraints
        self.site = site
        self.repo = self.site.data_repository()
        self.setSource(site.language())
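
    # A hypothetical instantiation, using values defined at the bottom of this
    # file:
    #   site = pywikibot.Site('en', 'wikipedia')
    #   bot = HarvestRobot(site, u'Infobox election',
    #                      {u'country': '[[P17]]'}, {'[[P17]]': set(['Q30'])})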

    def setSource(self, lang):
        """
        Set the claim used to source this bot's edits, based on the language
        of the wiki being harvested.
        """
        self.source = None
        page = pywikibot.Page(self.repo, 'Wikidata:List of wikis/python')
        source_values = json.loads(page.get())
        source_values = source_values['wikipedia']
        for langue in source_values:
            source_values[langue] = pywikibot.ItemPage(self.repo, source_values[langue])
        if lang in source_values:
            self.source = pywikibot.Claim(self.repo, 'p143')  # P143: imported from
            self.source.setTarget(source_values.get(lang))
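
    # 'Wikidata:List of wikis/python' is expected to contain JSON of the
    # (assumed) shape {"wikipedia": {"en": "Q328", "fr": "Q8447", ...}},
    # mapping each language code to the item of the matching Wikipedia edition.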

    def processPage(self, site, page):
        """
        Process a single page: extract the template's fields and add the
        corresponding claims to the page's Wikidata item.
        """
        item = pywikibot.ItemPage.fromPage(page)
        pywikibot.output('Processing %s' % page)
        if not item.exists():
            pywikibot.output('%s doesn\'t have a Wikidata item' % page)
            # TODO FIXME: We should provide an option to create the item
        else:
            pagetext = page.get()
            pagetext = pywikibot.removeDisabledParts(pagetext)
            templates = pywikibot.extract_templates_and_params(pagetext)
            for (template, fielddict) in templates:
                # Resolve template redirects so titles compare reliably
                linkedTemplate = pywikibot.Page(self.site, template, ns=10)
                if linkedTemplate.isRedirectPage():
                    template2 = linkedTemplate.getRedirectTarget().title()
                    pywikibot.output(
                        u'Template redirection from %s to %s'
                        % (template, template2))
                    template = template2.split(':', 1)[1]  # strip the namespace prefix
                if template.replace(u'_', u' ') == self.templateTitle:
                    # We found the template we were looking for
                    for field, value in fielddict.items():
                        if (value != "") and (field in self.fields):
                            # This field contains something useful for us.
                            # Check if the property isn't already set
                            claim = self.fields[field]
                            if claim[2:-2] in item.get().get('claims'):
                                pywikibot.output(
                                    u'A claim for %s already exists. Skipping'
                                    % (claim,))
                                # TODO FIXME: This is a very crude way of dupe
                                # checking
                            else:
                                # Try to extract a valid page title from a
                                # wikilink, ignoring any piped display text
                                match = re.search(
                                    r'\[\[(?P<title>[^\]|[#<>{}]*)(\|.*?)?\]\]',
                                    value)
                                pywikibot.output(u' looking up %s ' % value)
                                try:
                                    # Date handling is disabled for the moment:
                                    # if claim == "[[P585]]":
                                    #     try:
                                    #         theClaim = pywikibot.Claim(self.repo, claim[2:-2])
                                    #         theClaim.setTarget(parser.parse(value))
                                    #         item.addClaim(theClaim)
                                    #     except ValueError:
                                    #         pywikibot.output(u'Impossible to parse this date: %s' % value)
                                    #     continue
                                    if match:
                                        link = match.group('title')
                                    else:
                                        link = value
                                    linkedPage = pywikibot.Page(self.site, link)
                                    if linkedPage.isRedirectPage():
                                        linkedPage = linkedPage.getRedirectTarget()
                                    pywikibot.output(u' linkedPage %s ' % linkedPage)
                                    linkedItem = pywikibot.ItemPage.fromPage(linkedPage)
                                    if not linkedItem.exists():
                                        pywikibot.output('%s doesn\'t have a Wikidata item' % linkedPage)
                                        continue
                                    linkedItem.get()
                                    # Value constraints treatment
                                    if (claim in self.valuesConstraints) and (linkedItem.getID() not in self.valuesConstraints[claim]):
                                        pywikibot.output(
                                            u'The value %s of the property %s does not respect the constraint %s'
                                            % (linkedItem.title(),
                                               claim,
                                               self.valuesConstraints[claim]))
                                        continue
                                    pywikibot.output('Adding %s --> %s'
                                                     % (claim, linkedItem.getID()))
                                    theClaim = pywikibot.Claim(self.repo, claim[2:-2])
                                    theClaim.setTarget(linkedItem)
                                    item.addClaim(theClaim)
                                    if self.source:
                                        theClaim.addSource(self.source, bot=True)
                                except pywikibot.NoPage:
                                    pywikibot.output(
                                        "[[%s]] doesn't exist so I can't link to it"
                                        % linkedPage.title())
                                except pywikibot.exceptions.InvalidTitle:
                                    pywikibot.output(
                                        "[[%s]] is an invalid title"
                                        % link)


def processCategory(wikipediaSource, template, categoryname, fields, valuesConstraints):
    """Harvest every article in the given category (recursively)."""
    site = pywikibot.Site(wikipediaSource, 'wikipedia')
    hr = HarvestRobot(site, template, fields, valuesConstraints)
    category = catlib.Category(site, "%s:%s" % (site.namespace(14), categoryname))
    for page in category.articles(recurse=True, startFrom=None):
        hr.processPage(site, page)


def processOnePage(wikipediaSource, template, laPage, fields, valuesConstraints):
    """Harvest a single page."""
    site = pywikibot.Site(wikipediaSource, 'wikipedia')
    hr = HarvestRobot(site, template, fields, valuesConstraints)
    p = pywikibot.Page(site, laPage)
    hr.processPage(site, p)

templateName = dict()
templateName["en"] = u"Infobox election"
templateName["es"] = u"Ficha de elección"
fields = dict()
valuesConstraints = dict()
fields[u"country"] = "[[P17]]"
fields[u"país"] = "[[P17]]"
valuesConstraints["[[P17]]"] = set(["Q889", "Q222", "Q262", "Q228", "Q916", "Q781", "Q414", "Q399", "Q408", "Q40", "Q227", "Q778", "Q398", "Q902", "Q244", "Q184", "Q31", "Q242", "Q962", "Q917", "Q750", "Q225", "Q963", "Q155", "Q921", "Q219", "Q965", "Q967", "Q424", "Q1009", "Q16", "Q1011", "Q929", "Q657", "Q298", "Q148", "Q739", "Q970", "Q800", "Q1008", "Q224", "Q241", "Q229", "Q213", "Q33946", "Q974", "Q756617", "Q977", "Q784", "Q786", "Q574", "Q736", "Q79", "Q792", "Q983", "Q986", "Q191", "Q115", "Q702", "Q712", "Q33", "Q142", "Q1000", "Q1005", "Q230", "Q183", "Q117", "Q41", "Q769", "Q774", "Q1006", "Q1007", "Q734", "Q790", "Q783", "Q28", "Q189", "Q668", "Q252", "Q794", "Q796", "Q27", "Q801", "Q38", "Q766", "Q17", "Q810", "Q232", "Q114", "Q710", "Q817", "Q813", "Q819", "Q211", "Q822", "Q1013", "Q1014", "Q1016", "Q347", "Q37", "Q32", "Q221", "Q1019", "Q1020", "Q833", "Q826", "Q912", "Q233", "Q709", "Q1025", "Q1027", "Q96", "Q217", "Q711", "Q236", "Q235", "Q1028", "Q1029", "Q836", "Q1030", "Q697", "Q837",
"Q29999", "Q664", "Q811", "Q1032", "Q1033", "Q423", "Q20", "Q842", "Q843", "Q695", "Q804", "Q691", "Q733", "Q419", "Q928", "Q36", "Q45", "Q846", "Q971", "Q218", "Q159", "Q1037", "Q763", "Q760", "Q757", "Q683", "Q238", "Q1039", "Q851", "Q1041", "Q403", "Q1042", "Q1044", "Q334", "Q214", "Q215", "Q685", "Q1045", "Q258", "Q884", "Q958", "Q29", "Q854", "Q1049", "Q730", "Q1050", "Q34", "Q39", "Q858", "Q863", "Q924", "Q869", "Q945", "Q678", "Q754", "Q948", "Q43", "Q874", "Q672", "Q1036", "Q212", "Q878", "Q145", "Q30", "Q77", "Q265", "Q686", "Q237", "Q717", "Q881", "Q180573", "Q805", "Q36704", "Q953", "Q954", "Q28513", "Q2895", "Q618399", "Q179876", "Q5291089", "Q70972", "Q7735661", "Q41304", "Q7318", "Q713750", "Q16957", "Q5555325", "Q161885", "Q5622720", "Q156418", "Q223936", "Q172579", "Q6392538", "Q6741448", "Q7603765", "Q14759030", "Q12560", "Q2006542", "Q211274", "Q27306", "Q34266", "Q2184", "Q230791", "Q14920623", "Q193619", "Q15180", "Q170588", "Q7842409", "Q7877575",
"Q174193", "Q191077", "Q723118", "Q43287", "Q12548", "Q1747689", "Q844930", "Q2429397", "Q23334", "Q865", "Q1246", "Q407199", "Q6250", "Q55", "Q21203", "Q25279", "Q26273", "Q35", "Q223", "Q4628", "Q34020", "Q33788"])
#fields["previous_election"] = "[[P155]]"
#fields["next_election"] = "[[P156]]"
#fields["type"] = "[[P173]]"
valuesConstraints["[[P173]]"] = set(["Q858439", "Q1076105", "Q152450", "Q669262", "Q1128324"])
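# The two sets above whitelist the items accepted as values of P17 (country)
# and of P173 (used here for the election type, per the commented-out "type"
# field above); any other value found in the infobox is skipped by the
# constraint check in processPage.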
fields["after_election"] = "[[P991]]"
fields["sucesor"] = "[[P991]]"
#fields["election_date"] = "[[P585]]"  # No date handling for the moment
if len(sys.argv) > 3 and sys.argv[3] == "-p":
    processOnePage(sys.argv[2], templateName[sys.argv[2]], sys.argv[1].decode("utf8"), fields, valuesConstraints)
else:
    #processCategory(u"en", u"Infobox election", u"Presidential elections in Romania", fields, valuesConstraints)
    processCategory(sys.argv[2], templateName[sys.argv[2]], sys.argv[1].decode("utf8"), fields, valuesConstraints)
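
# Usage sketch (Python 2, since sys.argv values are decoded with .decode("utf8")):
#   python election.py <category name> <language code>       harvest a whole category
#   python election.py <page title> <language code> -p       harvest a single page
# e.g.
#   python election.py "Presidential elections in Romania" en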