-
Notifications
You must be signed in to change notification settings - Fork 4
/
hewikiReplacebot.py
274 lines (233 loc) · 12 KB
/
hewikiReplacebot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
"""
ReplaceRobotHe is extension of ReplaceRobot.
It is used in Hebrew Wikipedia for doing common replacements according to definitions in a wiki page
These command line parameters can be used to specify which pages to work on:
&params;
-xml Retrieve information from a local XML dump (pages-articles
or pages-meta-current, see http://download.wikimedia.org).
Argument can also be given as "-xml:filename".
-summary:XYZ Set the summary message text for the edit to XYZ, bypassing
the predefined message texts with original and replacements
inserted.
-xmlstart (Only works with -xml) Skip all articles in the XML dump
before the one specified (may also be given as
-xmlstart:Article).
-titlecheck A page name on which to list all pages whose titles violate replacement rules.
The bot will avoid replacements which change titles of existing pages.
"""
#
# (C) Eran Roz
# Distributed under the terms of the MIT license.
#
import re
from collections import OrderedDict
import pywikibot
from pywikibot import i18n
import pywikibot.pagegenerators
try:
import replace
except ImportError:
# usually both scripts directory and pywikibot core should be in PYTHONPATH but if not
import os
import sys
sys.path.append(os.path.abspath(os.path.join(pywikibot.__file__, os.pardir, os.pardir, 'scripts')))
import scripts.replace as replace
import replaceConfig
# Compiled from replaceConfig.nobotRgx. get_replacements() runs findall() with it over
# the page text and skips every replacement whose ID appears among the matches.
NO_BOT_REGEX = re.compile(replaceConfig.nobotRgx)
class XmlDumpReplacePageGeneratorHe(replace.XmlDumpReplacePageGenerator):
    """XML dump page generator that re-selects the active replacements for every page text."""

    def __init__(self, replace_dict, xml_filename, xml_start, exceptions, site):
        # Keep the full ID -> replacement mapping so it can be filtered per page later.
        self.replace_dict = replace_dict
        super(XmlDumpReplacePageGeneratorHe, self).__init__(
            xml_filename, xml_start, replace_dict.values(), exceptions, site)

    def isTextExcepted(self, text):
        """
        Refresh ``self.replacements`` for ``text`` and then run the inherited check.

        This is a hack: the replacements are swapped HERE on the assumption that
        this check is called before the replacements are applied to each page.
        """
        enabled = get_replacements(self.replace_dict, text)
        self.replacements = list(enabled)
        return replace.XmlDumpReplacePageGenerator.isTextExcepted(self, text)
class HeWikiReplacement(replace.Replacement):
    """Replacement that is always constructed with ``use_regex=True``."""

    def __init__(self, old, new, exceptions=None):
        replace.Replacement.__init__(self, old, new, use_regex=True, exceptions=exceptions)
class ReplaceRobotHe(replace.ReplaceRobot):
    """Robot for common replacements in Hebrew Wikipedia according to a known replace page."""

    def __init__(self, gen, replace_dict, exceptions, edit_summary):
        self.replaceDict = replace_dict  # replacement dictionary (ID -> replacement)
        self.summaryPrefix = edit_summary
        super(ReplaceRobotHe, self).__init__(gen, replace_dict.values(), exceptions, always=True)

    # Overrides the regular replacement step: replacements disabled by a template in the
    # page text are removed first, then the inherited implementation does the work.
    # The edit summary is produced separately by generate_summary().
    def apply_replacements(self, original_text, applied, page=None):
        """
        Return the text produced by applying every enabled replacement to the given text.

        ``self.replacements`` is rebuilt from the replacement dictionary for
        ``original_text`` before delegating to the parent class.
        """
        enabled = get_replacements(self.replaceDict, original_text)
        self.replacements = list(enabled)
        return replace.ReplaceRobot.apply_replacements(self, original_text, applied, page)

    def generate_summary(self, applied_replacements):
        """Return the summary prefix followed by the stripped ``new`` texts, comma separated."""
        new_texts = []
        for applied_rep in applied_replacements:
            new_texts.append(applied_rep.new.strip())
        return self.summaryPrefix + ', '.join(new_texts)
def get_replacements(replace_dict, text):
    """
    Yield the replacements of ``replace_dict`` that are not disabled in ``text``.

    A replacement is skipped when its ID is among the matches NO_BOT_REGEX
    finds in the text.
    """
    skipped_ids = NO_BOT_REGEX.findall(text)
    for rep_id, rep in replace_dict.items():
        if rep_id in skipped_ids:
            continue
        yield rep
def fill_replacements_dict():
    """
    Build the replacement dictionary from the wiki replace page.

    The page named by replaceConfig.replacementsPage is fetched and parsed with
    a table-row pattern; every row yields an ID, an "old" regex, a "new" text
    and an optional "inside" exception.

    :return: dict mapping the replacement ID (as captured from the page, a
        string of digits) to a compiled HeWikiReplacement
    :raises Exception: when the last non-bot editor of the page is not in
        replaceConfig.whitelist_editors, or when no replacement rows are found
    """
    site = pywikibot.Site()
    page = pywikibot.Page(site, replaceConfig.replacementsPage)
    text = page.get()

    if page.lastNonBotUser() not in replaceConfig.whitelist_editors:
        raise Exception('Non authorized user edited the replace list. Please verify')

    replacement_pattern = "\\|([0-9]+)\n\\|<nowiki>(.*)</nowiki>\n\\|<nowiki>(.*)</nowiki>\n\\|(?:<nowiki>)?(.*?)(?:\n|</nowiki>)"
    replacelist = re.findall(replacement_pattern, text)

    # avoid running on empty list - this is probably due to wrong pattern
    if not replacelist:
        raise Exception('Couldnt find replacements in page %s. Expecting pattern: %s' %
                        (replaceConfig.replacementsPage, replacement_pattern))

    replace_dict = {}
    for rep_id, old, new, inside in replacelist:
        # the page writes group references as $N; python's re expects \N
        new = re.sub('\\$([0-9])', '\\\\\\1', new)
        exceptions = {'inside': [inside]} if inside != '' else None
        try:
            # compile the regex to check whether it is supported by python
            replacement = HeWikiReplacement(old, new, exceptions)
            replacement.compile(use_regex=True, flags=re.UNICODE)
        except Exception:
            # some regexes are written for c# and are ignored by this bot;
            # KeyboardInterrupt/SystemExit are deliberately NOT swallowed here
            pywikibot.output('Non supported replacement. ID: %s' % rep_id)
        else:
            replace_dict[rep_id] = replacement
    return replace_dict
def check_titles(site, report_page_name, replacements):
    """
    To avoid breaking links, add page titles that a replacement would change
    to that replacement's 'inside' exception list, and publish a report.

    All main-namespace titles are processed in groups of ``query_limit``
    titles. Each group is joined into one text and run through every
    replacement; titles that come out different are recorded.

    :param site: site where the bot will run
    :param report_page_name: name of the page that receives the list of titles added as exceptions
    :param replacements: dictionary of replacements (ID -> replacement); the
        ``exceptions`` attribute of affected replacements is updated in place
    """
    from pywikibot import textlib
    from pywikibot.tools.itertools import itergroup

    separator = ' \n '
    all_pages = site.allpages(namespace=0, filterredir=False, content=False)
    evaluation_progress = 0
    exceptions_dict = {}
    for titles_group in itergroup(all_pages, all_pages.query_limit):
        old_titles = [p.title(as_link=True, with_section=False) for p in titles_group]
        evaluation_progress += len(old_titles)
        if evaluation_progress % 20000 == 0:
            print('\r%i page titles processed' % evaluation_progress)
        old_text = separator.join(old_titles)

        for replacement_key, replacement in replacements.items():
            replacement_exceptions = replacement.exceptions or {}
            replacement_exceptions_inside = replacement_exceptions.get('inside', [])
            new_text = textlib.replaceExcept(
                old_text, replacement.old_regex, replacement.new,
                replacement_exceptions_inside,
                site=site)

            changed_titles = []
            for old_title, new_title in zip(old_titles, new_text.split(separator)):
                # the replacement must actually change the link ...
                if old_title == new_title:
                    continue
                # ... by more than the case of the first letter (that does not break a link)
                if old_title == '[[%s' % pywikibot.tools.first_upper(new_title[2:]):
                    continue
                # the change must also happen on the bare title: no special treatment for links
                bare_title = old_title[2:-2]
                padded_title = ' %s ' % bare_title
                if replacement.old_regex.sub(replacement.new, padded_title) == padded_title:
                    continue
                # a valid title is not a disambiguation page
                if pywikibot.Page(site, bare_title).isDisambig():
                    continue
                changed_titles.append(bare_title)

            if not changed_titles:
                continue

            # protect both piped links to the title and the plain title itself
            changed_titles_exceptions = [
                re.compile(r'\[\[%s\|.+?\]\]|%s' % (re.escape(title), re.escape(title)), re.U)
                for title in changed_titles]
            replacement_exceptions['inside'] = replacement_exceptions_inside + changed_titles_exceptions
            replacement.exceptions = replacement_exceptions
            exceptions_dict.setdefault(replacement_key, []).extend(changed_titles)

    # report the replacements ordered by their numeric ID
    exceptions_dict = OrderedDict(sorted((int(k), v) for k, v in exceptions_dict.items()))
    report_page = pywikibot.Page(site, report_page_name)
    exception_report = ''.join(
        '\n* %i\n%s' % (replace_key, '\n'.join('** [[%s]]' % t for t in replaced_titles))
        for replace_key, replaced_titles in exceptions_dict.items())
    report_page.put(exception_report, summary='עדכון')
def main(*args):
    """
    Parse the command line, build the exception rules and run the replace bot.

    Recognised local arguments are -summary:, -xmlstart, -xml and -titlecheck;
    every other argument is first offered to the pywikibot generator factory.

    :param args: command line arguments, forwarded to pywikibot.handle_args
    :return: False when no page generator could be created, otherwise None
    """
    pywikibot.output('Starting hewiki-replacebot')
    edit_summary = replaceConfig.defaultSummary
    xml_filename = None
    xml_start = None
    title_check_page = None
    gen = None
    gen_factory = pywikibot.pagegenerators.GeneratorFactory()
    local_args = pywikibot.handle_args(args)
    for arg in local_args:
        if gen_factory.handle_arg(arg):
            continue
        elif arg.startswith('-summary:'):
            edit_summary = arg[9:]
        # '-xmlstart' has to be tested before '-xml', which is a prefix of it
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xml_start = pywikibot.input('Please enter the dumped article to start with:')
            else:
                xml_start = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xml_filename = i18n.input('pywikibot-enter-xml-filename')
            else:
                xml_filename = arg[5:]
        elif arg.startswith('-titlecheck'):
            title_check_page = arg[12:]

    replace_dict = fill_replacements_dict()

    # Work on a copy so the list owned by replaceConfig is not modified in place.
    # NOTE(review): configured entries are kept verbatim because they may be
    # regex fragments - confirm against replaceConfig.
    safe_templates = [name for name in replaceConfig.safeTemplates if '/' not in name]

    # add external links templates
    site = pywikibot.Site()
    site.login()
    for safeCategory in replaceConfig.safeTemplatesCategories:
        cite_templates = pywikibot.Category(site, safeCategory).articles(namespaces=10, recurse=True)
        # Titles are literal text: escape them so characters such as '(' or '+'
        # neither break the pattern nor change what it matches.
        safe_templates += [re.escape(title)
                           for title in (page.title(with_ns=False) for page in cite_templates)
                           if '/' not in title]
    safe_templates = sorted(set(safe_templates))

    file_usage_rgx = re.compile(replaceConfig.fileUsageRgx, re.I)
    yi_rgx = re.compile(r'\[\[yi:.*?\]\]')
    safe_templates_rgx = re.compile(r'\{\{(' + '|'.join(safe_templates) + r').*?\}\}', re.I)
    exceptions = {
        'title': [],
        'text-contains': [re.compile(replaceConfig.redirectRgx, re.I)],
        'inside': [file_usage_rgx, safe_templates_rgx, yi_rgx],
        'inside-tags': ['nowiki', 'math', 'comment', 'pre', 'source', 'hyperlink', 'gallery', 'interwiki',
                        'templatedata', 'syntaxhighlight'],
        'require-title': [],
    }

    # avoid searching in other namespaces in the xml
    exceptions_with_title_ns = dict(exceptions)
    exceptions_with_title_ns['title'] = [re.compile('^' + re.escape(ns_name) + ':')
                                         for ns_index, ns in site.namespaces.items()
                                         if ns_index not in replaceConfig.namespaces
                                         for ns_name in ns]

    if title_check_page:
        check_titles(site, title_check_page, replace_dict)

    if xml_filename:
        gen = XmlDumpReplacePageGeneratorHe(replace_dict, xml_filename, xml_start, exceptions_with_title_ns, site)
    gen = gen_factory.getCombinedGenerator(gen)
    if not gen:
        pywikibot.output('no xml dump specified. please fill -xml and the xml file to be used, or other generator')
        pywikibot.bot.suggest_help(missing_generator=True)
        return False

    gen = pywikibot.pagegenerators.NamespaceFilterPageGenerator(gen, replaceConfig.namespaces, site)
    gen = pywikibot.pagegenerators.PreloadingGenerator(gen)
    pywikibot.output('starting replace')
    bot = ReplaceRobotHe(gen, replace_dict, exceptions, edit_summary)
    bot.run()
    pywikibot.output('finished all replacements')
# Script entry point.
if __name__ == '__main__':
    try:
        main()
    finally:
        # runs whether main() returned normally or raised
        pywikibot.stopme()