def translateAndCapitalizeNamespaces(self, text):
    """Use localized namespace names."""
    # arz uses English-style codes
    if self.site.sitename == 'wikipedia:arz':
        return text
    # wiki links aren't parsed here.
    exceptions = ['nowiki', 'comment', 'math', 'pre']

    for namespace in self.site.namespaces.values():
        if namespace == 0:
            # skip main (article) namespace
            continue
        # a clone is needed. Won't change the namespace dict
        namespaces = list(namespace)
        if namespace == 6 and self.site.family.name == 'wikipedia':
            if self.site.code in ('en', 'fr') and MediaWikiVersion(
                    self.site.version()) >= MediaWikiVersion('1.14'):
                # do not change "Image" on en-wiki and fr-wiki
                assert u'Image' in namespaces
                namespaces.remove(u'Image')
            if self.site.code == 'hu':
                # do not change "Kép" on hu-wiki
                assert u'Kép' in namespaces
                namespaces.remove(u'Kép')
            elif self.site.code == 'pt':
                # use "Imagem" by default on pt-wiki (per T57242)
                assert 'Imagem' in namespaces
                namespaces.insert(
                    0, namespaces.pop(namespaces.index('Imagem')))
        # final namespace variant
        final_ns = namespaces.pop(0)
        if namespace in (2, 3):
            # skip localized user namespace, maybe gender is used
            namespaces = ['User' if namespace == 2 else 'User talk']
        # lowerspaced and underscored namespaces
        for i, item in enumerate(namespaces):
            item = item.replace(' ', '[ _]')
            item = u'[%s%s]' % (item[0], item[0].lower()) + item[1:]
            namespaces[i] = item
        namespaces.append(first_lower(final_ns))
        if final_ns and namespaces:
            if self.site.sitename == 'wikipedia:pt' and namespace == 6:
                # only change on these file extensions (per T57242)
                extensions = ('png', 'gif', 'jpg', 'jpeg', 'svg',
                              'tiff', 'tif')
                text = textlib.replaceExcept(
                    text,
                    r'\[\[\s*({}) *:(?P<name>[^\|\]]*?\.({}))'
                    r'(?P<label>.*?)\]\]'
                    .format('|'.join(namespaces), '|'.join(extensions)),
                    r'[[{}:\g<name>\g<label>]]'.format(final_ns),
                    exceptions)
            else:
                text = textlib.replaceExcept(
                    text,
                    r'\[\[\s*(%s) *:(?P<nameAndLabel>.*?)\]\]'
                    % '|'.join(namespaces),
                    r'[[%s:\g<nameAndLabel>]]' % final_ns,
                    exceptions)
    return text
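# Illustrative sketch (not part of the original module): the kind of rewrite
# translateAndCapitalizeNamespaces performs, with the alias list hard-coded
# (hypothetical de-wiki values) instead of read from site.namespaces.
import re

aliases = ['[Bb]ild', '[Ii]mage', 'datei']
final_ns = 'Datei'
pattern = r'\[\[\s*(%s) *:(?P<nameAndLabel>.*?)\]\]' % '|'.join(aliases)
text = '[[bild:Foo.jpg|thumb]] and [[Image:Bar.png]]'
print(re.sub(pattern, r'[[%s:\g<nameAndLabel>]]' % final_ns, text))
# -> [[Datei:Foo.jpg|thumb]] and [[Datei:Bar.png]]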
def fixHtml(self, text):
    """Replace HTML markup with wikitext markup."""
    def replace_header(match):
        """Create a header string for replacing."""
        depth = int(match.group(1))
        return r'{0} {1} {0}'.format('=' * depth, match.group(2))

    # Everything case-insensitive (?i)
    # Keep in mind that MediaWiki automatically converts <br> to <br />
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
                                 r"'''\2'''", exceptions, site=self.site)
    text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
                                 r"''\2''", exceptions, site=self.site)
    # horizontal line without attributes in a single line
    text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
                                 r'\1----\2', exceptions)
    # horizontal line with attributes; can't be done with wiki syntax
    # so we only make it XHTML compliant
    text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
                                 r'<hr \1 />', exceptions)
    # a header where only spaces are in the same line
    text = textlib.replaceExcept(
        text,
        r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
        replace_header, exceptions)
    # TODO: maybe we can make the bot replace <p> tags with \r\n's.
    return text
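# Usage sketch (assumption: plain re is enough to show the effect; the real
# method routes this pattern through textlib.replaceExcept). A <h2> header
# standing alone on a line becomes a wikitext == header ==.
import re

def replace_header(match):
    depth = int(match.group(1))
    return '{0} {1} {0}'.format('=' * depth, match.group(2))

text = '\n <h2> History </h2> \n'
print(re.sub(r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
             replace_header, text))
# -> '\n== History ==\n'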
def fixSyntaxSave(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # link to the wiki working on
    # TODO: disable this for difflinks and titled links,
    # to prevent edits like this:
    # https://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&diff=103109563&oldid=103109271
    # text = textlib.replaceExcept(
    #     text,
    #     r'\[https?://%s\.%s\.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?\]'
    #     % (self.site.code, self.site.family.name),
    #     r'[[\g<link>|\g<title>]]', exceptions)
    # external link in double brackets
    text = textlib.replaceExcept(
        text,
        r'\[\[(?P<url>https?://[^\]]+?)\]\]',
        r'[\g<url>]', exceptions)
    # external link starting with double bracket
    text = textlib.replaceExcept(text,
                                 r'\[\[(?P<url>https?://.+?)\]',
                                 r'[\g<url>]', exceptions)
    # external link and description separated by a pipe, with
    # whitespace in front of the pipe, so that it is clear that
    # the pipe is not a legitimate part of the URL.
    text = textlib.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    # pipe in external link, where the correct end of the URL can
    # be detected from the file extension. It is very unlikely that
    # this will cause mistakes.
    text = textlib.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] ]+?'
        r'(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *'
        r'(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    return text
def fixSyntaxSave(self, text):
    def replace_link(match):
        replacement = '[[' + match.group('link')
        if match.group('title'):
            replacement += '|' + match.group('title')
        return replacement + ']]'

    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # link to the wiki working on
    # Only use suffixes for article paths
    for suffix in self.site._interwiki_urls(True):
        http_url = self.site.base_url(suffix, 'http')
        if self.site.protocol() == 'http':
            https_url = None
        else:
            https_url = self.site.base_url(suffix, 'https')
        # compare strings without the protocol, if they are empty support
        # also no prefix (//en.wikipedia.org/…)
        if https_url is not None and http_url[4:] == https_url[5:]:
            urls = ['(?:https?:)?' + re.escape(http_url[5:])]
        else:
            urls = [re.escape(url) for url in (http_url, https_url)
                    if url is not None]
        for url in urls:
            # Only include links which don't include the separator as
            # the wikilink won't support additional parameters
            separator = '?'
            if '?' in suffix:
                separator += '&'
            # Match first a non space in the title to prevent that multiple
            # spaces at the end without title will be matched by it
            text = textlib.replaceExcept(
                text,
                r'\[\[?' + url + r'(?P<link>[^' + separator + r']+?)'
                r'(\s+(?P<title>[^\s].*?))?\s*\]\]?',
                replace_link, exceptions, site=self.site)
    # external link in/starting with double brackets
    text = textlib.replaceExcept(
        text,
        r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
        r'[\g<url>]', exceptions, site=self.site)
    # external link and description separated by a pipe, with
    # whitespace in front of the pipe, so that it is clear that
    # the pipe is not a legitimate part of the URL.
    text = textlib.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    # pipe in external link, where the correct end of the URL can
    # be detected from the file extension. It is very unlikely that
    # this will cause mistakes.
    extensions = [r'\.{0}'.format(ext)
                  for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']]
    text = textlib.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *'
        r'\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    return text
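# Pure-regex sketch of the "pipe inside an external link" fix used by both
# fixSyntaxSave variants above; no pywikibot site is required for this part.
import re

text = '[https://example.org/page | Example]'
print(re.sub(
    r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
    r'[\g<url> \g<label>]', text))
# -> [https://example.org/page Example]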
def test_replace_template(self):
    """Test replacing not inside templates."""
    template_sample = (r'a {{templatename '
                       r' | accessdate={{Fecha|1993}} '
                       r' |atitle=The [[real title]] }}')
    self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
                                           ['template'], site=self.site),
                     'X' + template_sample[1:])

    template_sample = (r'a {{templatename '
                       r' | 1={{a}}2{{a}} '
                       r' | 2={{a}}1{{a}} }}')
    self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
                                           ['template'], site=self.site),
                     'X' + template_sample[1:])

    template_sample = (r'a {{templatename '
                       r' | 1={{{a}}}2{{{a}}} '
                       r' | 2={{{a}}}1{{{a}}} }}')
    self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
                                           ['template'], site=self.site),
                     'X' + template_sample[1:])

    # sf.net bug 1575: unclosed template
    template_sample = template_sample[:-2]
    self.assertEqual(textlib.replaceExcept(template_sample, 'a', 'X',
                                           ['template'], site=self.site),
                     'X' + template_sample[1:])
def fixHtml(self, text):
    # Everything case-insensitive (?i)
    # Keep in mind that MediaWiki automatically converts <br> to <br />
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
                                 r"'''\2'''", exceptions, site=self.site)
    text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
                                 r"''\2''", exceptions, site=self.site)
    # horizontal line without attributes in a single line
    text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
                                 r'\1----\2', exceptions)
    # horizontal line with attributes; can't be done with wiki syntax
    # so we only make it XHTML compliant
    text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
                                 r'<hr \1 />', exceptions)
    # a header where only spaces are in the same line
    for level in range(1, 7):
        equals = '\\1%s \\2 %s\\3' % ('=' * level, '=' * level)
        text = textlib.replaceExcept(
            text,
            r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'
            % (level, level),
            r'%s' % equals,
            exceptions)
    # TODO: maybe we can make the bot replace <p> tags with \r\n's.
    return text
def commonsfiledesc(self, text):
    """
    Clean up file descriptions on the Wikimedia Commons.

    It is working according to [1] and works only on pages in the file
    namespace on the Wikimedia Commons.

    [1]: https://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup
    """
    if self.site.sitename != 'commons:commons' or self.namespace != 6:
        return text
    # section headers to {{int:}} versions
    exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
                  'pre', 'source', 'ref', 'timeline']
    text = textlib.replaceExcept(text,
                                 r"([\r\n]|^)\=\= *Summary *\=\=",
                                 r"\1== {{int:filedesc}} ==",
                                 exceptions, True)
    text = textlib.replaceExcept(
        text,
        r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
        r"\1== {{int:license-header}} ==", exceptions, True)
    text = textlib.replaceExcept(
        text,
        r"([\r\n])"
        r"\=\= *(Licensing|License information|{{int:license}}) *\=\=",
        r"\1== {{int:license-header}} ==", exceptions, True)

    # frequent field values to {{int:}} versions
    text = textlib.replaceExcept(
        text,
        r'([\r\n]\|[Ss]ource *\= *)'
        r'(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *'
        r'([\r\n])',
        r'\1{{own}}\2', exceptions, True)
    text = textlib.replaceExcept(
        text,
        r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
        r'\1\2', exceptions, True)

    # added to transwikied pages
    text = textlib.replaceExcept(text, r'__NOTOC__', '', exceptions, True)

    # tracker element for js upload form
    text = textlib.replaceExcept(
        text,
        r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
        '', exceptions[1:], True)
    text = textlib.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
                                 '', exceptions, True)

    # duplicated section headers
    text = textlib.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)'
        r'\=\= *{{int:filedesc}} *\=\=',
        r'\1== {{int:filedesc}} ==', exceptions, True)
    text = textlib.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:license-header}} *\=\=(?:[\r\n ]*)'
        r'\=\= *{{int:license-header}} *\=\=',
        r'\1== {{int:license-header}} ==', exceptions, True)
    return text
def removeUselessSpaces(self, text):
    multipleSpacesR = re.compile(' +')
    spaceAtLineEndR = re.compile(' $')
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
                  'table', 'template']
    text = textlib.replaceExcept(text, multipleSpacesR, ' ', exceptions)
    text = textlib.replaceExcept(text, spaceAtLineEndR, '', exceptions)
    return text
def test_replace_source_reference(self):
    """Test replacing in text which contains back references."""
    # Don't use a valid reference number in the original string,
    # in case it tries to apply that as a reference.
    self.assertEqual(textlib.replaceExcept(r'\42', r'^(.*)$', r'X\1X',
                                           [], site=self.site),
                     r'X\42X')
    self.assertEqual(
        textlib.replaceExcept(r'\g<bar>', r'^(?P<foo>.*)$',
                              r'X\g<foo>X', [], site=self.site),
        r'X\g<bar>X')
def test_replace_exception(self):
    self.assertEqual(textlib.replaceExcept('123x123', '123', '000', [],
                                           site=self.site),
                     '000x000')
    self.assertEqual(textlib.replaceExcept('123x123', '123', '000',
                                           [re.compile(r'\w123')],
                                           site=self.site),
                     '000x123')
def test_overlapping_replace(self):
    self.assertEqual(textlib.replaceExcept('1111', '11', '21', [],
                                           allowoverlap=False,
                                           site=self.site),
                     '2121')
    self.assertEqual(textlib.replaceExcept('1111', '11', '21', [],
                                           allowoverlap=True,
                                           site=self.site),
                     '2221')
def test_replace_with_marker(self):
    self.assertEqual(textlib.replaceExcept('AxyxB', 'x', 'y', [],
                                           marker='.',
                                           site=self.site),
                     'Ayyy.B')
    self.assertEqual(textlib.replaceExcept('AxyxB', '1', 'y', [],
                                           marker='.',
                                           site=self.site),
                     'AxyxB.')
def test_replace_exception(self):
    """Test replacing not inside a specific regex."""
    self.assertEqual(textlib.replaceExcept('123x123', '123', '000', [],
                                           site=self.site),
                     '000x000')
    self.assertEqual(textlib.replaceExcept('123x123', '123', '000',
                                           [re.compile(r'\w123')],
                                           site=self.site),
                     '000x123')
def test_overlapping_replace(self):
    """Test replacing with and without overlap."""
    self.assertEqual(textlib.replaceExcept('1111', '11', '21', [],
                                           allowoverlap=False,
                                           site=self.site),
                     '2121')
    self.assertEqual(textlib.replaceExcept('1111', '11', '21', [],
                                           allowoverlap=True,
                                           site=self.site),
                     '2221')
def test_simple_replace(self):
    self.assertEqual(textlib.replaceExcept('AxB', 'x', 'y', [],
                                           site=self.site),
                     'AyB')
    self.assertEqual(textlib.replaceExcept('AxxB', 'x', 'y', [],
                                           site=self.site),
                     'AyyB')
    self.assertEqual(textlib.replaceExcept('AxyxB', 'x', 'y', [],
                                           site=self.site),
                     'AyyyB')
def test_replace_tags_interwiki(self):
    if ("es" not in self.site.family.langs
            or "ey" in self.site.family.langs):
        raise unittest.SkipTest(
            "family %s doesn't have languages" % self.site)

    self.assertEqual(
        textlib.replaceExcept("[[es:s]]", "s", "t", ["interwiki"],
                              site=self.site),
        "[[es:s]]")  # "es" is a valid interwiki code
    self.assertEqual(
        textlib.replaceExcept("[[ex:x]]", "x", "y", ["interwiki"],
                              site=self.site),
        "[[ey:y]]")  # "ex" is not a valid interwiki code
def removeUselessSpaces(self, text):
    """Cleanup multiple or trailing spaces."""
    multipleSpacesR = re.compile(' +')
    spaceAtLineEndR = re.compile(' $')
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
                  'table']
    if self.site.sitename != 'wikipedia:cs':
        exceptions.append('template')
    text = textlib.replaceExcept(text, multipleSpacesR, ' ', exceptions)
    text = textlib.replaceExcept(text, spaceAtLineEndR, '', exceptions)
    return text
def test_simple_replace(self):
    """Test replacing without regex."""
    self.assertEqual(textlib.replaceExcept('AxB', 'x', 'y', [],
                                           site=self.site),
                     'AyB')
    self.assertEqual(textlib.replaceExcept('AxxB', 'x', 'y', [],
                                           site=self.site),
                     'AyyB')
    self.assertEqual(textlib.replaceExcept('AxyxB', 'x', 'y', [],
                                           site=self.site),
                     'AyyyB')
def test_replace_tags_interwiki(self):
    if ('es' not in self.site.family.langs
            or 'ey' in self.site.family.langs):
        raise unittest.SkipTest(
            "family %s doesn't have languages" % self.site)

    self.assertEqual(textlib.replaceExcept('[[es:s]]', 's', 't',
                                           ['interwiki'], site=self.site),
                     '[[es:s]]')  # "es" is a valid interwiki code
    self.assertEqual(textlib.replaceExcept('[[ex:x]]', 'x', 'y',
                                           ['interwiki'], site=self.site),
                     '[[ey:y]]')  # "ex" is not a valid interwiki code
def fixReferences(self, text):
    """Fix references tags."""
    # See also
    # https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
    exceptions = ["nowiki", "comment", "math", "pre", "source",
                  "startspace"]

    # it should be name = " or name=" NOT name ="
    text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
    # remove empty <ref/>-tag
    text = textlib.replaceExcept(text,
                                 r"(?i)(<ref\s*/>|<ref *>\s*</ref>)",
                                 r"", exceptions)
    text = textlib.replaceExcept(text,
                                 r"(?i)<ref\s+([^>]+?)\s*>\s*</ref>",
                                 r"<ref \1/>", exceptions)
    return text
def fixReferences(self, text):
    # See also
    # https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']

    # it should be name = " or name=" NOT name ="
    text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
    # remove empty <ref/>-tag
    text = textlib.replaceExcept(text,
                                 r'(?i)(<ref\s*/>|<ref *>\s*</ref>)',
                                 r'', exceptions)
    text = textlib.replaceExcept(text,
                                 r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>',
                                 r'<ref \1/>', exceptions)
    return text
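# Sketch of the three reference cleanups above applied with plain re
# (the real method wraps the last two in textlib.replaceExcept so that
# excepted regions are skipped).
import re

text = 'A<ref name ="x"></ref>B<ref   >  </ref>C<ref name="y"> </ref>'
text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
text = re.sub(r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', '', text)
text = re.sub(r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', text)
print(text)
# -> A<ref name="x"/>BC<ref name="y"/>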
def removeUselessSpaces(self, text):
    """Cleanup multiple or trailing spaces."""
    exceptions = ["comment", "math", "nowiki", "pre", "startspace",
                  "table"]
    if self.site.sitename != "wikipedia:cs":
        exceptions.append("template")
    text = textlib.replaceExcept(text, r"(?m) +( |$)", r"\1", exceptions,
                                 site=self.site)
    return text
def translateMagicWords(self, text):
    """Use localized magic words."""
    # not wanted at ru
    # arz uses English-style codes
    if self.site.code not in ["arz", "ru"]:
        exceptions = ["nowiki", "comment", "math", "pre"]
        for magicWord in ["img_thumbnail", "img_left", "img_center",
                          "img_right", "img_none", "img_framed",
                          "img_frameless", "img_border", "img_upright"]:
            aliases = self.site.getmagicwords(magicWord)
            if not aliases:
                continue
            text = textlib.replaceExcept(
                text,
                r"\[\[(?P<left>.+?:.+?\..+?\|) *("
                + "|".join(aliases)
                + r") *(?P<right>(\|.*?)?\]\])",
                r"[[\g<left>" + aliases[0] + r"\g<right>",
                exceptions)
    return text
def putSpacesInLists(self, text):
    """
    Add a space between the * or # and the text.

    NOTE: This space is recommended in the syntax help on the English,
    German, and French Wikipedia. It might be that it is not wanted on
    other wikis. If there are any complaints, please file a bug report.
    """
    if not self.template:
        exceptions = ["comment", "math", "nowiki", "pre", "source",
                      "template", "timeline", self.site.redirectRegex()]
        text = textlib.replaceExcept(
            text,
            r"(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)",
            r"\g<bullet> \g<char>",
            exceptions)
    return text
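# Sketch of the bullet-spacing rule above: a space is inserted after the
# list-marker run, while already-spaced items are left alone.
import re

text = '*item\n#:other\n* already spaced'
print(re.sub(
    r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
    r'\g<bullet> \g<char>', text))
# -> '* item\n#: other\n* already spaced'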
def fixTypo(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace', 'gallery', 'hyperlink', 'interwiki',
                  'link']
    # change <number> ccm -> <number> cm³
    text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?ccm',
                                 r'\1&nbsp;cm³', exceptions,
                                 site=self.site)
    # Solve wrong Nº sign with °C or °F
    # additional exception requested on fr-wiki for this stuff
    pattern = re.compile(u'«.*?»', re.UNICODE)
    exceptions.append(pattern)
    text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
                                 r'\1&nbsp;°\2', exceptions,
                                 site=self.site)
    text = textlib.replaceExcept(text, u'º([CF])', u'°' + r'\1',
                                 exceptions)
    return text
def apply(self, text, page):
    exceptions = list(set(self.exceptions
                          + ['imagemap', 'includeonly', 'timeline'])
                      - {'startspace'})
    title = page.title()
    return textlib.replaceExcept(
        text,
        r"(?P<before>''')?\[\[(?P<inside>[^]]+)\]\](?P<after>''')?",
        lambda m: self.replacement(m, title),
        exceptions,
        site=page.site)
def __iter__(self):
    """Iterator method."""
    try:
        for entry in self.parser:
            if self.skipping:
                if entry.title != self.xmlStart:
                    continue
                self.skipping = False
            if (self.isTitleExcepted(entry.title)
                    or self.isTextExcepted(entry.text)):
                continue
            new_text = entry.text
            for replacement in self.replacements:
                # This doesn't do an actual replacement but just
                # checks if at least one does apply
                new_text = textlib.replaceExcept(
                    new_text, replacement.old_regex, replacement.new,
                    self.excsInside + replacement.get_inside_exceptions(),
                    site=self.site)
            if new_text != entry.text:
                yield pywikibot.Page(self.site, entry.title)
    except KeyboardInterrupt:
        try:
            if not self.skipping:
                pywikibot.output(
                    'To resume, use "-xmlstart:%s" on the command line.'
                    % entry.title)
        except NameError:
            pass
def translateMagicWords(self, text):
    """Use localized magic words."""
    # not wanted at ru
    # arz uses English-style codes
    # no need to run on English wikis
    if self.site.code not in ['arz', 'en', 'ru']:
        def replace_magicword(match):
            split = match.group().split('|')
            # push ']]' out and re-add below
            split[-1] = split[-1][:-2]
            for magicword in ['img_thumbnail', 'img_left', 'img_center',
                              'img_right', 'img_none', 'img_framed',
                              'img_frameless', 'img_border',
                              'img_upright']:
                aliases = list(self.site.getmagicwords(magicword))
                preferred = aliases.pop(0)
                if not aliases:
                    continue
                split[1:] = list(map(
                    lambda x: preferred if x.strip() in aliases else x,
                    split[1:]))
            return '|'.join(split) + ']]'

        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source']
        regex = re.compile(
            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
            flags=re.X)
        text = textlib.replaceExcept(text, regex, replace_magicword,
                                     exceptions)
    return text
def test_case_sensitive(self):
    self.assertEqual(textlib.replaceExcept('AxB', 'x', 'y', [],
                                           caseInsensitive=False,
                                           site=self.site),
                     'AyB')
    self.assertEqual(textlib.replaceExcept('AxB', 'X', 'y', [],
                                           caseInsensitive=False,
                                           site=self.site),
                     'AxB')
    self.assertEqual(textlib.replaceExcept('AxB', 'x', 'y', [],
                                           caseInsensitive=True,
                                           site=self.site),
                     'AyB')
    self.assertEqual(textlib.replaceExcept('AxB', 'X', 'y', [],
                                           caseInsensitive=True,
                                           site=self.site),
                     'AyB')
def removeUselessSpaces(self, text):
    """Cleanup multiple or trailing spaces."""
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
                  'table']
    if self.site.sitename != 'wikipedia:cs':
        exceptions.append('template')
    text = textlib.replaceExcept(text, r'(?m) +( |$)', r'\1', exceptions,
                                 site=self.site)
    return text
def removeUselessSpaces(self, text):
    """Cleanup multiple or trailing spaces."""
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
                  'startspace', 'table']
    if self.site.sitename != 'wikipedia:cs':
        exceptions.append('template')
    text = textlib.replaceExcept(text, r'(?m)[\t ]+( |$)', r'\1',
                                 exceptions, site=self.site)
    return text
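# Sketch of the whitespace pattern in the latest variant above: runs of
# blanks collapse to one space and trailing blanks are dropped per line.
import re

text = 'foo   bar  \nbaz\t\t qux '
print(re.sub(r'(?m)[\t ]+( |$)', r'\1', text))
# -> 'foo bar\nbaz qux'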
def fixTypo(self, text):
    exceptions = [
        'nowiki',
        'comment',
        'math',
        'pre',
        'source',
        'startspace',
        'gallery',
        'hyperlink',
        'interwiki',
        'link',
    ]
    # change <number> ccm -> <number> cm³
    text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?ccm',
                                 r'\1&nbsp;cm³', exceptions,
                                 site=self.site)
    # Solve wrong Nº sign with °C or °F
    # additional exception requested on fr-wiki for this stuff
    pattern = re.compile(u'«.*?»', re.UNICODE)
    exceptions.append(pattern)
    text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
                                 r'\1&nbsp;°\2', exceptions,
                                 site=self.site)
    text = textlib.replaceExcept(text, u'º([CF])', u'°' + r'\1',
                                 exceptions)
    return text
def cleanUpSectionHeaders(self, text):
    """
    Add a space between the equal signs and the section title.

    Example: ==Section title== becomes == Section title ==

    NOTE: This space is recommended in the syntax help on the English
    and German Wikipedia. It might be that it is not wanted on other
    wikis. If there are any complaints, please file a bug report.
    """
    return textlib.replaceExcept(
        text,
        r'(?m)^(={1,7}) *(?P<title>[^=]+?) *\1 *\r?\n',
        r'\1 \g<title> \1%s' % config.LS,
        ['comment', 'math', 'nowiki', 'pre'])
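# Sketch of the header-spacing rewrite above, with config.LS assumed
# to be '\n' so plain re suffices.
import re

text = '==Section title==\n'
print(re.sub(r'(?m)^(={1,7}) *(?P<title>[^=]+?) *\1 *\r?\n',
             r'\1 \g<title> \1\n', text))
# -> '== Section title ==\n'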
def fixTypo(self, text: str) -> str:
    """Fix units."""
    exceptions = [
        'comment',
        'gallery',
        'hyperlink',
        'interwiki',
        'link',
        'nowiki',
        'math',
        'pre',
        'startspace',
        'syntaxhighlight',
    ]  # type: List[Union[str, Pattern[str]]]

    # change <number> ccm -> <number> cm³
    text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?ccm',
                                 r'\1&nbsp;cm³', exceptions,
                                 site=self.site)
    # Solve wrong Nº sign with °C or °F
    # additional exception requested on fr-wiki for this stuff
    pattern = re.compile('«.*?»')
    exceptions.append(pattern)
    text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
                                 r'\1&nbsp;°\2', exceptions,
                                 site=self.site)
    text = textlib.replaceExcept(text, 'º([CF])', '°' + r'\1',
                                 exceptions, site=self.site)
    return text
def test_replace_template(self):
    """Test replacing not inside templates."""
    template_sample = (r'a {{templatename '
                       r' | accessdate={{Fecha|1993}} '
                       r' |atitle=The [[real title]] }}')
    self.assertEqual(
        textlib.replaceExcept(template_sample, 'a', 'X',
                              ['template'], site=self.site),
        'X' + template_sample[1:])

    template_sample = (r'a {{templatename '
                       r' | 1={{a}}2{{a}} '
                       r' | 2={{a}}1{{a}} }}')
    self.assertEqual(
        textlib.replaceExcept(template_sample, 'a', 'X',
                              ['template'], site=self.site),
        'X' + template_sample[1:])

    template_sample = (r'a {{templatename '
                       r' | 1={{{a}}}2{{{a}}} '
                       r' | 2={{{a}}}1{{{a}}} }}')
    self.assertEqual(
        textlib.replaceExcept(template_sample, 'a', 'X',
                              ['template'], site=self.site),
        'X' + template_sample[1:])

    # sf.net bug 1575: unclosed template
    template_sample = template_sample[:-2]
    self.assertEqual(
        textlib.replaceExcept(template_sample, 'a', 'X',
                              ['template'], site=self.site),
        'X' + template_sample[1:])
def apply(self, text, replaced=list()):
    def hook(match):
        return self.summary_hook(match, replaced)

    # time.clock() was removed in Python 3.8; perf_counter() measures
    # the same elapsed wall time here.
    start = time.perf_counter()
    text = textlib.replaceExcept(text, self.find, hook, self.exceptions,
                                 site=self.site)
    delta = time.perf_counter() - start
    self.longest = max(delta, self.longest)
    if delta > 5:
        pywikibot.warning('Slow typo rule "%s" (%f)'
                          % (self.find.pattern, delta))
    return text
def apply_replacements(self, original_text, applied, page=None):
    """
    Apply all replacements to the given text.

    @rtype: unicode, set
    """
    if page is None:
        pywikibot.warn(
            'You must pass the target page as the "page" parameter to '
            'apply_replacements().', DeprecationWarning, stacklevel=2)
    new_text = original_text
    exceptions = _get_text_exceptions(self.exceptions)
    skipped_containers = set()
    for replacement in self.replacements:
        if self.sleep is not None:
            time.sleep(self.sleep)
        if (replacement.container
                and replacement.container.name in skipped_containers):
            continue
        elif page is not None and self.isTitleExcepted(
                page.title(), replacement.exceptions):
            if replacement.container:
                pywikibot.output(
                    'Skipping fix "{0}" on {1} because the title is on '
                    'the exceptions list.'.format(
                        replacement.container.name,
                        page.title(as_link=True)))
                skipped_containers.add(replacement.container.name)
            else:
                pywikibot.output(
                    'Skipping unnamed replacement ({0}) on {1} because '
                    'the title is on the exceptions list.'.format(
                        replacement.description,
                        page.title(as_link=True)))
            continue
        old_text = new_text
        new_text = textlib.replaceExcept(
            new_text, replacement.old_regex, replacement.new,
            exceptions + replacement.get_inside_exceptions(),
            allowoverlap=self.allowoverlap, site=self.site)
        if old_text != new_text:
            applied.add(replacement)
    return new_text
def treat_page(self):
    """Process one page."""
    self.check_disabled()

    text = ''
    for section in split_into_sections(self.current_page.text):
        for identifier in ('ISBN', 'PMID', 'RFC'):
            if self.getOption(identifier):
                section = replaceExcept(
                    section,
                    _regexes[identifier],
                    self.getOption(identifier),
                    self.replace_exceptions,
                    site=self.site)
        text += section

    self.put_current(text, summary=self.getOption('summary'))
def cleanUpSectionHeaders(self, text):
    """
    Add a space between the equal signs and the section title.

    Example: ==Section title== becomes == Section title ==

    NOTE: This space is recommended in the syntax help on the English
    and German Wikipedia. It is not wanted on Lojban and English
    Wiktionary (T168399, T169064) and it might be that it is not wanted
    on other wikis. If there are any complaints, please file a bug
    report.
    """
    if self.site.sitename in ['wiktionary:jbo', 'wiktionary:en']:
        return text
    return textlib.replaceExcept(
        text,
        r'(?m)^(={1,6})[ \t]*(?P<title>.*[^\s=])[ \t]*\1[ \t]*\r?\n',
        r'\1 \g<title> \1\n',
        ['comment', 'math', 'nowiki', 'pre'])
def usersToCheck():
    optInPage = pywikibot.Page(pywikibot.Site(), wpOptInList)
    optInRawText = optInPage.get()
    p = re.compile(wpOptInListRegEx, re.UNICODE)
    userIterator = p.finditer(optInRawText)
    result = []
    for user in userIterator:
        # "_" is the same as " " for Wikipedia URLs
        username = textlib.replaceExcept(user.group('username'),
                                         u'_', u' ', [])
        if len(username) == 1:
            username = username[0].capitalize()
        elif len(username) > 1:
            username = username[0].capitalize() + username[1:]
        result.append(username)
    return result
def putSpacesInLists(self, text):
    """
    Add a space between the * or # and the text.

    NOTE: This space is recommended in the syntax help on the English,
    German, and French Wikipedia. It might be that it is not wanted on
    other wikis. If there are any complaints, please file a bug report.
    """
    if not self.template:
        exceptions = ['comment', 'math', 'nowiki', 'pre', 'source',
                      'template', 'timeline', self.site.redirectRegex()]
        text = textlib.replaceExcept(
            text,
            r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
            r'\g<bullet> \g<char>',
            exceptions)
    return text
def doReplacements(self, original_text):
    """Return the text generated by applying all replacements to it."""
    new_text = original_text
    exceptions = []
    if 'inside-tags' in self.exceptions:
        exceptions += self.exceptions['inside-tags']
    if 'inside' in self.exceptions:
        exceptions += self.exceptions['inside']
    for old, new in self.replacements:
        if self.sleep is not None:
            time.sleep(self.sleep)
        new_text = textlib.replaceExcept(new_text, old, new, exceptions,
                                         allowoverlap=self.allowoverlap,
                                         site=self.site)
    return new_text
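# Minimal usage sketch of textlib.replaceExcept itself (assumes a pywikibot
# checkout on the path; the 'nowiki' exception needs no site object):
# replacements apply everywhere except inside the excepted regions.
from pywikibot import textlib

text = 'foo <nowiki>foo</nowiki>'
print(textlib.replaceExcept(text, 'foo', 'bar', ['nowiki']))
# -> bar <nowiki>foo</nowiki>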
def replaceDeprecatedTemplates(self, text):
    exceptions = ['comment', 'math', 'nowiki', 'pre']
    if self.site.family.name in deprecatedTemplates \
            and self.site.code in \
            deprecatedTemplates[self.site.family.name]:
        for template in deprecatedTemplates[
                self.site.family.name][self.site.code]:
            old = template[0]
            new = template[1]
            if new is None:
                new = ''
            else:
                new = '{{%s}}' % new
            if self.site.namespaces[10].case == 'first-letter':
                old = '[' + old[0].upper() + old[0].lower() + ']' \
                      + old[1:]
            text = textlib.replaceExcept(
                text,
                r'\{\{([mM][sS][gG]:)?%s(?P<parameters>\|[^}]+|)}}' % old,
                new,
                exceptions)
    return text
def translateMagicWords(self, text):
    """Use localized magic words."""
    # not wanted at ru
    # arz uses English-style codes
    if self.site.code not in ['arz', 'ru']:
        exceptions = ['nowiki', 'comment', 'math', 'pre']
        for magicWord in ['img_thumbnail', 'img_left', 'img_center',
                          'img_right', 'img_none', 'img_framed',
                          'img_frameless', 'img_border', 'img_upright']:
            aliases = self.site.getmagicwords(magicWord)
            if not aliases:
                continue
            text = textlib.replaceExcept(
                text,
                r'\[\[(?P<left>.+?:.+?\..+?\|) *('
                + '|'.join(aliases)
                + r') *(?P<right>(\|.*?)?\]\])',
                r'[[\g<left>' + aliases[0] + r'\g<right>',
                exceptions)
    return text
def translateMagicWords(self, text: str) -> str:
    """Use localized magic words."""
    # not wanted at ru
    # arz uses English-style codes
    # no need to run on English wikis
    if self.site.code in ['arz', 'en', 'ru']:
        return text

    def init_cache() -> None:
        for magicword in ('img_thumbnail', 'img_left', 'img_center',
                          'img_right', 'img_none', 'img_framed',
                          'img_frameless', 'img_border', 'img_upright',
                          'img_baseline', 'img_sub', 'img_super',
                          'img_top', 'img_text_top', 'img_middle',
                          'img_bottom', 'img_text_bottom'):
            aliases = self.site.getmagicwords(magicword)
            if len(aliases) > 1:
                cache.update((alias, aliases[0]) for alias in aliases[1:]
                             if '$1' not in alias)
        if not cache:
            cache[False] = True  # signal there is nothing to replace

    def replace_magicword(match: Match[str]) -> str:
        if cache.get(False):
            return match.group()
        split = match.group().split('|')
        if len(split) == 1:
            return match.group()

        if not cache:
            init_cache()

        # push ']]' out and re-add below
        split[-1] = split[-1][:-2]
        return '{}|{}]]'.format(
            split[0], '|'.join(cache.get(x.strip(), x)
                               for x in split[1:]))

    cache = {}  # type: Dict[Union[bool, str], Any]
    exceptions = ['comment', 'nowiki', 'pre', 'syntaxhighlight']
    regex = re.compile(
        FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
        flags=re.X)
    return textlib.replaceExcept(
        text, regex, replace_magicword, exceptions)
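# Sketch of the alias normalisation done by the translateMagicWords
# variants, with the alias table hard-coded (hypothetical values; the real
# code reads them via site.getmagicwords()).
import re

aliases = ['thumbnail', 'thumb']  # preferred form first

def replace(match):
    split = match.group().split('|')
    split[-1] = split[-1][:-2]  # push ']]' out and re-add below
    split[1:] = [aliases[0] if x.strip() in aliases[1:] else x
                 for x in split[1:]]
    return '|'.join(split) + ']]'

text = '[[File:Foo.jpg|thumb|A caption]]'
print(re.sub(r'\[\[File:[^]|]+(\|[^]]*)?\]\]', replace, text))
# -> [[File:Foo.jpg|thumbnail|A caption]]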
def translateAndCapitalizeNamespaces(self, text):
    """Use localized namespace names."""
    # arz uses English-style codes
    if self.site.sitename() == 'wikipedia:arz':
        return text
    family = self.site.family
    # wiki links aren't parsed here.
    exceptions = ['nowiki', 'comment', 'math', 'pre']

    for nsNumber in self.site.namespaces():
        if nsNumber in (0, 2, 3):
            # skip main (article) namespace
            # skip user namespace, maybe gender is used
            continue
        # a clone is needed. Won't change the namespace dict
        namespaces = list(self.site.namespace(nsNumber, all=True))
        thisNs = namespaces.pop(0)
        if nsNumber == 6 and family.name == 'wikipedia':
            if self.site.code in ('en', 'fr') and \
                    MediaWikiVersion(self.site.version()) >= \
                    MediaWikiVersion('1.14'):
                # do not change "Image" on en-wiki and fr-wiki
                assert u'Image' in namespaces
                namespaces.remove(u'Image')
            if self.site.code == 'hu':
                # do not change "Kép" on hu-wiki
                assert u'Kép' in namespaces
                namespaces.remove(u'Kép')
            elif self.site.code == 'pt':
                # bug 55242 should be implemented
                continue
        # lowerspaced and underscored namespaces
        for i in range(len(namespaces)):
            item = namespaces[i].replace(' ', '[ _]')
            item = u'[%s%s]' % (item[0], item[0].lower()) + item[1:]
            namespaces[i] = item
        namespaces.append(thisNs[0].lower() + thisNs[1:])
        if thisNs and namespaces:
            text = textlib.replaceExcept(
                text,
                r'\[\[\s*(%s) *:(?P<nameAndLabel>.*?)\]\]'
                % '|'.join(namespaces),
                r'[[%s:\g<nameAndLabel>]]' % thisNs,
                exceptions)
    return text
def treat_page(self) -> None:
    """Process one page."""
    cats = []
    old_cat_link = None
    wikicode = mwparserfromhell.parse(self.current_page.text,
                                      skip_style_tags=True)
    for link in wikicode.ifilter_wikilinks():
        if link.title.strip().startswith(':'):
            continue
        try:
            link_page = pywikibot.Page(self.site, str(link.title))
            link_cat = pywikibot.Category(link_page)
        except (ValueError, pywikibot.Error):
            continue
        cats.append(link_cat)
        if link_cat == self.getOption('old_cat'):
            old_cat_link = link
    if not old_cat_link:
        pywikibot.log('Did not find {} in {}.'.format(
            self.getOption('old_cat'), self.current_page))
        return

    new_cats = self.getOption('new_cats')
    if len(new_cats) == 1 and new_cats[0] not in cats:
        # Update the title to keep the sort key.
        old_cat_link.title = new_cats[0].title()
        text = str(wikicode)
    else:
        for cat in new_cats:
            if cat not in cats:
                wikicode.insert_after(old_cat_link, '\n' + cat.aslink())
        old_cat_regex = re.compile(r'\n?' + re.escape(str(old_cat_link)),
                                   re.M)
        text = replaceExcept(str(wikicode), old_cat_regex, '',
                             EXCEPTIONS, site=self.site)

    self.put_current(
        text,
        summary=self.getOption('summary'),
        asynchronous=False,
        nocreate=True)
def replaceDeprecatedTemplates(self, text):
    """Replace deprecated templates."""
    exceptions = ['comment', 'math', 'nowiki', 'pre']
    builder = _MultiTemplateMatchBuilder(self.site)

    if self.site.family.name in deprecatedTemplates \
            and self.site.code in \
            deprecatedTemplates[self.site.family.name]:
        for template in deprecatedTemplates[
                self.site.family.name][self.site.code]:
            old, new = template
            if new is None:
                new = ''
            else:
                new = '{{%s}}' % new

            text = textlib.replaceExcept(text, builder.pattern(old), new,
                                         exceptions)
    return text
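# Sketch of the deprecated-template rewrite with a hypothetical mapping
# ('Stub' -> 'Stub-new'); like the older variant above, parameters of the
# old template are dropped by the replacement.
import re

old, new = 'Stub', 'Stub-new'
old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:]
text = '{{msg:stub|foo}} and {{Stub}}'
print(re.sub(r'\{\{([mM][sS][gG]:)?%s(?P<parameters>\|[^}]+|)}}' % old,
             '{{%s}}' % new, text))
# -> {{Stub-new}} and {{Stub-new}}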
def translateMagicWords(self, text):
    """Use localized magic words."""
    # not wanted at ru
    # arz uses English-style codes
    # no need to run on English wikis
    if self.site.code not in ['arz', 'en', 'ru']:
        def replace_magicword(match):
            split = match.group().split('|')
            # push ']]' out and re-add below
            split[-1] = split[-1][:-2]
            for magicword in ['img_thumbnail', 'img_left', 'img_center',
                              'img_right', 'img_none', 'img_framed',
                              'img_frameless', 'img_border',
                              'img_upright']:
                aliases = list(self.site.getmagicwords(magicword))
                preferred = aliases.pop(0)
                if not aliases:
                    continue
                split[1:] = list(map(
                    lambda x: preferred if x.strip() in aliases else x,
                    split[1:]))
            return '|'.join(split) + ']]'

        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source']
        regex = re.compile(
            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
            flags=re.X)
        text = textlib.replaceExcept(text, regex, replace_magicword,
                                     exceptions)
    return text
def __iter__(self):
    try:
        for entry in self.parser:
            if self.skipping:
                if entry.title != self.xmlStart:
                    continue
                self.skipping = False
            if not self.isTitleExcepted(entry.title) \
                    and not self.isTextExcepted(entry.text):
                new_text = entry.text
                for old, new in self.replacements:
                    new_text = textlib.replaceExcept(
                        new_text, old, new, self.excsInside,
                        site=self.site)
                if new_text != entry.text:
                    yield pywikibot.Page(self.site, entry.title)
    except KeyboardInterrupt:
        try:
            if not self.skipping:
                pywikibot.output(
                    u'To resume, use "-xmlstart:%s" on the command line.'
                    % entry.title)
        except NameError:
            pass
def fixArabicLetters(self, text: str) -> str:
    """Fix Arabic and Persian letters."""
    if self.site.code not in ['ckb', 'fa']:
        return text

    exceptions = [
        'file',
        'gallery',
        'hyperlink',
        'interwiki',
        'inputbox',
        # FIXME: but changes letters inside wikilinks
        # 'link',
        'math',
        'pre',
        'template',
        'timeline',
        'ref',
        'startspace',
        'syntaxhighlight',
    ]  # type: List[Union[str, Pattern[str]]]

    digits = textlib.NON_LATIN_DIGITS
    faChrs = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + digits['fa']

    # not to let bot edits in latin content
    exceptions.append(re.compile('[^{fa}] *?"*? *?, *?[^{fa}]'
                                 .format(fa=faChrs)))
    text = textlib.replaceExcept(text, ',', '،', exceptions,
                                 site=self.site)
    if self.site.code == 'ckb':
        text = textlib.replaceExcept(text,
                                     '\u0647([.\u060c_<\\]\\s])',
                                     '\u06d5\\1', exceptions,
                                     site=self.site)
        text = textlib.replaceExcept(text, 'ه\u200c', 'ە', exceptions,
                                     site=self.site)
        text = textlib.replaceExcept(text, 'ه', 'ھ', exceptions,
                                     site=self.site)
    text = textlib.replaceExcept(text, 'ك', 'ک', exceptions,
                                 site=self.site)
    text = textlib.replaceExcept(text, '[ىي]', 'ی', exceptions,
                                 site=self.site)
    return text
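# Sketch of the letter normalisation chain applied to a bare string
# (the real method goes through replaceExcept so excepted regions survive):
# Arabic kaf and yeh become their Persian counterparts.
import re

text = 'كتيب'
for old, new in [('ك', 'ک'), ('[ىي]', 'ی')]:
    text = re.sub(old, new, text)
print(text)  # -> کتیب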
def removeEmptySections(self, text):
    """Cleanup empty sections."""
    exceptions = ['comment', 'pre', 'source', 'nowiki', 'code',
                  'startspace']
    skippings = ['comment']
    skip_regexes = _get_regexes(skippings, self.site)
    skip_templates = {
        'cs': ('Pahýl[ _]část',),  # stub section
    }
    if self.site.code in skip_templates:
        for template in skip_templates[self.site.code]:
            skip_regexes.append(
                re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
    stripped_text = str(text)
    for reg in skip_regexes:
        stripped_text = reg.sub(r'', stripped_text)
    stripped_pattern = re.compile(
        r'\n((=+) *[^\n=]+? *\2) *\n\s*(?=(\2 *[^\n=]+? *\2))')
    pos = 0
    while True:
        match = stripped_pattern.search(stripped_text[pos:])
        if not match:
            break
        pattern = re.compile(
            r'\n{}.+?(?={})'.format(match.group(1), match.group(3)),
            re.DOTALL)
        text = textlib.replaceExcept(text, pattern, r'\n',
                                     exceptions=exceptions)
        pos = match.end()
    return text
def fixArabicLetters(self, text):
    """Fix arabic and persian letters."""
    if self.site.code not in ['ckb', 'fa']:
        return text
    exceptions = [
        'gallery',
        'file',
        'hyperlink',
        'interwiki',
        # FIXME: but changes letters inside wikilinks
        # 'link',
        'math',
        'pre',
        'template',
        'timeline',
        'ref',
        'source',
        'startspace',
        'inputbox',
    ]
    # FIXME: use textlib.NON_LATIN_DIGITS
    # valid digits
    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',
        'fa': u'۰۱۲۳۴۵۶۷۸۹',
    }
    faChrs = u'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + digits['fa']
    new = digits.pop(self.site.code)
    # This only works if there are only two items in digits dict
    old = digits[list(digits.keys())[0]]
    # not to let bot edits in latin content
    exceptions.append(
        re.compile(u'[^%(fa)s] *?"*? *?, *?[^%(fa)s]' % {'fa': faChrs}))
    text = textlib.replaceExcept(text, ',', '،', exceptions,
                                 site=self.site)
    if self.site.code == 'ckb':
        text = textlib.replaceExcept(text,
                                     '\u0647([.\u060c_<\\]\\s])',
                                     '\u06d5\\1', exceptions,
                                     site=self.site)
        text = textlib.replaceExcept(text, 'ه\u200c', 'ە', exceptions,
                                     site=self.site)
        text = textlib.replaceExcept(text, 'ه', 'ھ', exceptions,
                                     site=self.site)
    text = textlib.replaceExcept(text, 'ك', 'ک', exceptions,
                                 site=self.site)
    text = textlib.replaceExcept(text, '[ىي]', 'ی', exceptions,
                                 site=self.site)
    return text

    # FIXME: split this function into two.
    # replace persian/arabic digits
    # deactivated due to bug 55185
    for i in range(0, 10):
        text = textlib.replaceExcept(text, old[i], new[i], exceptions)
    # do not change digits in class, style and table params
    pattern = re.compile(r'\w+=(".+?"|\d+)', re.UNICODE)
    exceptions.append(pattern)
    # do not change digits inside html-tags
    pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
    exceptions.append(pattern)
    exceptions.append('table')  # exclude tables for now
    # replace digits
    for i in range(0, 10):
        text = textlib.replaceExcept(text, str(i), new[i], exceptions)
    return text
def fixSyntaxSave(self, text):
    """Convert weblinks to wikilink, fix link syntax."""
    def replace_link(match):
        """Create a string to replace a single link."""
        replacement = '[[' + match.group('link')
        if match.group('title'):
            replacement += '|' + match.group('title')
        return replacement + ']]'

    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # link to the wiki working on
    # Only use suffixes for article paths
    for suffix in self.site._interwiki_urls(True):
        http_url = self.site.base_url(suffix, 'http')
        if self.site.protocol() == 'http':
            https_url = None
        else:
            https_url = self.site.base_url(suffix, 'https')
        # compare strings without the protocol, if they are empty support
        # also no prefix (//en.wikipedia.org/…)
        if https_url is not None and http_url[4:] == https_url[5:]:
            urls = ['(?:https?:)?' + re.escape(http_url[5:])]
        else:
            urls = [re.escape(url) for url in (http_url, https_url)
                    if url is not None]
        for url in urls:
            # Only include links which don't include the separator as
            # the wikilink won't support additional parameters
            separator = '?'
            if '?' in suffix:
                separator += '&'
            # Match first a non space in the title to prevent that multiple
            # spaces at the end without title will be matched by it
            text = textlib.replaceExcept(
                text,
                r'\[\[?' + url + r'(?P<link>[^' + separator + r']+?)'
                r'(\s+(?P<title>[^\s].*?))?\s*\]\]?',
                replace_link, exceptions, site=self.site)
    # external link in/starting with double brackets
    text = textlib.replaceExcept(
        text,
        r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
        r'[\g<url>]', exceptions, site=self.site)
    # external link and description separated by a pipe, with
    # whitespace in front of the pipe, so that it is clear that
    # the pipe is not a legitimate part of the URL.
    text = textlib.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    # pipe in external link, where the correct end of the URL can
    # be detected from the file extension. It is very unlikely that
    # this will cause mistakes.
    extensions = [r'\.{0}'.format(ext)
                  for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']]
    text = textlib.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *'
        r'\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    return text
def cleanUpLinks(self, text):
    """Tidy up wikilinks found in a string.

    This function will:
    * Replace underscores with spaces
    * Move leading and trailing spaces out of the wikilink and into the
      surrounding text
    * Convert URL-encoded characters into Unicode-encoded characters
    * Move trailing characters out of the link and make the link without
      using a pipe, if possible
    * Capitalize the article title of the link, if appropriate

    @param text: string to perform the clean-up on
    @type text: str
    @return: text with tidied wikilinks
    @rtype: str
    """
    # helper function which works on one link and either returns it
    # unmodified, or returns a replacement.
    def handleOneLink(match):
        titleWithSection = match.group('titleWithSection')
        label = match.group('label')
        trailingChars = match.group('linktrail')
        newline = match.group('newline')

        try:
            is_interwiki = self.site.isInterwikiLink(titleWithSection)
        except ValueError:  # T111513
            is_interwiki = True

        if not is_interwiki:
            # The link looks like this:
            # [[page_title|link_text]]trailing_chars
            # We only work on namespace 0 because pipes and linktrails
            # work differently for images and categories.
            page = pywikibot.Page(pywikibot.Link(titleWithSection,
                                                 self.site))
            try:
                namespace = page.namespace()
            except pywikibot.InvalidTitle:
                return match.group()
            if namespace == 0:
                # Replace underlines by spaces, also multiple underlines
                titleWithSection = re.sub('_+', ' ', titleWithSection)
                # Remove double spaces
                titleWithSection = re.sub(' +', ' ', titleWithSection)
                # Remove unnecessary leading spaces from title,
                # but remember if we did this because we eventually want
                # to re-add it outside of the link later.
                titleLength = len(titleWithSection)
                titleWithSection = titleWithSection.lstrip()
                hadLeadingSpaces = len(titleWithSection) != titleLength
                hadTrailingSpaces = False
                # Remove unnecessary trailing spaces from title,
                # but remember if we did this because it may affect
                # the linktrail and because we eventually want to
                # re-add it outside of the link later.
                if not trailingChars:
                    titleLength = len(titleWithSection)
                    titleWithSection = titleWithSection.rstrip()
                    hadTrailingSpaces = (len(titleWithSection)
                                         != titleLength)

                # Convert URL-encoded characters to unicode
                from pywikibot.page import url2unicode
                titleWithSection = url2unicode(titleWithSection,
                                               encodings=self.site)

                if titleWithSection == '':
                    # just skip empty links.
                    return match.group()

                # Remove unnecessary initial and final spaces from label.
                # Please note that some editors prefer spaces around pipes.
                # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
                if label is not None:
                    # Remove unnecessary leading spaces from label,
                    # but remember if we did this because we want
                    # to re-add it outside of the link later.
                    labelLength = len(label)
                    label = label.lstrip()
                    hadLeadingSpaces = len(label) != labelLength
                    # Remove unnecessary trailing spaces from label,
                    # but remember if we did this because it affects
                    # the linktrail.
                    if not trailingChars:
                        labelLength = len(label)
                        label = label.rstrip()
                        hadTrailingSpaces = len(label) != labelLength
                else:
                    label = titleWithSection
                if trailingChars:
                    label += trailingChars

                if self.site.siteinfo['case'] == 'first-letter':
                    firstcase_title = first_lower(titleWithSection)
                    firstcase_label = first_lower(label)
                else:
                    firstcase_title = titleWithSection
                    firstcase_label = label

                if firstcase_label == firstcase_title:
                    newLink = '[[%s]]' % label
                # Check if we can create a link with trailing characters
                # instead of a pipelink
                elif (firstcase_label.startswith(firstcase_title)
                        and trailR.sub(
                            '', label[len(titleWithSection):]) == ''):
                    newLink = '[[%s]]%s' % (
                        label[:len(titleWithSection)],
                        label[len(titleWithSection):])
                else:
                    # Try to capitalize the first letter of the title.
                    # Not useful for languages that don't capitalize
                    # nouns.
                    # TODO: Add a configuration variable for each site,
                    # which determines if the link target is written in
                    # uppercase
                    if self.site.sitename == 'wikipedia:de':
                        titleWithSection = first_upper(titleWithSection)
                    newLink = '[[%s|%s]]' % (titleWithSection, label)
                # re-add spaces that were pulled out of the link.
                # Examples:
                #   text[[ title ]]text        -> text [[title]] text
                #   text[[ title | name ]]text -> text [[title|name]] text
                #   text[[ title |name]]text   -> text[[title|name]]text
                #   text[[title| name]]text    -> text [[title|name]]text
                if hadLeadingSpaces and not newline:
                    newLink = ' ' + newLink
                if hadTrailingSpaces:
                    newLink += ' '
                if newline:
                    newLink = newline + newLink
                return newLink
        # don't change anything
        return match.group()

    trailR = re.compile(self.site.linktrail())
    # The regular expression which finds links. Results consist of four
    # groups:
    # group <newline> depends whether the link starts with a new line.
    # group <titleWithSection> is the page title and section, that is,
    # everything before | or ]. It'll include the # to make life easier
    # for us.
    # group <label> is the alternative link title between | and ].
    # group <linktrail> is the link trail after ]] which is part of the
    # word.
    # note that the definition of 'letter' varies from language to
    # language.
    linkR = re.compile(
        r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
        r'(\|(?P<label>[^\]\|]*))?\]\]'
        r'(?P<linktrail>' + self.site.linktrail() + ')')
    text = textlib.replaceExcept(
        text, linkR, handleOneLink,
        ['comment', 'math', 'nowiki', 'pre', 'startspace'])
    return text
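# Sketch of the "trailing characters instead of a pipe" branch above,
# with a simplified English linktrail standing in for site.linktrail().
import re

title, label = 'Dog', 'Dogs'
trailR = re.compile('[a-z]*')
if label.startswith(title) and trailR.sub('', label[len(title):]) == '':
    print('[[{}]]{}'.format(label[:len(title)], label[len(title):]))
# -> [[Dog]]s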
def transferImage(self, sourceImagePage):
    """
    Download image and its description, and upload it to another site.

    @return: the filename which was used to upload the image
    """
    sourceSite = sourceImagePage.site
    url = sourceImagePage.fileUrl().encode('utf-8')
    pywikibot.output(u'URL should be: %s' % url)
    # localize the text that should be printed on the image description
    # page
    try:
        description = sourceImagePage.get()
        # try to translate license templates
        if (sourceSite.sitename,
                self.targetSite.sitename) in licenseTemplates:
            for old, new in licenseTemplates[
                    (sourceSite.sitename,
                     self.targetSite.sitename)].items():
                new = '{{%s}}' % new
                old = re.compile('{{%s}}' % old)
                description = textlib.replaceExcept(
                    description, old, new,
                    ['comment', 'math', 'nowiki', 'pre'])

        description = i18n.twtranslate(self.targetSite,
                                       'imagetransfer-file_page_message',
                                       {'site': sourceSite,
                                        'description': description})
        description += '\n\n'
        description += sourceImagePage.getFileVersionHistoryTable()
        # add interwiki link
        if sourceSite.family == self.targetSite.family:
            description += u'\r\n\r\n{0}'.format(sourceImagePage)
    except pywikibot.NoPage:
        description = ''
        pywikibot.output(
            'Image does not exist or description page is empty.')
    except pywikibot.IsRedirectPage:
        description = ''
        pywikibot.output('Image description page is redirect.')
    else:
        bot = UploadRobot(url=url, description=description,
                          targetSite=self.targetSite,
                          urlEncoding=sourceSite.encoding(),
                          keepFilename=self.keep_name,
                          verifyDescription=not self.keep_name,
                          ignoreWarning=self.ignore_warning)
        # try to upload
        targetFilename = bot.run()
        if targetFilename and self.targetSite.family.name == 'commons' \
                and self.targetSite.code == 'commons':
            # upload to Commons was successful
            reason = i18n.twtranslate(sourceSite,
                                      'imagetransfer-nowcommons_notice')
            # try to delete the original image if we have a sysop account
            if sourceSite.family.name in config.sysopnames \
                    and sourceSite.lang in \
                    config.sysopnames[sourceSite.family.name]:
                if sourceImagePage.delete(reason):
                    return
            if sourceSite.lang in nowCommonsTemplate \
                    and sourceSite.family.name in config.usernames \
                    and sourceSite.lang in \
                    config.usernames[sourceSite.family.name]:
                # add the nowCommons template.
                pywikibot.output(u'Adding nowCommons template to %s'
                                 % sourceImagePage.title())
                sourceImagePage.put(
                    sourceImagePage.get() + '\n\n'
                    + nowCommonsTemplate[sourceSite.lang]
                    % targetFilename,
                    summary=reason)
def check_titles(site, report_page_name, replacements):
    """
    To avoid breaking links, add page titles that will be changed to the
    exception list.

    :param site: site where the bot will run
    :param report_page_name: name of a page on which to report the
        titles added to the exception list
    :param replacements: dictionary of replacements
    """
    from pywikibot import textlib
    from pywikibot.tools import itergroup
    all_pages = site.allpages(namespace=0, filterredir=False,
                              content=False)
    evaluation_progress = 0
    exceptions_dict = {}
    for titles_group in itergroup(all_pages, all_pages.query_limit):
        titles_group_t = [p.title(as_link=True, with_section=False)
                          for p in titles_group]
        old_titles = titles_group_t
        evaluation_progress += len(titles_group_t)
        if evaluation_progress % 20000 == 0:
            print('\r%i page titles processed' % evaluation_progress)
        old_text = ' \n '.join(titles_group_t)
        for replacement_key, replacement in replacements.items():
            replacement_exceptions = replacement.exceptions or {}
            replacement_exceptions_inside = replacement_exceptions.get(
                'inside', [])
            new_text = textlib.replaceExcept(
                old_text, replacement.old_regex, replacement.new,
                replacement_exceptions_inside, site=site)
            # replacement changes a valid title
            changed_titles = (
                (old_title, new_title)
                for old_title, new_title in zip(old_titles,
                                                new_text.split(' \n '))
                if old_title != new_title and old_title
                != '[[%s' % pywikibot.tools.first_upper(new_title[2:]))
            # breaks link
            # no special treatment for links
            changed_titles = (
                (old_title, new_title)
                for old_title, new_title in changed_titles
                if replacement.old_regex.sub(
                    replacement.new, ' %s ' % old_title[2:-2])
                != ' %s ' % old_title[2:-2])
            # valid title is not a disambiguation page
            changed_titles = [
                old_title[2:-2]
                for old_title, new_title in changed_titles
                if not pywikibot.Page(site, old_title[2:-2]).isDisambig()]
            if len(changed_titles) > 0:
                # changed_titles_exceptions = [
                #     re.compile(re.escape(title), re.U)
                #     for title in changed_titles]
                changed_titles_exceptions = [
                    re.compile(r'\[\[%s\|.+?\]\]|%s'
                               % (re.escape(title), re.escape(title)),
                               re.U)
                    for title in changed_titles]
                replacement_exceptions['inside'] = (
                    replacement_exceptions_inside
                    + changed_titles_exceptions)
                replacement.exceptions = replacement_exceptions
                if replacement_key not in exceptions_dict:
                    exceptions_dict[replacement_key] = []
                exceptions_dict[replacement_key] += changed_titles

    exceptions_dict = OrderedDict(
        sorted((int(k), v) for k, v in exceptions_dict.items()))
    report_page = pywikibot.Page(site, report_page_name)
    exception_report = ''
    for replace_key, replaced_titles in exceptions_dict.items():
        exception_report += '\n* %i\n%s' % (
            replace_key,
            '\n'.join('** [[%s]]' % t for t in replaced_titles))
    report_page.put(exception_report, summary='עדכון')  # 'update'
def commonsfiledesc(self, text):
    """
    Clean up file descriptions on the Wikimedia Commons.

    It is working according to [1] and works only on pages in the file
    namespace on the Wikimedia Commons.

    [1]: https://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup
    """
    if self.site.sitename != 'commons:commons' or self.namespace != 6:
        return text
    # section headers to {{int:}} versions
    exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
                  'pre', 'source', 'ref', 'timeline']
    text = textlib.replaceExcept(text,
                                 r"([\r\n]|^)\=\= *Summary *\=\=",
                                 r"\1== {{int:filedesc}} ==",
                                 exceptions, True)
    text = textlib.replaceExcept(
        text,
        r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
        r"\1== {{int:license-header}} ==", exceptions, True)
    text = textlib.replaceExcept(
        text,
        r'([\r\n])'
        r'\=\= *(Licensing|License information|{{int:license}}) *\=\=',
        r"\1== {{int:license-header}} ==", exceptions, True)

    # frequent field values to {{int:}} versions
    text = textlib.replaceExcept(
        text,
        r'([\r\n]\|[Ss]ource *\= *)'
        r'(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *'
        r'([\r\n])',
        r'\1{{own}}\2', exceptions, True)
    text = textlib.replaceExcept(
        text,
        r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
        r'\1\2', exceptions, True)

    # added to transwikied pages
    text = textlib.replaceExcept(text, r'__NOTOC__', '', exceptions, True)

    # tracker element for js upload form
    text = textlib.replaceExcept(
        text,
        r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
        '', exceptions[1:], True)
    text = textlib.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
                                 '', exceptions, True)

    # duplicated section headers
    text = textlib.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *'
        r'{{int:filedesc}} *\=\=',
        r'\1== {{int:filedesc}} ==', exceptions, True)
    text = textlib.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:license-header}} *\=\=(?:[\r\n ]*)'
        r'\=\= *{{int:license-header}} *\=\=',
        r'\1== {{int:license-header}} ==', exceptions, True)
    return text
def transfer_image(self, sourceImagePage):
    """
    Download image and its description, and upload it to another site.

    @return: the filename which was used to upload the image
    """
    sourceSite = sourceImagePage.site
    pywikibot.output(
        '\n>>> Transfer {source} from {source.site} to {target}\n'
        .format(source=sourceImagePage, target=self.opt.target))
    url = sourceImagePage.get_file_url()
    pywikibot.output('URL should be: ' + url)
    # localize the text that should be printed on image description page
    try:
        description = sourceImagePage.get()
        # try to translate license templates
        if (sourceSite.sitename,
                self.opt.target.sitename) in licenseTemplates:
            for old, new in licenseTemplates[
                    (sourceSite.sitename,
                     self.opt.target.sitename)].items():
                new = '{{%s}}' % new
                old = re.compile('{{%s}}' % old)
                description = textlib.replaceExcept(
                    description, old, new,
                    ['comment', 'math', 'nowiki', 'pre'])

        description = i18n.twtranslate(self.opt.target,
                                       'imagetransfer-file_page_message',
                                       {'site': sourceSite,
                                        'description': description})
        description += '\n\n'
        description += sourceImagePage.getFileVersionHistoryTable()
        # add interwiki link
        if sourceSite.family == self.opt.target.family:
            description += '\n\n{}'.format(sourceImagePage)
    except NoPageError:
        pywikibot.output(
            'Image does not exist or description page is empty.')
    except IsRedirectPageError:
        pywikibot.output('Image description page is redirect.')
    else:
        bot = UploadRobot(url=url, description=description,
                          target_site=self.opt.target,
                          url_encoding=sourceSite.encoding(),
                          keep_filename=self.opt.keepname,
                          verify_description=not self.opt.keepname,
                          ignore_warning=self.opt.ignore_warning)
        # try to upload
        if bot.skip_run():
            return
        target_filename = bot.upload_file(url)

        if target_filename \
                and self.opt.target.sitename == 'commons:commons':
            # upload to Commons was successful
            reason = i18n.twtranslate(sourceSite,
                                      'imagetransfer-nowcommons_notice')
            # try to delete the original image if we have a sysop account
            if sourceSite.has_right('delete'):
                if sourceImagePage.delete(reason):
                    return
            if sourceSite.lang in nowCommonsTemplate \
                    and sourceSite.family.name in config.usernames \
                    and sourceSite.lang in \
                    config.usernames[sourceSite.family.name]:
                # add the nowCommons template.
                pywikibot.output('Adding nowCommons template to '
                                 + sourceImagePage.title())
                sourceImagePage.put(
                    sourceImagePage.get() + '\n\n'
                    + nowCommonsTemplate[sourceSite.lang]
                    % target_filename,
                    summary=reason)
def process(self, text):
    """Process the page."""
    # keys are ref groups
    # values are a dict where :
    #   keys are ref content
    #   values are [name, [list of full ref matches],
    #               quoted, need_to_change]
    found_refs = {}
    found_ref_names = {}
    # Replace key by [value, quoted]
    named_repl = {}

    for match in self.REFS.finditer(text):
        content = match.group('content')
        if not content.strip():
            continue

        params = match.group('params')
        group = self.GROUPS.match(params)
        if group not in found_refs:
            found_refs[group] = {}

        groupdict = found_refs[group]
        if content in groupdict:
            v = groupdict[content]
            v[1].append(match.group())
        else:
            v = [None, [match.group()], False, False]

        found = self.NAMES.match(params)
        if found:
            quoted = found.group('quote') in ['"', "'"]
            name = found.group('name')

            if v[0]:
                if v[0] != name:
                    named_repl[name] = [v[0], v[2]]
            else:
                # First name associated with this content
                if name == 'population':
                    pywikibot.output(content)
                if name not in found_ref_names:
                    # first time ever we meet this name
                    if name == 'population':
                        pywikibot.output('in')
                    v[2] = quoted
                    v[0] = name
                else:
                    # if has_key, means that this name is used
                    # with another content. We'll need to change it
                    v[3] = True
            found_ref_names[name] = 1
        groupdict[content] = v

    used_numbers = set()
    for name in found_ref_names:
        number = removeprefix(name, self.autogen)
        with suppress(ValueError):
            used_numbers.add(int(number))

    # iterator to give the next free number
    free_number = iter({str(i) for i in range(1, 1000)  # should be enough
                        if i not in used_numbers})

    for g, d in found_refs.items():
        group = ''
        if g:
            group = 'group="{}" '.format(g)

        for k, v in d.items():
            if len(v[1]) == 1 and not v[3]:
                continue

            name = v[0]
            if not name:
                name = '"{}{}"'.format(self.autogen, next(free_number))
            elif v[2]:
                name = '"{}"'.format(name)

            named = '<ref {}name={}>{}</ref>'.format(group, name, k)
            text = text.replace(v[1][0], named, 1)

            # make sure that the first (named ref) is not
            # removed later :
            pos = text.index(named) + len(named)
            header = text[:pos]
            end = text[pos:]

            unnamed = '<ref {}name={} />'.format(group, name)
            for ref in v[1][1:]:
                # Don't replace inside templates (T266411)
                end = replaceExcept(end, re.escape(ref), unnamed,
                                    exceptions=['template'])
            text = header + end

    for k, v in named_repl.items():
        # TODO : Support ref groups
        name = v[0]
        if v[1]:
            name = '"{}"'.format(name)

        text = re.sub(
            r'<ref name\s*=\s*(?P<quote>["\']?)\s*{}\s*(?P=quote)\s*/>'
            .format(k),
            '<ref name={} />'.format(name),
            text)
    return text