def fixSyntaxSave(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # external link in double brackets
    text = pywikibot.replaceExcept(text,
                                   r'\[\[(?P<url>https?://[^\]]+?)\]\]',
                                   r'[\g<url>]', exceptions)
    # external link starting with double bracket
    text = pywikibot.replaceExcept(text,
                                   r'\[\[(?P<url>https?://.+?)\]',
                                   r'[\g<url>]', exceptions)
    # external link and description separated by a dash, with
    # whitespace in front of the dash, so that it is clear that
    # the dash is not a legitimate part of the URL.
    text = pywikibot.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    # dash in external link, where the correct end of the URL can
    # be detected from the file extension. It is very unlikely that
    # this will cause mistakes.
    text = pywikibot.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    return text
def fixSyntaxSave(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # link to the wiki working on
    ## TODO: disable this for difflinks and titled links
    ## http://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&diff=103109563&oldid=103109271
    ## text = pywikibot.replaceExcept(text,
    ##     r'\[https?://%s\.%s\.org/wiki/(?P<link>\S+)\s+(?P<title>.+?)\s?\]'
    ##     % (self.site.lang, self.site.family.name),
    ##     r'[[\g<link>|\g<title>]]', exceptions)
    # external link in double brackets
    text = pywikibot.replaceExcept(
        text,
        r'\[\[(?P<url>https?://[^\]]+?)\]\]',
        r'[\g<url>]', exceptions)
    # external link starting with double bracket
    text = pywikibot.replaceExcept(text,
                                   r'\[\[(?P<url>https?://.+?)\]',
                                   r'[\g<url>]', exceptions)
    # external link and description separated by a dash, with
    # whitespace in front of the dash, so that it is clear that
    # the dash is not a legitimate part of the URL.
    text = pywikibot.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    # dash in external link, where the correct end of the URL can
    # be detected from the file extension. It is very unlikely that
    # this will cause mistakes.
    text = pywikibot.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    return text
def removeUselessSpaces(self, text):
    multipleSpacesR = re.compile(' +')
    spaceAtLineEndR = re.compile(' $')
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace',
                  'table', 'template']
    text = pywikibot.replaceExcept(text, multipleSpacesR, ' ', exceptions)
    text = pywikibot.replaceExcept(text, spaceAtLineEndR, '', exceptions)
    return text
def fixArabicLetters(self, text):
    if self.site.lang == 'ckb' or self.site.lang == 'fa':
        exceptions = [
            'gallery',
            'hyperlink',
            'interwiki',
            # but changes letters inside wikilinks
            #'link',
            'math',
            'pre',
            'template',
            'timeline',
            'ref',
            'source',
            'startspace',
            'inputbox',
        ]
        # do not change inside file links
        namespaces = list(self.site.namespace(6, all=True))
        pattern = re.compile(u'\[\[(' + '|'.join(namespaces) +
                             '):.+?\..+?\]\]', re.UNICODE)
        exceptions.append(pattern)
        text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
        if self.site.lang == 'ckb':
            text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
                                           ur'ە\1', exceptions)
            # heh followed by ZWNJ; the joiner is invisible, so it is
            # written as an escape here
            text = pywikibot.replaceExcept(text, u'ه\u200c', u'ە',
                                           exceptions)
            text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
            text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
            text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
        # replace persian digits
        for i in range(0, 10):
            if self.site.lang == 'ckb':
                text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i],
                                               u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
            else:
                text = pywikibot.replaceExcept(text, u'٠١٢٣٤٥٦٧٨٩'[i],
                                               u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
        # do not change digits in class, style and table params
        pattern = re.compile(u'=".*?"', re.UNICODE)
        exceptions.append(pattern)
        # do not change digits inside html-tags
        pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
        exceptions.append(pattern)
        exceptions.append('table')  # exclude tables for now
        for i in range(0, 10):
            if self.site.lang == 'ckb':
                text = pywikibot.replaceExcept(text, str(i),
                                               u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
            else:
                text = pywikibot.replaceExcept(text, str(i),
                                               u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
    return text
def commonsfiledesc(self, text):
    # section headers to {{int:}} versions
    exceptions = ["comment", "includeonly", "math", "noinclude", "nowiki",
                  "pre", "source", "ref", "timeline"]
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n]|^)\=\= *Summary *\=\=",
        r"\1== {{int:filedesc}} ==",
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
        r"\1== {{int:license}} ==",
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n])\=\= *(Licensing|License information|{{int:license-header}}) *\=\=",
        r"\1== {{int:license}} ==",
        exceptions, True)
    # frequent field values to {{int:}} versions
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n]\|[Ss]ource *\= *)(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *([\r\n])",
        r"\1{{own}}\2",
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])",
        r"\1\2",
        exceptions, True)
    # added to transwikied pages
    text = pywikibot.replaceExcept(text, r"__NOTOC__", "", exceptions, True)
    # tracker element for js upload form
    text = pywikibot.replaceExcept(
        text,
        r"<!-- *{{ImageUpload\|(?:full|basic)}} *-->",
        "", exceptions[1:], True)
    text = pywikibot.replaceExcept(text, r"{{ImageUpload\|(?:basic|full)}}",
                                   "", exceptions, True)
    # duplicated section headers
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *{{int:filedesc}} *\=\=",
        r"\1== {{int:filedesc}} ==",
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n]|^)\=\= *{{int:license}} *\=\=(?:[\r\n ]*)\=\= *{{int:license}} *\=\=",
        r"\1== {{int:license}} ==",
        exceptions, True)
    return text
def removeUselessSpaces(self, text):
    multipleSpacesR = re.compile(" +")
    spaceAtLineEndR = re.compile(" $")
    exceptions = ["comment", "math", "nowiki", "pre", "startspace",
                  "table", "template"]
    text = pywikibot.replaceExcept(text, multipleSpacesR, " ", exceptions)
    text = pywikibot.replaceExcept(text, spaceAtLineEndR, "", exceptions)
    return text
def fixReferences(self, text):
    # http://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # it should be name = " or name=" NOT name ="
    text = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', text)
    # remove empty <ref/>-tag
    text = pywikibot.replaceExcept(text,
                                   r'(?i)(<ref\s*/>|<ref *>\s*</ref>)',
                                   r'', exceptions)
    text = pywikibot.replaceExcept(text,
                                   r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>',
                                   r'<ref \1/>', exceptions)
    return text
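# A minimal sketch of what the fixReferences() regexes do, using plain
# re.sub on an invented sample string; it bypasses pywikibot.replaceExcept
# and its exception handling, so it is for illustration only.
import re
sample = u'<ref name ="foo">x</ref> stale<ref></ref> <ref name="bar" ></ref>'
sample = re.sub(r'(?i)<ref +name(= *| *=)"', r'<ref name="', sample)
sample = re.sub(r'(?i)(<ref\s*/>|<ref *>\s*</ref>)', r'', sample)
sample = re.sub(r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>', r'<ref \1/>', sample)
assert sample == u'<ref name="foo">x</ref> stale <ref name="bar"/>'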
def fixTypo(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace', 'gallery', 'hyperlink', 'interwiki', 'link']
    # change <number> ccm -> <number> cm³
    text = pywikibot.replaceExcept(text, ur'(\d)\s*&nbsp;ccm',
                                   ur'\1&nbsp;cm³', exceptions)
    text = pywikibot.replaceExcept(text, ur'(\d)\s*ccm', ur'\1&nbsp;cm³',
                                   exceptions)
    # Solve wrong Nº sign with °C or °F
    # additional exception requested on fr-wiki for this stuff
    pattern = re.compile(u'«.*?»', re.UNICODE)
    exceptions.append(pattern)
    text = pywikibot.replaceExcept(text, ur'(\d)\s*&nbsp;[º°]([CF])',
                                   ur'\1&nbsp;°\2', exceptions)
    text = pywikibot.replaceExcept(text, ur'(\d)\s*[º°]([CF])',
                                   ur'\1&nbsp;°\2', exceptions)
    text = pywikibot.replaceExcept(text, ur'º([CF])', ur'°\1', exceptions)
    return text
def fixHtml(self, text):
    # Everything case-insensitive (?i)
    # Keep in mind that MediaWiki automatically converts <br> to <br />
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    text = pywikibot.replaceExcept(text, r'(?i)<b>(.*?)</b>',
                                   r"'''\1'''", exceptions)
    text = pywikibot.replaceExcept(text, r'(?i)<strong>(.*?)</strong>',
                                   r"'''\1'''", exceptions)
    text = pywikibot.replaceExcept(text, r'(?i)<i>(.*?)</i>',
                                   r"''\1''", exceptions)
    text = pywikibot.replaceExcept(text, r'(?i)<em>(.*?)</em>',
                                   r"''\1''", exceptions)
    # horizontal line without attributes in a single line
    text = pywikibot.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
                                   r'\1----\2', exceptions)
    # horizontal line with attributes; can't be done with wiki syntax
    # so we only make it XHTML compliant
    text = pywikibot.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
                                   r'<hr \1 />', exceptions)
    # a header where only spaces are in the same line
    for level in range(1, 7):
        equals = '\\1%s \\2 %s\\3' % ("=" * level, "=" * level)
        text = pywikibot.replaceExcept(
            text,
            r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'
            % (level, level),
            r'%s' % equals,
            exceptions)
    # remove empty <ref/>-tag
    text = pywikibot.replaceExcept(text, r'(?i)<ref\s*/>', r'', exceptions)
    # TODO: maybe we can make the bot replace <p> tags with \r\n's.
    return text
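# A minimal sketch of the first fixHtml() rules with plain re.sub on an
# invented sample string (no replaceExcept exception handling):
import re
sample = u'<B>bold</B> and <em>stressed</em>'
sample = re.sub(r'(?i)<b>(.*?)</b>', r"'''\1'''", sample)
sample = re.sub(r'(?i)<em>(.*?)</em>', r"''\1''", sample)
assert sample == u"'''bold''' and ''stressed''"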
def commonsfiledesc(self, text):
    # section headers to {{int:}} versions
    exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
                  'pre', 'source', 'ref', 'timeline']
    text = pywikibot.replaceExcept(text,
                                   r"([\r\n]|^)\=\= *Summary *\=\=",
                                   r"\1== {{int:filedesc}} ==",
                                   exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
        r"\1== {{int:license}} ==",
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n])\=\= *(Licensing|License information|{{int:license-header}}) *\=\=",
        r"\1== {{int:license}} ==",
        exceptions, True)
    # frequent field values to {{int:}} versions
    text = pywikibot.replaceExcept(
        text,
        r'([\r\n]\|[Ss]ource *\= *)(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *([\r\n])',
        r'\1{{own}}\2',
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
        r'\1\2',
        exceptions, True)
    # added to transwikied pages
    text = pywikibot.replaceExcept(text, r'__NOTOC__', '', exceptions, True)
    # tracker element for js upload form
    text = pywikibot.replaceExcept(
        text,
        r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
        '', exceptions[1:], True)
    text = pywikibot.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
                                   '', exceptions, True)
    # duplicated section headers
    text = pywikibot.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *{{int:filedesc}} *\=\=',
        r'\1== {{int:filedesc}} ==',
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:license}} *\=\=(?:[\r\n ]*)\=\= *{{int:license}} *\=\=',
        r'\1== {{int:license}} ==',
        exceptions, True)
    return text
def commonsfiledesc(self, text):
    # section headers to {{int:}} versions
    exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
                  'pre', 'source', 'ref', 'timeline']
    text = pywikibot.replaceExcept(text,
                                   r"([\r\n]|^)\=\= *Summary *\=\=",
                                   r"\1== {{int:filedesc}} ==",
                                   exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
        r"\1== {{int:license-header}} ==",
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r"([\r\n])\=\= *(Licensing|License information|{{int:license}}) *\=\=",
        r"\1== {{int:license-header}} ==",
        exceptions, True)
    # frequent field values to {{int:}} versions
    text = pywikibot.replaceExcept(
        text,
        r'([\r\n]\|[Ss]ource *\= *)(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *([\r\n])',
        r'\1{{own}}\2',
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
        r'\1\2',
        exceptions, True)
    # added to transwikied pages
    text = pywikibot.replaceExcept(text, r'__NOTOC__', '', exceptions, True)
    # tracker element for js upload form
    text = pywikibot.replaceExcept(
        text,
        r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
        '', exceptions[1:], True)
    text = pywikibot.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
                                   '', exceptions, True)
    # duplicated section headers
    text = pywikibot.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *{{int:filedesc}} *\=\=',
        r'\1== {{int:filedesc}} ==',
        exceptions, True)
    text = pywikibot.replaceExcept(
        text,
        r'([\r\n]|^)\=\= *{{int:license-header}} *\=\=(?:[\r\n ]*)\=\= *{{int:license-header}} *\=\=',
        r'\1== {{int:license-header}} ==',
        exceptions, True)
    return text
def fixSyntaxSave(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # external link in double brackets
    text = pywikibot.replaceExcept(text,
                                   r'\[\[(?P<url>https?://[^\]]+?)\]\]',
                                   r'[\g<url>]', exceptions)
    # external link starting with double bracket
    text = pywikibot.replaceExcept(text,
                                   r'\[\[(?P<url>https?://.+?)\]',
                                   r'[\g<url>]', exceptions)
    # external link and description separated by a dash, with
    # whitespace in front of the dash, so that it is clear that
    # the dash is not a legitimate part of the URL.
    text = pywikibot.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    # dash in external link, where the correct end of the URL can
    # be detected from the file extension. It is very unlikely that
    # this will cause mistakes.
    text = pywikibot.replaceExcept(
        text,
        r'\[(?P<url>https?://[^\|\] ]+?(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *\| *(?P<label>[^\|\]]+?)\]',
        r'[\g<url> \g<label>]', exceptions)
    return text
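# A minimal sketch of two fixSyntaxSave() rules with plain re.sub on an
# invented sample (bypassing replaceExcept's exception handling): the
# double-bracketed external link and the pipe-separated URL/label form.
import re
sample = u'[[http://example.org]] and [http://example.org/a.pdf | A PDF]'
sample = re.sub(r'\[\[(?P<url>https?://[^\]]+?)\]\]', r'[\g<url>]', sample)
sample = re.sub(r'\[(?P<url>https?://[^\|\] ]+?'
                r'(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp))'
                r' *\| *(?P<label>[^\|\]]+?)\]',
                r'[\g<url> \g<label>]', sample)
assert sample == u'[http://example.org] and [http://example.org/a.pdf A PDF]'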
def sort_by_country_subcat(subcat, subject):
    print subcat
    subcat = subcat.replace("_", " ")
    subject = subject.replace("_", " ")
    if subcat.startswith(subject):
        temp1 = subcat[len(subject):].lstrip()
        if temp1.startswith("from"):
            temp2 = temp1[len("from"):].lstrip()
        elif temp1.startswith("of"):
            temp2 = temp1[len("of"):].lstrip()
        elif temp1.startswith("in"):
            temp2 = temp1[len("in"):].lstrip()
        else:
            temp2 = ""
        if temp2:
            if temp2.startswith("the"):
                country = temp2[len("the"):].lstrip()
            else:
                country = temp2
            page = wikipedia.Page(wikipedia.getSite(), "Category:" + subcat)
            old = u"\[\[[cC]ategory:" + subject + u" by country[^\]]*\]\]"
            new = u"[[Category:" + subject + u" by country|" + country + u"]]"
            comment = u"Sorting [[:Category:" + subject + u" by country]]"
            newtext = wikipedia.replaceExcept(page.get(), old, new, [])
            wikipedia.showDiff(page.get(), newtext)
            page.put(newtext, comment)
def translateMagicWords(self, text):
    """
    Makes sure that localized versions of magic words are used.
    """
    # not wanted at ru
    # arz uses english stylish codes
    if self.site.lang not in ['arz', 'ru']:
        exceptions = ['nowiki', 'comment', 'math', 'pre']
        for magicWord in ['img_thumbnail', 'img_left', 'img_center',
                          'img_right', 'img_none', 'img_framed',
                          'img_frameless', 'img_border', 'img_upright']:
            aliases = self.site.siteinfo('magicwords').get(magicWord)
            if not aliases:
                continue
            text = pywikibot.replaceExcept(
                text,
                r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) +
                ') *(?P<right>(\|.*?)?\]\])',
                r'[[\g<left>' + aliases[0] + '\g<right>',
                exceptions)
    return text
def removeDeprecatedTemplates(self, text):
    if deprecatedTemplates.has_key(self.site.family.name) and \
       deprecatedTemplates[self.site.family.name].has_key(self.site.lang):
        for template in deprecatedTemplates[self.site.family.name][self.site.lang]:
            if not self.site.nocapitalize:
                template = '[' + template[0].upper() + template[0].lower() \
                           + ']' + template[1:]
            text = wikipedia.replaceExcept(
                text,
                r'\{\{([mM][sS][gG]:)?' + template +
                '(?P<parameters>\|[^}]+|)}}',
                '',
                ['comment', 'math', 'nowiki', 'pre'])
    return text
def sort_by_country_subcat(subcat, subject):
    print subcat
    subcat = subcat.replace('_', ' ')
    subject = subject.replace('_', ' ')
    if subcat.startswith(subject):
        temp1 = subcat[len(subject):].lstrip()
        if temp1.startswith('from'):
            temp2 = temp1[len('from'):].lstrip()
        elif temp1.startswith('of'):
            temp2 = temp1[len('of'):].lstrip()
        elif temp1.startswith('in'):
            temp2 = temp1[len('in'):].lstrip()
        else:
            temp2 = ''
        if temp2:
            if temp2.startswith('the'):
                country = temp2[len('the'):].lstrip()
            else:
                country = temp2
            page = wikipedia.Page(wikipedia.getSite(), 'Category:' + subcat)
            old = u'\[\[[cC]ategory:' + subject + u' by country[^\]]*\]\]'
            new = u'[[Category:' + subject + u' by country|' + country + u']]'
            comment = u'Sorting [[:Category:' + subject + u' by country]]'
            newtext = wikipedia.replaceExcept(page.get(), old, new, [])
            wikipedia.showDiff(page.get(), newtext)
            page.put(newtext, comment)
def translateAndCapitalizeNamespaces(self, text):
    """
    Makes sure that localized namespace names are used.
    """
    # arz uses english stylish codes
    if self.site.sitename() == 'wikipedia:arz':
        return text
    family = self.site.family
    # wiki links aren't parsed here.
    exceptions = ['nowiki', 'comment', 'math', 'pre']
    for nsNumber in family.namespaces:
        if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
            # Skip undefined namespaces
            continue
        namespaces = list(self.site.namespace(nsNumber, all=True))
        thisNs = namespaces.pop(0)
        if nsNumber == 6 and family.name == 'wikipedia' and \
           self.site.lang in ('en', 'fr'):
            # do not change "Image" on en-wiki and fr-wiki
            for image in [u'Image', u'image']:
                if image in namespaces:
                    namespaces.remove(image)
        # skip main (article) namespace
        if thisNs and namespaces:
            text = pywikibot.replaceExcept(
                text,
                r'\[\[\s*(' + '|'.join(namespaces) +
                ') *:(?P<nameAndLabel>.*?)\]\]',
                r'[[' + thisNs + ':\g<nameAndLabel>]]',
                exceptions)
    return text
def translateAndCapitalizeNamespaces(self, text):
    """
    Makes sure that localized namespace names are used.
    """
    # arz uses english stylish codes
    if self.site.sitename() == 'wikipedia:arz':
        return text
    family = self.site.family
    # wiki links aren't parsed here.
    exceptions = ['nowiki', 'comment', 'math', 'pre']
    for nsNumber in family.namespaces:
        if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
            # Skip undefined namespaces
            continue
        namespaces = list(self.site.namespace(nsNumber, all=True))
        thisNs = namespaces.pop(0)
        if nsNumber == 6 and family.name == 'wikipedia' and \
           self.site.lang in ('en', 'fr'):
            # do not change "Image" on en-wiki and fr-wiki
            for image in [u'Image', u'image']:
                if image in namespaces:
                    namespaces.remove(image)
        # skip main (article) namespace
        if thisNs and namespaces:
            text = pywikibot.replaceExcept(
                text,
                r'\[\[\s*(' + '|'.join(namespaces) +
                ') *:(?P<nameAndLabel>.*?)\]\]',
                r'[[' + thisNs + ':\g<nameAndLabel>]]',
                exceptions)
    return text
def translateAndCapitalizeNamespaces(self, text):
    """
    Makes sure that localized namespace names are used.
    """
    # arz uses english stylish codes
    if self.site.sitename() == "wikipedia:arz":
        return text
    family = self.site.family
    # wiki links aren't parsed here.
    exceptions = ["nowiki", "comment", "math", "pre"]
    for nsNumber in family.namespaces:
        if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
            # Skip undefined namespaces
            continue
        namespaces = list(self.site.namespace(nsNumber, all=True))
        thisNs = namespaces.pop(0)
        if (nsNumber == 6 and family.name == "wikipedia"
                and self.site.lang in ("en", "fr")):
            # do not change "Image" on en-wiki and fr-wiki
            for image in [u"Image", u"image"]:
                if image in namespaces:
                    namespaces.remove(image)
        # skip main (article) namespace
        if thisNs and namespaces:
            text = pywikibot.replaceExcept(
                text,
                r"\[\[\s*(" + "|".join(namespaces) +
                ") *:(?P<nameAndLabel>.*?)\]\]",
                r"[[" + thisNs + ":\g<nameAndLabel>]]",
                exceptions)
    return text
def fixArabicLetters(self, text):
    if self.site.lang == "ckb" or self.site.lang == "fa":
        exceptions = [
            "gallery",
            "hyperlink",
            "interwiki",
            # but changes letters inside wikilinks
            #'link',
            "math",
            "pre",
            "template",
            "timeline",
            "ref",
            "source",
            "startspace",
            "inputbox",
        ]
        # do not change inside file links
        namespaces = list(self.site.namespace(6, all=True))
        pattern = re.compile(u"\[\[(" + "|".join(namespaces) +
                             "):.+?\..+?\]\]", re.UNICODE)
        exceptions.append(pattern)
        text = pywikibot.replaceExcept(text, u",", u"،", exceptions)
        if self.site.lang == "ckb":
            text = pywikibot.replaceExcept(text, ur"ه([.،_<\]\s])",
                                           ur"ە\1", exceptions)
            # heh followed by ZWNJ; the joiner is invisible, so it is
            # written as an escape here
            text = pywikibot.replaceExcept(text, u"ه\u200c", u"ە",
                                           exceptions)
            text = pywikibot.replaceExcept(text, u"ه", u"ھ", exceptions)
            text = pywikibot.replaceExcept(text, u"ك", u"ک", exceptions)
            text = pywikibot.replaceExcept(text, ur"[ىي]", u"ی", exceptions)
        # replace persian digits
        for i in range(0, 10):
            if self.site.lang == "ckb":
                text = pywikibot.replaceExcept(text, u"۰۱۲۳۴۵۶۷۸۹"[i],
                                               u"٠١٢٣٤٥٦٧٨٩"[i], exceptions)
            else:
                text = pywikibot.replaceExcept(text, u"٠١٢٣٤٥٦٧٨٩"[i],
                                               u"۰۱۲۳۴۵۶۷۸۹"[i], exceptions)
        # do not change digits in class, style and table params
        pattern = re.compile(u'=".*?"', re.UNICODE)
        exceptions.append(pattern)
        # do not change digits inside html-tags
        pattern = re.compile(u"<[/]*?[^</]+?[/]*?>", re.UNICODE)
        exceptions.append(pattern)
        exceptions.append("table")  # exclude tables for now
        for i in range(0, 10):
            if self.site.lang == "ckb":
                text = pywikibot.replaceExcept(text, str(i),
                                               u"٠١٢٣٤٥٦٧٨٩"[i], exceptions)
            else:
                text = pywikibot.replaceExcept(text, str(i),
                                               u"۰۱۲۳۴۵۶۷۸۹"[i], exceptions)
    return text
def fixArabicLetters(self, text):
    exceptions = [
        'gallery',
        'hyperlink',
        'interwiki',
        # but changes letters inside wikilinks
        #'link',
        'math',
        'pre',
        'template',
        'timeline',
        'ref',
        'source',
        'startspace',
        'inputbox',
    ]
    # valid digits
    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',
        'fa': u'۰۱۲۳۴۵۶۷۸۹',
    }
    new = digits.pop(self.site.lang)
    # This only works if there are only two items in digits dict
    old = digits[digits.keys()[0]]
    # do not change inside file links
    namespaces = list(self.site.namespace(6, all=True))
    pattern = re.compile(u'\[\[(' + '|'.join(namespaces) +
                         '):.+?\.\w+? *(\|((\[\[.*?\]\])|.)*)?\]\]',
                         re.UNICODE)
    exceptions.append(pattern)
    text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
    if self.site.lang == 'ckb':
        text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
                                       ur'ە\1', exceptions)
        # heh followed by ZWNJ; the joiner is invisible, so it is
        # written as an escape here
        text = pywikibot.replaceExcept(text, u'ه\u200c', u'ە', exceptions)
        text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
        text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
        text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
    # replace persian/arabic digits
    for i in xrange(0, 10):
        text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
    # do not change digits in class, style and table params
    pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
    exceptions.append(pattern)
    # do not change digits inside html-tags
    pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
    exceptions.append(pattern)
    exceptions.append('table')  # exclude tables for now
    # replace digits
    for i in xrange(0, 10):
        text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
    ## fixing pipe and trailing for fa. Thanks ZxxZxxZ
    if self.site.lang == 'fa':
        faChrs = u'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + u'ًٌٍَُِّْٓٔ'
        text = re.sub(u'\[\[([^\]\|]*)]]([%s]+)' % faChrs,
                      ur'[[\1|\1\2]]', text)
        text = re.sub(u'\[\[([^\]\|]*)\|(.+?)]]([%s]+)' % faChrs,
                      ur'[[\1|\2\3]]', text)
    return text
def fixArabicLetters(self, text):
    if self.site.lang == 'ckb' or self.site.lang == 'fa':
        exceptions = [
            'gallery',
            'hyperlink',
            'interwiki',
            # but changes letters inside wikilinks
            #'link',
            'math',
            'pre',
            'template',
            'timeline',
            'ref',
            'source',
            'startspace',
            'inputbox',
        ]
        # do not change inside file links
        namespaces = list(self.site.namespace(6, all=True))
        pattern = re.compile(
            u'\[\[(' + '|'.join(namespaces) + '):.+?\..+?\]\]',
            re.UNICODE)
        exceptions.append(pattern)
        text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
        if self.site.lang == 'ckb':
            text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
                                           ur'ە\1', exceptions)
            # heh followed by ZWNJ; the joiner is invisible, so it is
            # written as an escape here
            text = pywikibot.replaceExcept(text, u'ه\u200c', u'ە',
                                           exceptions)
            text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
            text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
            text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
        # replace persian digits
        for i in range(0, 10):
            if self.site.lang == 'ckb':
                text = pywikibot.replaceExcept(text, u'۰۱۲۳۴۵۶۷۸۹'[i],
                                               u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
            else:
                text = pywikibot.replaceExcept(text, u'٠١٢٣٤٥٦٧٨٩'[i],
                                               u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
        # do not change digits in class, style and table params
        pattern = re.compile(u'=".*?"', re.UNICODE)
        exceptions.append(pattern)
        # do not change digits inside html-tags
        pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
        exceptions.append(pattern)
        exceptions.append('table')  # exclude tables for now
        for i in range(0, 10):
            if self.site.lang == 'ckb':
                text = pywikibot.replaceExcept(text, str(i),
                                               u'٠١٢٣٤٥٦٧٨٩'[i], exceptions)
            else:
                text = pywikibot.replaceExcept(text, str(i),
                                               u'۰۱۲۳۴۵۶۷۸۹'[i], exceptions)
    return text
def fixStyle(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # convert prettytable to wikitable class
    if self.site.lang in ('de', 'en'):
        text = pywikibot.replaceExcept(text,
                                       ur'(class="[^"]*)prettytable([^"]*")',
                                       ur'\1wikitable\2', exceptions)
    return text
def markActiveTables(self, text):
    """
    Marks all table start and end tags that are not disabled by nowiki tags,
    comments etc. We will then later only work on these marked tags.
    """
    tableStartTagR = re.compile("<table", re.IGNORECASE)
    tableEndTagR = re.compile("</table>", re.IGNORECASE)
    text = wikipedia.replaceExcept(text, tableStartTagR, "<##table##",
                                   exceptions=["comment", "math", "nowiki",
                                               "pre", "source"])
    text = wikipedia.replaceExcept(text, tableEndTagR, "</##table##>",
                                   exceptions=["comment", "math", "nowiki",
                                               "pre", "source"])
    return text
def markActiveTables(self, text):
    """
    Marks all table start and end tags that are not disabled by nowiki tags,
    comments etc. We will then later only work on these marked tags.
    """
    tableStartTagR = re.compile("<table", re.IGNORECASE)
    tableEndTagR = re.compile("</table>", re.IGNORECASE)
    text = pywikibot.replaceExcept(text, tableStartTagR, "<##table##",
                                   exceptions=['comment', 'math', 'nowiki',
                                               'pre', 'source'])
    text = pywikibot.replaceExcept(text, tableEndTagR, "</##table##>",
                                   exceptions=['comment', 'math', 'nowiki',
                                               'pre', 'source'])
    return text
def removeNonBreakingSpaceBeforePercent(self, text):
    '''
    Newer MediaWiki versions automatically place a non-breaking space in
    front of a percent sign, so it is no longer required to place it
    manually.
    '''
    text = pywikibot.replaceExcept(text, r'(\d)&nbsp;%', r'\1 %',
                                   ['timeline'])
    return text
def removeNonBreakingSpaceBeforePercent(self, text):
    """
    Newer MediaWiki versions automatically place a non-breaking space in
    front of a percent sign, so it is no longer required to place it
    manually.
    """
    text = pywikibot.replaceExcept(text, r"(\d)&nbsp;%", r"\1 %",
                                   ["timeline"])
    return text
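# A minimal sketch of the &nbsp; removal with plain re.sub on an invented
# sample (no replaceExcept exception handling):
import re
assert re.sub(r'(\d)&nbsp;%', r'\1 %', u'50&nbsp;% done') == u'50 % done'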
def transferImage(self, sourceImagePage, debug=False):
    """Gets a wikilink to an image, downloads it and its description,
    and uploads it to another wikipedia.

    Returns the filename which was used to upload the image.
    This function is used by imagetransfer.py and by copy_table.py
    """
    sourceSite = sourceImagePage.site()
    if debug: print "-" * 50
    if debug: print "Found image: %s" % sourceImagePage.title()
    url = sourceImagePage.fileUrl()
    newname = sourceImagePage.titleWithoutNamespace()
    pywikibot.output(u"URL should be: %s" % url)
    # localize the text that should be printed on the image description page
    try:
        description = sourceImagePage.get()
        # unlink categories
        #description = pywikibot.removeCategoryLinks(description, pywikibot.getSite('commons', 'commons'))
        description = re.sub(u'\[\[Category', u'[[:Category', description,
                             flags=re.IGNORECASE)
        # try to translate license templates
        if (sourceSite.sitename(),
                self.targetSite.sitename()) in licenseTemplates:
            for old, new in licenseTemplates[
                    (sourceSite.sitename(),
                     self.targetSite.sitename())].iteritems():
                new = '{{%s}}' % new
                old = re.compile('{{%s}}' % old)
                description = pywikibot.replaceExcept(
                    description, old, new,
                    ['comment', 'math', 'nowiki', 'pre'])
        description = pywikibot.translate(self.targetSite, copy_message) \
                      % (sourceSite, description)
        description += '\n\n' + sourceImagePage.getFileVersionHistoryTable()
        # add interwiki link
        if sourceSite.family == self.targetSite.family:
            description += "\r\n\r\n" + \
                           sourceImagePage.aslink(forceInterwiki=True)
        # add cat
        description += "\n[[Kategooria:Commonsist kopeeritud pildid]]\n"
    except pywikibot.NoPage:
        description = ''
        print "Image does not exist or description page is empty."
    except pywikibot.IsRedirectPage:
        description = ''
        print "Image description page is redirect."
    else:
        #bot = UploadRobot(url=self.imagePage.fileUrl(), description=CH,
        #                  useFilename=self.newname, keepFilename=True,
        #                  verifyDescription=False, ignoreWarning=True,
        #                  targetSite=pywikibot.getSite('commons', 'commons'))
        bot = upload.UploadRobot(url=url, description=description,
                                 useFilename=newname, keepFilename=True,
                                 verifyDescription=False,
                                 ignoreWarning=False,
                                 targetSite=self.targetSite)
        # try to upload
        targetFilename = bot.run()
        if targetFilename and self.targetSite.family.name == 'commons' \
           and self.targetSite.lang == 'commons':
            # upload to Commons was successful
            reason = pywikibot.translate(sourceSite, nowCommonsMessage)
            # try to delete the original image if we have a sysop account
            if sourceSite.family.name in config.sysopnames and \
               sourceSite.lang in config.sysopnames[sourceSite.family.name]:
                if sourceImagePage.delete(reason):
                    return
            if sourceSite.lang in nowCommonsTemplate and \
               sourceSite.family.name in config.usernames and \
               sourceSite.lang in config.usernames[sourceSite.family.name]:
                # add the nowCommons template.
                pywikibot.output(u'Adding nowCommons template to %s'
                                 % sourceImagePage.title())
                sourceImagePage.put(sourceImagePage.get() + '\n\n' +
                                    nowCommonsTemplate[sourceSite.lang]
                                    % targetFilename,
                                    comment=nowCommonsMessage[sourceSite.lang])
def fixHtml(self, text):
    # Everything case-insensitive (?i)
    # Keep in mind that MediaWiki automatically converts <br> to <br />
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    text = pywikibot.replaceExcept(text, r'(?i)<b>(.*?)</b>',
                                   r"'''\1'''", exceptions)
    text = pywikibot.replaceExcept(text, r'(?i)<strong>(.*?)</strong>',
                                   r"'''\1'''", exceptions)
    text = pywikibot.replaceExcept(text, r'(?i)<i>(.*?)</i>',
                                   r"''\1''", exceptions)
    text = pywikibot.replaceExcept(text, r'(?i)<em>(.*?)</em>',
                                   r"''\1''", exceptions)
    # horizontal line without attributes in a single line
    text = pywikibot.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
                                   r'\1----\2', exceptions)
    # horizontal line with attributes; can't be done with wiki syntax
    # so we only make it XHTML compliant
    text = pywikibot.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
                                   r'<hr \1 />', exceptions)
    # a header where only spaces are in the same line
    for level in range(1, 7):
        equals = '\\1%s \\2 %s\\3' % ("=" * level, "=" * level)
        text = pywikibot.replaceExcept(
            text,
            r'(?i)([\r\n]) *<h%d> *([^<]+?) *</h%d> *([\r\n])'
            % (level, level),
            r'%s' % equals,
            exceptions)
    # TODO: maybe we can make the bot replace <p> tags with \r\n's.
    return text
def fixArabicLetters(self, text):
    exceptions = [
        'gallery',
        'hyperlink',
        'interwiki',
        # but changes letters inside wikilinks
        #'link',
        'math',
        'pre',
        'template',
        'timeline',
        'ref',
        'source',
        'startspace',
        'inputbox',
    ]
    # valid digits
    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',
        'fa': u'۰۱۲۳۴۵۶۷۸۹',
    }
    faChrs = u'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + digits['fa']
    new = digits.pop(self.site.lang)
    # This only works if there are only two items in digits dict
    old = digits[digits.keys()[0]]
    # do not change inside file links
    namespaces = list(self.site.namespace(6, all=True))
    pattern = re.compile(u'\[\[(' + '|'.join(namespaces) +
                         '):.+?\.\w+? *(\|((\[\[.*?\]\])|.)*)?\]\]',
                         re.UNICODE)
    # not to let bot edits in latin content
    exceptions.append(re.compile(u"[^%(fa)s] *?\"*? *?, *?[^%(fa)s]"
                                 % {'fa': faChrs}))
    exceptions.append(pattern)
    text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
    if self.site.lang == 'ckb':
        text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
                                       ur'ە\1', exceptions)
        # heh followed by ZWNJ; the joiner is invisible, so it is
        # written as an escape here
        text = pywikibot.replaceExcept(text, u'ه\u200c', u'ە', exceptions)
        text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
        text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
        text = pywikibot.replaceExcept(text, u'[ىي]', u'ی', exceptions)
    return text
    # replace persian/arabic digits
    ## deactivated due to bug #3539407
    for i in xrange(0, 10):
        text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
    # do not change digits in class, style and table params
    pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
    exceptions.append(pattern)
    # do not change digits inside html-tags
    pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
    exceptions.append(pattern)
    exceptions.append('table')  # exclude tables for now
    # replace digits
    for i in xrange(0, 10):
        text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
    return text
def categoriesChecked(category):
    page = wikipedia.Page(wikipedia.getSite(), category)
    if page.exists():
        old = u'\{\{UncategorizedHeader([^\}]*)\}\}'
        new = u'{{UncategorizedHeader\\1|galleries=~~~~}}'
        newtext = wikipedia.replaceExcept(page.get(), old, new, [])
        comment = u'No more images in galleries'
        wikipedia.showDiff(page.get(), newtext)
        page.put(newtext, comment)
def fixStyle(self, text):
    exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                  'startspace']
    # convert prettytable to wikitable class
    if self.site.lang in ('de', 'en'):
        text = pywikibot.replaceExcept(text,
                                       r'(class="[^"]*)prettytable([^"]*")',
                                       r'\1wikitable\2', exceptions)
    return text
def fixArabicLetters(self, text):
    exceptions = [
        'gallery',
        'hyperlink',
        'interwiki',
        # but changes letters inside wikilinks
        #'link',
        'math',
        'pre',
        'template',
        'timeline',
        'ref',
        'source',
        'startspace',
        'inputbox',
    ]
    # valid digits
    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',
        'fa': u'۰۱۲۳۴۵۶۷۸۹',
    }
    new = digits.pop(self.site.lang)
    # This only works if there are only two items in digits dict
    old = digits[digits.keys()[0]]
    # do not change inside file links
    namespaces = list(self.site.namespace(6, all=True))
    pattern = re.compile(u'\[\[(' + '|'.join(namespaces) +
                         '):.+?\..+?\]\]', re.UNICODE)
    exceptions.append(pattern)
    text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
    if self.site.lang == 'ckb':
        text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
                                       ur'ە\1', exceptions)
        # heh followed by ZWNJ; the joiner is invisible, so it is
        # written as an escape here
        text = pywikibot.replaceExcept(text, u'ه\u200c', u'ە', exceptions)
        text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
        text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
        text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
    # replace persian digits
    for i in range(0, 10):
        text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
    # do not change digits in class, style and table params
    pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
    exceptions.append(pattern)
    # do not change digits inside html-tags
    pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
    exceptions.append(pattern)
    exceptions.append('table')  # exclude tables for now
    for i in range(0, 10):
        text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
    return text
def putSpacesInLists(self, text):
    """
    For better readability of bullet list and enumeration wiki source code,
    puts a space between the * or # and the text.

    NOTE: This space is recommended in the syntax help on the English,
    German, and French Wikipedia. It might be that it is not wanted on other
    wikis. If there are any complaints, please file a bug report.
    """
    # FIXME: This breaks redirects.
    text = wikipedia.replaceExcept(
        text,
        r'(?m)^(?P<bullet>(\*+|#+):*)(?P<char>[^\s\*#:].+?)',
        '\g<bullet> \g<char>',
        ['comment', 'math', 'nowiki', 'pre'])
    return text
def putSpacesInLists(self, text):
    """
    For better readability of bullet list and enumeration wiki source code,
    puts a space between the * or # and the text.

    NOTE: This space is recommended in the syntax help on the English,
    German, and French Wikipedia. It might be that it is not wanted on other
    wikis. If there are any complaints, please file a bug report.
    """
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'source', 'timeline']
    if not self.redirect and \
       pywikibot.calledModuleName() != 'capitalize_redirects':
        text = pywikibot.replaceExcept(
            text,
            r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
            '\g<bullet> \g<char>',
            exceptions)
    return text
def transferImage(self, sourceImagePage, debug=False):
    """Gets a wikilink to an image, downloads it and its description,
    and uploads it to another wikipedia.

    Returns the filename which was used to upload the image.
    This function is used by imagetransfer.py and by copy_table.py
    """
    sourceSite = sourceImagePage.site()
    if debug: print "-" * 50
    if debug: print "Found image: %s" % sourceImagePage.title()
    url = sourceImagePage.fileUrl().encode('utf-8')
    pywikibot.output(u"URL should be: %s" % url)
    # localize the text that should be printed on the image description page
    try:
        description = sourceImagePage.get()
        # try to translate license templates
        if (sourceSite.sitename(),
                self.targetSite.sitename()) in licenseTemplates:
            for old, new in licenseTemplates[
                    (sourceSite.sitename(),
                     self.targetSite.sitename())].iteritems():
                new = '{{%s}}' % new
                old = re.compile('{{%s}}' % old)
                description = pywikibot.replaceExcept(
                    description, old, new,
                    ['comment', 'math', 'nowiki', 'pre'])
        description = pywikibot.translate(self.targetSite, copy_message) \
                      % (sourceSite, description)
        description += '\n\n' + sourceImagePage.getFileVersionHistoryTable()
        # add interwiki link
        if sourceSite.family == self.targetSite.family:
            description += "\r\n\r\n" + \
                           sourceImagePage.aslink(forceInterwiki=True)
    except pywikibot.NoPage:
        description = ''
        print "Image does not exist or description page is empty."
    except pywikibot.IsRedirectPage:
        description = ''
        print "Image description page is redirect."
    else:
        bot = upload.UploadRobot(url=url, description=description,
                                 targetSite=self.targetSite,
                                 urlEncoding=sourceSite.encoding())
        # try to upload
        targetFilename = bot.run()
        if targetFilename and self.targetSite.family.name == 'commons' \
           and self.targetSite.lang == 'commons':
            # upload to Commons was successful
            reason = pywikibot.translate(sourceSite, nowCommonsMessage)
            # try to delete the original image if we have a sysop account
            if sourceSite.family.name in config.sysopnames and \
               sourceSite.lang in config.sysopnames[sourceSite.family.name]:
                if sourceImagePage.delete(reason):
                    return
            if sourceSite.lang in nowCommonsTemplate and \
               sourceSite.family.name in config.usernames and \
               sourceSite.lang in config.usernames[sourceSite.family.name]:
                # add the nowCommons template.
                pywikibot.output(u'Adding nowCommons template to %s'
                                 % sourceImagePage.title())
                sourceImagePage.put(sourceImagePage.get() + '\n\n' +
                                    nowCommonsTemplate[sourceSite.lang]
                                    % targetFilename,
                                    comment=nowCommonsMessage[sourceSite.lang])
def cleanUpSectionHeaders(self, text):
    """
    For better readability of section header source code, puts a space
    between the equal signs and the title.
    Example: ==Section title==
    becomes == Section title ==

    NOTE: This space is recommended in the syntax help on the English and
    German Wikipedia. It might be that it is not wanted on other wikis.
    If there are any complaints, please file a bug report.
    """
    for level in range(1, 7):
        equals = '=' * level
        text = pywikibot.replaceExcept(
            text,
            r'\n' + equals + ' *(?P<title>[^=]+?) *' + equals + ' *\r\n',
            '\n' + equals + ' \g<title> ' + equals + '\r\n',
            ['comment', 'math', 'nowiki', 'pre'])
    return text
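# A minimal sketch of one level of cleanUpSectionHeaders() with plain
# re.sub on an invented sample (no replaceExcept exception handling):
import re
sample = u'\n==Section title==\r\n'
assert re.sub(r'\n== *(?P<title>[^=]+?) *== *\r\n',
              '\n== \g<title> ==\r\n',
              sample) == u'\n== Section title ==\r\n'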
def replaceDeprecatedTemplates(self, text):
    exceptions = ['comment', 'math', 'nowiki', 'pre']
    if self.site.family.name in deprecatedTemplates and \
       self.site.lang in deprecatedTemplates[self.site.family.name]:
        for template in deprecatedTemplates[self.site.family.name][self.site.lang]:
            old = template[0]
            new = template[1]
            if new is None:
                new = ''
            else:
                new = '{{' + new + '}}'
            if not self.site.nocapitalize:
                old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:]
            text = pywikibot.replaceExcept(
                text,
                r'\{\{([mM][sS][gG]:)?' + old +
                '(?P<parameters>\|[^}]+|)}}',
                new, exceptions)
    return text
def fixArabicLetters(self, text):
    exceptions = [
        'gallery',
        'hyperlink',
        'interwiki',
        # but changes letters inside wikilinks
        #'link',
        'math',
        'pre',
        'template',
        'timeline',
        'ref',
        'source',
        'startspace',
        'inputbox',
    ]
    # valid digits
    digits = {
        'ckb': u'٠١٢٣٤٥٦٧٨٩',
        'fa': u'۰۱۲۳۴۵۶۷۸۹',
    }
    new = digits.pop(self.site.lang)
    # This only works if there are only two items in digits dict
    old = digits[digits.keys()[0]]
    # do not change inside file links
    namespaces = list(self.site.namespace(6, all=True))
    pattern = re.compile(u'\[\[(' + '|'.join(namespaces) +
                         '):.+?\..+?\]\]', re.UNICODE)
    exceptions.append(pattern)
    text = pywikibot.replaceExcept(text, u',', u'،', exceptions)
    if self.site.lang == 'ckb':
        text = pywikibot.replaceExcept(text, ur'ه([.،_<\]\s])',
                                       ur'ە\1', exceptions)
        # heh followed by ZWNJ; the joiner is invisible, so it is
        # written as an escape here
        text = pywikibot.replaceExcept(text, u'ه\u200c', u'ە', exceptions)
        text = pywikibot.replaceExcept(text, u'ه', u'ھ', exceptions)
        text = pywikibot.replaceExcept(text, u'ك', u'ک', exceptions)
        text = pywikibot.replaceExcept(text, ur'[ىي]', u'ی', exceptions)
    # replace persian digits
    for i in range(0, 10):
        text = pywikibot.replaceExcept(text, old[i], new[i], exceptions)
    # do not change digits in class, style and table params
    pattern = re.compile(u'\w+=(".+?"|\d+)', re.UNICODE)
    exceptions.append(pattern)
    # do not change digits inside html-tags
    pattern = re.compile(u'<[/]*?[^</]+?[/]*?>', re.UNICODE)
    exceptions.append(pattern)
    exceptions.append('table')  # exclude tables for now
    ## fixing pipe and trailing for fa. Thanks ZxxZxxZ
    if self.site.lang == 'fa':
        faChrs = u'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + u'ًٌٍَُِّْٓٔ'
        text = re.sub(u'\[\[([^\]\|]*)]]([%s]+)' % faChrs,
                      ur'[[\1|\1\2]]', text)
        text = re.sub(u'\[\[([^\]\|]*)\|(.+?)]]([%s]+)' % faChrs,
                      ur'[[\1|\2\3]]', text)
    for i in range(0, 10):
        text = pywikibot.replaceExcept(text, str(i), new[i], exceptions)
    return text
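# A minimal sketch (invented sample, plain str.replace instead of
# replaceExcept) of the digit translation that fixArabicLetters() applies
# when the wiki language is fa:
fa_digits = u'۰۱۲۳۴۵۶۷۸۹'
sample = u'1392'
for i in range(0, 10):
    sample = sample.replace(str(i), fa_digits[i])
assert sample == u'۱۳۹۲'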
def sort_TOL_subcat(parent, child):
    suffix = child.titleWithoutNamespace().replace(
        parent.titleWithoutNamespace(), u'').lstrip()
    wikipedia.output(parent.titleWithoutNamespace())
    wikipedia.output(suffix)
    # Replace \[\[[cC]ategory:<parent>[^\]]*\]\]
    # With [[Category:<parent>|<suffix>]]
    old = u'\[\[[cC]ategory:' + parent.titleWithoutNamespace() + \
          u'[^\]]*\]\]'
    new = u'[[Category:' + parent.titleWithoutNamespace() + u'|' + \
          suffix + u']]'
    #newgal = u'[[' + child.title() + u'| ]]'
    newtext = wikipedia.replaceExcept(child.get(), old, new, [])
    comment = u'Sorting category'
    #commentgal = u'Moving to category with the same name'
    wikipedia.showDiff(child.get(), newtext)
    child.put(newtext, comment)
def doReplacements(self, original_text):
    """
    Returns the text which is generated by applying all replacements to the
    given text.
    """
    new_text = original_text
    exceptions = []
    if "inside-tags" in self.exceptions:
        exceptions += self.exceptions['inside-tags']
    if "inside" in self.exceptions:
        exceptions += self.exceptions['inside']
    for old, new in self.replacements:
        if self.sleep is not None:
            time.sleep(self.sleep)
        new_text = pywikibot.replaceExcept(new_text, old, new, exceptions,
                                           allowoverlap=self.allowoverlap)
    return new_text
def putSpacesInLists(self, text):
    """
    For better readability of bullet list and enumeration wiki source code,
    puts a space between the * or # and the text.

    NOTE: This space is recommended in the syntax help on the English,
    German, and French Wikipedia. It might be that it is not wanted on other
    wikis. If there are any complaints, please file a bug report.
    """
    exceptions = ['comment', 'math', 'nowiki', 'pre', 'source', 'timeline']
    if not self.redirect and \
       pywikibot.calledModuleName() != 'capitalize_redirects':
        text = pywikibot.replaceExcept(
            text,
            r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
            '\g<bullet> \g<char>',
            exceptions)
    return text
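# A minimal sketch of the putSpacesInLists() pattern with plain re.sub on
# an invented sample (no replaceExcept exception handling):
import re
sample = u'*item one\n#item two'
assert re.sub(r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)(?P<char>[^\s\*#:;].+?)',
              '\g<bullet> \g<char>', sample) == u'* item one\n# item two'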
def translateAndCapitalizeNamespaces(self, text):
    """
    Makes sure that localized namespace names are used.
    """
    # arz uses english stylish codes
    if self.site.sitename() == 'wikipedia:arz':
        return text
    family = self.site.family
    # wiki links aren't parsed here.
    exceptions = ['nowiki', 'comment', 'math', 'pre']
    for nsNumber in family.namespaces:
        if not family.isDefinedNSLanguage(nsNumber, self.site.lang):
            # Skip undefined namespaces
            continue
        if nsNumber in (2, 3):
            # Skip user namespace, maybe gender is used
            continue
        namespaces = list(self.site.namespace(nsNumber, all=True))
        thisNs = namespaces.pop(0)
        if nsNumber == 6 and family.name == 'wikipedia':
            if self.site.lang in ('en', 'fr'):
                # do not change "Image" on en-wiki and fr-wiki
                for image in [u'Image', u'image']:
                    if image in namespaces:
                        namespaces.remove(image)
            if self.site.lang == 'hu':
                # do not change "Kép" on hu-wiki
                for image in [u'Kép', u'kép']:
                    if image in namespaces:
                        namespaces.remove(image)
            elif self.site.lang == 'pt':
                # bug #3346901 should be implemented
                continue
        # skip main (article) namespace
        if thisNs and namespaces:
            text = pywikibot.replaceExcept(
                text,
                r'\[\[\s*(%s) *:(?P<nameAndLabel>.*?)\]\]'
                % '|'.join(namespaces),
                r'[[%s:\g<nameAndLabel>]]' % thisNs,
                exceptions)
    return text
def __iter__(self):
    try:
        for entry in self.parser:
            if self.skipping:
                if entry.title != self.xmlStart:
                    continue
                self.skipping = False
            if not self.isTitleExcepted(entry.title) \
               and not self.isTextExcepted(entry.text):
                new_text = entry.text
                for old, new in self.replacements:
                    new_text = pywikibot.replaceExcept(
                        new_text, old, new, self.excsInside, self.site)
                if new_text != entry.text:
                    yield pywikibot.Page(self.site, entry.title)
    except KeyboardInterrupt:
        try:
            if not self.skipping:
                pywikibot.output(
                    u'To resume, use "-xmlstart:%s" on the command line.'
                    % entry.title)
        except NameError:
            pass
def validXhtml(self, text):
    text = pywikibot.replaceExcept(text, r'(?i)<br[ /]*>', r'<br />',
                                   ['comment', 'math', 'nowiki', 'pre'])
    return text
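# A minimal sketch of validXhtml() with plain re.sub on an invented sample
# (no replaceExcept exception handling):
import re
assert re.sub(r'(?i)<br[ /]*>', r'<br />',
              u'a<br>b<BR/>c') == u'a<br />b<br />c'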
def cleanUpLinks(self, text):
    # helper function which works on one link and either returns it
    # unmodified, or returns a replacement.
    def handleOneLink(match):
        titleWithSection = match.group('titleWithSection')
        label = match.group('label')
        trailingChars = match.group('linktrail')
        newline = match.group('newline')
        if not self.site.isInterwikiLink(titleWithSection):
            # The link looks like this:
            # [[page_title|link_text]]trailing_chars
            # We only work on namespace 0 because pipes and linktrails work
            # differently for images and categories.
            try:
                page = pywikibot.Page(self.site, titleWithSection)
            #except pywikibot.InvalidTitle:
            except:
                # empty self link occurs
                return match.group()
            if page.namespace() == 0:
                # Replace underlines by spaces, also multiple underlines
                titleWithSection = re.sub('_+', ' ', titleWithSection)
                # Remove double spaces
                titleWithSection = re.sub(' +', ' ', titleWithSection)
                # Remove unnecessary leading spaces from title,
                # but remember if we did this because we eventually want
                # to re-add it outside of the link later.
                titleLength = len(titleWithSection)
                titleWithSection = titleWithSection.lstrip()
                hadLeadingSpaces = (len(titleWithSection) != titleLength)
                hadTrailingSpaces = False
                # Remove unnecessary trailing spaces from title,
                # but remember if we did this because it may affect
                # the linktrail and because we eventually want to
                # re-add it outside of the link later.
                if not trailingChars:
                    titleLength = len(titleWithSection)
                    titleWithSection = titleWithSection.rstrip()
                    hadTrailingSpaces = (len(titleWithSection) !=
                                         titleLength)

                # Convert URL-encoded characters to unicode
                titleWithSection = pywikibot.url2unicode(titleWithSection,
                                                         site=self.site)

                if titleWithSection == '':
                    # just skip empty links.
                    return match.group()

                # Remove unnecessary initial and final spaces from label.
                # Please note that some editors prefer spaces around pipes.
                # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
                if label is not None:
                    # Remove unnecessary leading spaces from label,
                    # but remember if we did this because we want
                    # to re-add it outside of the link later.
                    labelLength = len(label)
                    label = label.lstrip()
                    hadLeadingSpaces = (len(label) != labelLength)
                    # Remove unnecessary trailing spaces from label,
                    # but remember if we did this because it affects
                    # the linktrail.
                    if not trailingChars:
                        labelLength = len(label)
                        label = label.rstrip()
                        hadTrailingSpaces = (len(label) != labelLength)
                else:
                    label = titleWithSection
                if trailingChars:
                    label += trailingChars

                if titleWithSection == label or \
                   titleWithSection[0].lower() + \
                   titleWithSection[1:] == label:
                    newLink = "[[%s]]" % label
                # Check if we can create a link with trailing characters
                # instead of a pipelink
                elif len(titleWithSection) <= len(label) and \
                     label[:len(titleWithSection)] == titleWithSection and \
                     re.sub(trailR, '',
                            label[len(titleWithSection):]) == '':
                    newLink = "[[%s]]%s" % (label[:len(titleWithSection)],
                                            label[len(titleWithSection):])
                else:
                    # Try to capitalize the first letter of the title.
                    # Maybe this feature is not useful for languages that
                    # don't capitalize nouns...
                    #if not self.site.nocapitalize:
                    if self.site.sitename() == 'wikipedia:de':
                        titleWithSection = titleWithSection[0].upper() + \
                                           titleWithSection[1:]
                    newLink = "[[%s|%s]]" % (titleWithSection, label)
                # re-add spaces that were pulled out of the link.
                # Examples:
                #   text[[ title ]]text        -> text [[title]] text
                #   text[[ title | name ]]text -> text [[title|name]] text
                #   text[[ title |name]]text   -> text[[title|name]]text
                #   text[[title| name]]text    -> text [[title|name]]text
                if hadLeadingSpaces and not newline:
                    newLink = ' ' + newLink
                if hadTrailingSpaces:
                    newLink = newLink + ' '
                if newline:
                    newLink = newline + newLink
                return newLink
        # don't change anything
        return match.group()

    trailR = re.compile(self.site.linktrail())
    # The regular expression which finds links. Results consist of four
    # groups:
    # group title is the target page title, that is, everything before
    # | or ].
    # group section is the page section. It'll include the # to make life
    # easier for us.
    # group label is the alternative link title, that's everything between
    # | and ].
    # group linktrail is the link trail, that's letters after ]] which are
    # part of the word.
    # note that the definition of 'letter' varies from language to
    # language.
    linkR = re.compile(
        r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
        r'(\|(?P<label>[^\]\|]*))?\]\]'
        r'(?P<linktrail>' + self.site.linktrail() + ')')
    text = pywikibot.replaceExcept(text, linkR, handleOneLink,
                                   ['comment', 'math', 'nowiki', 'pre',
                                    'startspace'])
    return text