Beispiel #1
0
    def translateAndCapitalizeNamespaces(self, text):
        """
        Use localized namespace names.

        For every namespace of the site except the main one, wikilinks
        whose prefix is one of the other known spellings of that namespace
        are rewritten to use a single target spelling. On wikipedia:arz the
        text is returned unchanged.

        :param text: wikitext to clean up
        :return: wikitext with the namespace prefixes rewritten
        """
        # arz uses english stylish codes
        if self.site.sitename == 'wikipedia:arz':
            return text
        # wiki links aren't parsed here.
        exceptions = ['nowiki', 'comment', 'math', 'pre']

        for namespace in self.site.namespaces.values():
            if namespace == 0:
                # skip main (article) namespace
                continue
            # a clone is needed. Won't change the namespace dict
            # NOTE(review): list(namespace) presumably yields every name and
            # alias of the namespace with the preferred one first -- confirm.
            namespaces = list(namespace)
            # special cases for the file namespace (6) of Wikipedia sites
            if namespace == 6 and self.site.family.name == 'wikipedia':
                if self.site.code in ('en', 'fr') and MediaWikiVersion(
                        self.site.version()) >= MediaWikiVersion('1.14'):
                    # do not change "Image" on en-wiki and fr-wiki
                    assert u'Image' in namespaces
                    namespaces.remove(u'Image')
                if self.site.code == 'hu':
                    # do not change "Kép" on hu-wiki
                    assert u'Kép' in namespaces
                    namespaces.remove(u'Kép')
                elif self.site.code == 'pt':
                    # use "Imagem" by default on pt-wiki (per T57242)
                    # moving it to the front makes it the target spelling
                    assert 'Imagem' in namespaces
                    namespaces.insert(
                        0, namespaces.pop(namespaces.index('Imagem')))
            # final namespace variant: the first entry is the spelling all
            # other entries are replaced with
            final_ns = namespaces.pop(0)
            if namespace in (2, 3):
                # skip localized user namespace, maybe gender is used
                # (only the English prefixes are searched for)
                namespaces = ['User' if namespace == 2 else 'User talk']
            # lowerspaced and underscored namespaces: turn each remaining
            # name into a regex fragment that accepts a space or underscore
            # between words and either case for the first character
            for i, item in enumerate(namespaces):
                item = item.replace(' ', '[ _]')
                item = u'[%s%s]' % (item[0], item[0].lower()) + item[1:]
                namespaces[i] = item
            # also search for the target name as returned by first_lower()
            # (presumably with its first letter lowercased -- confirm)
            namespaces.append(first_lower(final_ns))
            if final_ns and namespaces:
                if self.site.sitename == 'wikipedia:pt' and namespace == 6:
                    # only change on these file extensions (per T57242)
                    extensions = ('png', 'gif', 'jpg', 'jpeg', 'svg', 'tiff',
                                  'tif')
                    text = textlib.replaceExcept(
                        text,
                        r'\[\[\s*({}) *:(?P<name>[^\|\]]*?\.({}))'
                        r'(?P<label>.*?)\]\]'
                        .format('|'.join(namespaces), '|'.join(extensions)),
                        r'[[{}:\g<name>\g<label>]]'.format(final_ns),
                        exceptions)
                else:
                    # general case: keep everything after the colon as is
                    text = textlib.replaceExcept(
                        text,
                        r'\[\[\s*(%s) *:(?P<nameAndLabel>.*?)\]\]'
                        % '|'.join(namespaces),
                        r'[[%s:\g<nameAndLabel>]]' % final_ns,
                        exceptions)
        return text
    def fixHtml(self, text):
        """Replace HTML markup with the equivalent wikitext markup."""
        def heading(found):
            """Build the wikitext heading for a matched <hN> element."""
            marks = '=' * int(found.group(1))
            return r'{0} {1} {0}'.format(marks, found.group(2))

        # Every pattern carries the inline (?i) flag, so tag names match in
        # any letter case. MediaWiki itself already rewrites <br> as <br />.
        skip = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']

        # bold and italic tags; only these passes hand over the site
        inline_markup = (
            (r'(?i)<(b|strong)>(.*?)</\1>', r"'''\2'''"),
            (r'(?i)<(i|em)>(.*?)</\1>', r"''\2''"),
        )
        for pattern, wikitext in inline_markup:
            text = textlib.replaceExcept(text, pattern, wikitext, skip,
                                         site=self.site)

        block_markup = (
            # an attribute-free <hr> that sits alone on its line
            (r'(?i)([\r\n])<hr[ /]*>([\r\n])', r'\1----\2'),
            # an <hr> with attributes has no wikitext form, so it is only
            # made XHTML compliant
            (r'(?i)<hr ([^>/]+?)>', r'<hr \1 />'),
            # a heading with nothing but spaces around it on its line
            (r'(?i)(?<=[\r\n]) *<h([1-7])> *([^<]+?) *</h\1> *(?=[\r\n])',
             heading),
        )
        for pattern, wikitext in block_markup:
            text = textlib.replaceExcept(text, pattern, wikitext, skip)
        # TODO: maybe we can make the bot replace <p> tags with \r\n's.
        return text
    def fixSyntaxSave(self, text):
        """Repair common mistakes in external link syntax."""
        skip = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
        # TODO: external links to the wiki being worked on are not turned
        # into wikilinks here. A rule for that has to leave difflinks and
        # titled links alone, to prevent edits like this:
        # https://de.wikipedia.org/w/index.php?title=Wikipedia%3aVandalismusmeldung&diff=103109563&oldid=103109271
        repairs = (
            # external link wrapped in double brackets
            (r'\[\[(?P<url>https?://[^\]]+?)\]\]',
             r'[\g<url>]'),
            # external link that only starts with a double bracket
            (r'\[\[(?P<url>https?://.+?)\]',
             r'[\g<url>]'),
            # URL and label separated by a pipe; the whitespace in front
            # of the pipe shows that the pipe is not part of the URL
            (r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
             r'[\g<url> \g<label>]'),
            # pipe directly after a URL whose end can be told from its file
            # extension; very unlikely to cause mistakes
            (r'\[(?P<url>https?://[^\|\] ]+?'
             r'(\.pdf|\.html|\.htm|\.php|\.asp|\.aspx|\.jsp)) *'
             r'\| *(?P<label>[^\|\]]+?)\]',
             r'[\g<url> \g<label>]'),
        )
        for broken, repaired in repairs:
            text = textlib.replaceExcept(text, broken, repaired, skip)
        return text
Beispiel #4
0
    def fixSyntaxSave(self, text):
        """Turn links to this wiki into wikilinks and fix external links."""
        def as_wikilink(found):
            """Build the wikilink for a matched link to this wiki."""
            target = found.group('link')
            caption = found.group('title')
            if caption:
                return '[[' + target + '|' + caption + ']]'
            return '[[' + target + ']]'

        skip = ['nowiki', 'comment', 'math', 'pre', 'source', 'startspace']
        # link to the wiki working on
        # Only use suffixes for article paths
        for suffix in self.site._interwiki_urls(True):
            http_url = self.site.base_url(suffix, 'http')
            https_url = (None if self.site.protocol() == 'http'
                         else self.site.base_url(suffix, 'https'))
            # When both URLs are equal apart from the protocol, one pattern
            # covers http, https and protocol-relative (//host/…) links.
            if https_url is not None and http_url[4:] == https_url[5:]:
                prefixes = ['(?:https?:)?' + re.escape(http_url[5:])]
            else:
                prefixes = [re.escape(candidate)
                            for candidate in (http_url, https_url)
                            if candidate is not None]
            # A wikilink cannot carry additional parameters, so the link
            # target must not contain the parameter separator(s).
            stop_chars = '?&' if '?' in suffix else '?'
            for prefix in prefixes:
                # The title has to start with a non-space so that trailing
                # spaces of a link without title are not taken as title.
                text = textlib.replaceExcept(
                    text,
                    r'\[\[?' + prefix + r'(?P<link>[^' + stop_chars + r']+?)'
                    r'(\s+(?P<title>[^\s].*?))?\s*\]\]?',
                    as_wikilink, skip, site=self.site)
        # external link in/starting with double brackets
        text = textlib.replaceExcept(
            text,
            r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
            r'[\g<url>]', skip, site=self.site)
        # URL and label separated by a pipe; the whitespace in front of the
        # pipe shows that the pipe is not a legitimate part of the URL
        text = textlib.replaceExcept(
            text,
            r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]',
            r'[\g<url> \g<label>]', skip)
        # pipe directly after a URL whose end can be detected from the file
        # extension; it is very unlikely that this will cause mistakes
        endings = '|'.join(r'\.' + ext
                           for ext in ('pdf', 'html?', 'php', 'aspx?', 'jsp'))
        text = textlib.replaceExcept(
            text,
            r'\[(?P<url>https?://[^\|\] ]+?(' + endings + r')) *'
            r'\| *(?P<label>[^\|\]]+?)\]',
            r'[\g<url> \g<label>]', skip)
        return text
Beispiel #5
0
    def test_replace_template(self):
        """Test replacing not inside templates."""
        samples = [
            (r'a {{templatename '
             r'    | accessdate={{Fecha|1993}} '
             r'    |atitle=The [[real title]] }}'),
            (r'a {{templatename '
             r'    | 1={{a}}2{{a}} '
             r'    | 2={{a}}1{{a}} }}'),
            (r'a {{templatename '
             r'    | 1={{{a}}}2{{{a}}} '
             r'    | 2={{{a}}}1{{{a}}} }}'),
        ]
        # sf.net bug 1575: unclosed template (last sample without its "}}")
        samples.append(samples[-1][:-2])

        # only the leading "a" lies outside of the template
        for sample in samples:
            replaced = textlib.replaceExcept(sample, 'a', 'X', ['template'],
                                             site=self.site)
            self.assertEqual(replaced, 'X' + sample[1:])
Beispiel #6
0
 def fixHtml(self, text):
     """Replace simple HTML markup with the equivalent wikitext markup.

     Handles bold and italic tags, horizontal rules and <h1> to <h6>
     headings that stand alone on their line.

     :param text: wikitext to process
     :return: the processed wikitext
     """
     def replace_header(match):
         """Create the wikitext header for a matched <hN> element."""
         equals = '=' * int(match.group(1))
         return '%s %s %s' % (equals, match.group(2), equals)

     # Everything case-insensitive (?i)
     # Keep in mind that MediaWiki automatically converts <br> to <br />
     exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                   'startspace']
     text = textlib.replaceExcept(text, r'(?i)<(b|strong)>(.*?)</\1>',
                                  r"'''\2'''", exceptions, site=self.site)
     text = textlib.replaceExcept(text, r'(?i)<(i|em)>(.*?)</\1>',
                                  r"''\2''", exceptions, site=self.site)
     # horizontal line without attributes in a single line
     text = textlib.replaceExcept(text, r'(?i)([\r\n])<hr[ /]*>([\r\n])',
                                  r'\1----\2', exceptions)
     # horizontal line with attributes; can't be done with wiki syntax
     # so we only make it XHTML compliant
     text = textlib.replaceExcept(text, r'(?i)<hr ([^>/]+?)>',
                                  r'<hr \1 />',
                                  exceptions)
     # a header where only spaces are in the same line
     # The surrounding line breaks are only asserted (lookbehind and
     # lookahead), not consumed. Consuming them left the second of two
     # directly adjacent headers of the same level unconverted, because
     # its leading line break had already been used up by the first match.
     text = textlib.replaceExcept(
         text,
         r'(?i)(?<=[\r\n]) *<h([1-6])> *([^<]+?) *</h\1> *(?=[\r\n])',
         replace_header,
         exceptions)
     # TODO: maybe we can make the bot replace <p> tags with \r\n's.
     return text
    def commonsfiledesc(self, text):
        """
        Clean up file descriptions on the Wikimedia Commons.

        It is working according to [1] and works only on pages in the file
        namespace on the Wikimedia Commons.

        [1]: https://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup

        :param text: wikitext of the page
        :return: the cleaned wikitext, or the unchanged text when the
            cleanup does not apply to this page
        """
        # NOTE(review): the summary above says the cleanup is meant for the
        # file namespace, yet this guard skips namespace 6. The condition is
        # kept as it was; confirm the intended namespace check.
        if self.site.sitename != 'commons:commons' or self.namespace == 6:
            # Hand the text back instead of None so that every path of this
            # method returns a string.
            return text
        # section headers to {{int:}} versions
        # (the trailing True makes each replacement case-insensitive)
        exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
                      'pre', 'source', 'ref', 'timeline']
        text = textlib.replaceExcept(text,
                                     r"([\r\n]|^)\=\= *Summary *\=\=",
                                     r"\1== {{int:filedesc}} ==",
                                     exceptions, True)
        text = textlib.replaceExcept(
            text,
            r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
            r"\1== {{int:license-header}} ==", exceptions, True)
        text = textlib.replaceExcept(
            text,
            r"([\r\n])\=\= *(Licensing|License information|{{int:license}}) *\=\=",
            r"\1== {{int:license-header}} ==", exceptions, True)

        # frequent field values to {{int:}} versions
        text = textlib.replaceExcept(
            text,
            r'([\r\n]\|[Ss]ource *\= *)'
            r'(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *([\r\n])',
            r'\1{{own}}\2', exceptions, True)
        text = textlib.replaceExcept(
            text,
            r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
            r'\1\2', exceptions, True)

        # added to transwikied pages
        text = textlib.replaceExcept(text, r'__NOTOC__', '', exceptions, True)

        # tracker element for js upload form
        # exceptions[1:] drops 'comment' because the tracker to remove is
        # itself an HTML comment
        text = textlib.replaceExcept(
            text,
            r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
            '', exceptions[1:], True)
        text = textlib.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
                                     '', exceptions, True)

        # duplicated section headers
        text = textlib.replaceExcept(
            text,
            r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *{{int:filedesc}} *\=\=',
            r'\1== {{int:filedesc}} ==', exceptions, True)
        text = textlib.replaceExcept(
            text,
            r'([\r\n]|^)\=\= *{{int:license-header}} *\=\=(?:[\r\n ]*)'
            r'\=\= *{{int:license-header}} *\=\=',
            r'\1== {{int:license-header}} ==', exceptions, True)
        return text
 def removeUselessSpaces(self, text):
     """Cleanup multiple or trailing spaces.

     Runs of spaces are collapsed to a single space and a space at the end
     of a line is removed. Comments, math, nowiki, pre, lines starting
     with a space, tables and templates are left alone.

     :param text: wikitext to process
     :return: the processed wikitext
     """
     multipleSpacesR = re.compile('  +')
     # re.MULTILINE lets "$" match at the end of every line; without it
     # only a space at the very end of the text was removed.
     spaceAtLineEndR = re.compile(' $', re.MULTILINE)
     exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace', 'table',
                   'template']
     text = textlib.replaceExcept(text, multipleSpacesR, ' ', exceptions)
     text = textlib.replaceExcept(text, spaceAtLineEndR, '', exceptions)
     return text
 def test_replace_source_reference(self):
     """Test replacing in text which contains back references."""
     # The source strings deliberately hold reference syntax that is not
     # valid for the pattern, in case the replacement wrongly tries to
     # resolve it as a reference.
     cases = (
         (r"\42", r"^(.*)$", r"X\1X", r"X\42X"),
         (r"\g<bar>", r"^(?P<foo>.*)$", r"X\g<foo>X", r"X\g<bar>X"),
     )
     for source, pattern, template, wanted in cases:
         replaced = textlib.replaceExcept(source, pattern, template, [],
                                          site=self.site)
         self.assertEqual(replaced, wanted)
Beispiel #10
0
 def test_replace_exception(self):
     """Test replacing not inside a specific regex."""
     cases = (
         # no exception: both occurrences are replaced
         ([], '000x000'),
         # the second "123" follows a word character and is protected
         ([re.compile(r'\w123')], '000x123'),
     )
     for protected, wanted in cases:
         replaced = textlib.replaceExcept('123x123', '123', '000', protected,
                                          site=self.site)
         self.assertEqual(replaced, wanted)
Beispiel #11
0
 def test_overlapping_replace(self):
     """Test replacing with and without overlap."""
     for overlap, wanted in ((False, '2121'), (True, '2221')):
         replaced = textlib.replaceExcept('1111', '11', '21', [],
                                          allowoverlap=overlap,
                                          site=self.site)
         self.assertEqual(replaced, wanted)
Beispiel #12
0
 def test_replace_with_marker(self):
     """Test replacing with a marker."""
     cases = (
         # pattern found: marker expected right after the last replacement
         ('x', 'Ayyy.B'),
         # pattern not found: marker expected at the end of the text
         ('1', 'AxyxB.'),
     )
     for needle, wanted in cases:
         replaced = textlib.replaceExcept('AxyxB', needle, 'y', [],
                                          marker='.', site=self.site)
         self.assertEqual(replaced, wanted)
Beispiel #13
0
 def test_replace_exception(self):
     """Test replacing not inside a specific regex."""
     unprotected = textlib.replaceExcept('123x123', '123', '000', [],
                                         site=self.site)
     self.assertEqual(unprotected, '000x000')

     # a "123" that follows a word character must stay untouched
     after_word_char = re.compile(r'\w123')
     protected = textlib.replaceExcept('123x123', '123', '000',
                                       [after_word_char], site=self.site)
     self.assertEqual(protected, '000x123')
Beispiel #14
0
 def test_overlapping_replace(self):
     """Test replacing with and without overlap."""
     without_overlap = textlib.replaceExcept('1111', '11', '21', [],
                                             allowoverlap=False,
                                             site=self.site)
     self.assertEqual(without_overlap, '2121')

     with_overlap = textlib.replaceExcept('1111', '11', '21', [],
                                          allowoverlap=True,
                                          site=self.site)
     self.assertEqual(with_overlap, '2221')
Beispiel #15
0
 def test_simple_replace(self):
     """Test replacing without regex."""
     cases = (('AxB', 'AyB'), ('AxxB', 'AyyB'), ('AxyxB', 'AyyyB'))
     for source, wanted in cases:
         replaced = textlib.replaceExcept(source, 'x', 'y', [],
                                          site=self.site)
         self.assertEqual(replaced, wanted)
    def test_replace_tags_interwiki(self):
        """Test replacing not inside interwiki links."""
        family_codes = self.site.family.langs
        if "es" not in family_codes or "ey" in family_codes:
            raise unittest.SkipTest("family %s doesnt have languages" % self.site)

        cases = (
            # "es" is a valid interwiki code
            ("[[es:s]]", "s", "t", "[[es:s]]"),
            # "ex" is not a valid interwiki code
            ("[[ex:x]]", "x", "y", "[[ey:y]]"),
        )
        for source, needle, substitute, wanted in cases:
            replaced = textlib.replaceExcept(source, needle, substitute,
                                             ["interwiki"], site=self.site)
            self.assertEqual(replaced, wanted)
Beispiel #17
0
 def removeUselessSpaces(self, text):
     """Cleanup multiple or trailing spaces.

     Runs of spaces are collapsed to a single space and a space at the end
     of a line is removed. Templates are skipped as well, except on
     wikipedia:cs.

     :param text: wikitext to process
     :return: the processed wikitext
     """
     multipleSpacesR = re.compile('  +')
     # re.MULTILINE lets "$" match at the end of every line; without it
     # only a space at the very end of the text was removed.
     spaceAtLineEndR = re.compile(' $', re.MULTILINE)
     exceptions = ['comment', 'math', 'nowiki', 'pre', 'startspace', 'table']
     if self.site.sitename != 'wikipedia:cs':
         exceptions.append('template')
     text = textlib.replaceExcept(text, multipleSpacesR, ' ', exceptions)
     text = textlib.replaceExcept(text, spaceAtLineEndR, '', exceptions)
     return text
Beispiel #18
0
 def test_simple_replace(self):
     """Test replacing without regex."""
     expected_by_source = (
         ('AxB', 'AyB'),
         ('AxxB', 'AyyB'),
         ('AxyxB', 'AyyyB'),
     )
     for source, wanted in expected_by_source:
         self.assertEqual(
             textlib.replaceExcept(source, 'x', 'y', [], site=self.site),
             wanted)
Beispiel #19
0
    def test_replace_tags_interwiki(self):
        """Test replacing not inside interwiki links."""
        codes = self.site.family.langs
        if 'es' not in codes or 'ey' in codes:
            raise unittest.SkipTest('family %s doesnt have languages'
                                    % self.site)

        # "es" is a valid interwiki code
        valid_code = textlib.replaceExcept('[[es:s]]', 's', 't',
                                           ['interwiki'], site=self.site)
        self.assertEqual(valid_code, '[[es:s]]')

        # "ex" is not a valid interwiki code
        invalid_code = textlib.replaceExcept('[[ex:x]]', 'x', 'y',
                                             ['interwiki'], site=self.site)
        self.assertEqual(invalid_code, '[[ey:y]]')
    def fixReferences(self, text):
        """Fix references tags.

        Normalizes the spacing of the ``name`` attribute, removes empty
        reference tags and turns empty named references into self-closing
        tags.

        :param text: wikitext to process
        :return: the processed wikitext
        """
        # See also https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
        exceptions = ["nowiki", "comment", "math", "pre", "source", "startspace"]

        # it should be name = " or name=" NOT name   ="
        # Goes through replaceExcept like the other fixes so that text inside
        # the excepted regions (nowiki, comments, pre, ...) is not rewritten.
        text = textlib.replaceExcept(text, r'(?i)<ref +name(= *| *=)"', r'<ref name="', exceptions)
        # remove empty <ref/>-tag
        text = textlib.replaceExcept(text, r"(?i)(<ref\s*/>|<ref *>\s*</ref>)", r"", exceptions)
        # a named reference without content becomes a self-closing tag
        text = textlib.replaceExcept(text, r"(?i)<ref\s+([^>]+?)\s*>\s*</ref>", r"<ref \1/>", exceptions)
        return text
    def fixReferences(self, text):
        """Fix references tags.

        Normalizes the spacing of the ``name`` attribute, removes empty
        reference tags and turns empty named references into self-closing
        tags.

        :param text: wikitext to process
        :return: the processed wikitext
        """
        # See also https://en.wikipedia.org/wiki/User:AnomieBOT/source/tasks/OrphanReferenceFixer.pm
        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                      'startspace']

        # it should be name = " or name=" NOT name   ="
        # Goes through replaceExcept like the other fixes so that text inside
        # the excepted regions (nowiki, comments, pre, ...) is not rewritten.
        text = textlib.replaceExcept(text,
                                     r'(?i)<ref +name(= *| *=)"',
                                     r'<ref name="', exceptions)
        # remove empty <ref/>-tag
        text = textlib.replaceExcept(text,
                                     r'(?i)(<ref\s*/>|<ref *>\s*</ref>)',
                                     r'', exceptions)
        # a named reference without content becomes a self-closing tag
        text = textlib.replaceExcept(text,
                                     r'(?i)<ref\s+([^>]+?)\s*>\s*</ref>',
                                     r'<ref \1/>', exceptions)
        return text
 def removeUselessSpaces(self, text):
     """Collapse runs of spaces and strip spaces at line ends."""
     skip = ["comment", "math", "nowiki", "pre", "startspace", "table"]
     # templates are only processed on wikipedia:cs
     if self.site.sitename != "wikipedia:cs":
         skip.append("template")
     # one or more spaces followed by a space or by a line end are dropped
     return textlib.replaceExcept(text, r"(?m) +( |$)", r"\1", skip,
                                  site=self.site)
 def translateMagicWords(self, text):
     """Use localized magic words.

     Inside links that look like file links, every alias of the image
     magic words is replaced by the first alias the site reports for it.
     Nothing is done on sites with the code "arz" or "ru".

     :param text: wikitext to process
     :return: the processed wikitext
     """
     # local import keeps this method self-contained
     import re

     # not wanted at ru
     # arz uses english stylish codes
     if self.site.code in ["arz", "ru"]:
         return text

     exceptions = ["nowiki", "comment", "math", "pre"]
     for magicWord in [
         "img_thumbnail",
         "img_left",
         "img_center",
         "img_right",
         "img_none",
         "img_framed",
         "img_frameless",
         "img_border",
         "img_upright",
     ]:
         aliases = self.site.getmagicwords(magicWord)
         if not aliases:
             continue
         # Aliases are plain text, so they are escaped before being spliced
         # into the pattern; otherwise characters such as "$" or "." in an
         # alias would be read as regex syntax.
         alternatives = "|".join(re.escape(alias) for alias in aliases)
         text = textlib.replaceExcept(
             text,
             r"\[\[(?P<left>.+?:.+?\..+?\|) *(" + alternatives + r") *(?P<right>(\|.*?)?\]\])",
             r"[[\g<left>" + aliases[0] + r"\g<right>",
             exceptions,
         )
     return text
    def putSpacesInLists(self, text):
        """
        Put a space between the list markers (* or #) and the item text.

        NOTE: The syntax help of the English, German, and French Wikipedia
        recommends this space. Other wikis might not want it. If there are
        any complaints, please file a bug report.
        """
        # nothing is changed when self.template is set
        if self.template:
            return text

        skip = ["comment", "math", "nowiki", "pre", "source", "template",
                "timeline", self.site.redirectRegex()]
        # a run of list markers at the start of a line, directly followed
        # by a character that is neither whitespace nor a marker itself
        unspaced_item = (r"(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)"
                         r"(?P<char>[^\s\*#:;].+?)")
        return textlib.replaceExcept(text, unspaced_item,
                                     r"\g<bullet> \g<char>", skip)
Beispiel #25
0
 def fixTypo(self, text):
     """Fix the notation of some units.

     Rewrites "<number> ccm" as "<number>&nbsp;cm³" and normalizes the
     degree sign in front of C and F.

     :param text: wikitext to process
     :return: the processed wikitext
     """
     exceptions = ['nowiki', 'comment', 'math', 'pre', 'source',
                   'startspace', 'gallery', 'hyperlink', 'interwiki', 'link']
     # change <number> ccm -> <number> cm³
     text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?ccm',
                                  r'\1&nbsp;cm³', exceptions,
                                  site=self.site)
     # Solve wrong Nº sign with °C or °F
     # additional exception requested on fr-wiki for this stuff
     pattern = re.compile(u'«.*?»', re.UNICODE)
     exceptions.append(pattern)
     text = textlib.replaceExcept(text, r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
                                  r'\1&nbsp;°\2', exceptions, site=self.site)
     # The site is handed over here as well: 'interwiki' is among the
     # exceptions, so this pass should use the same site as the two above.
     text = textlib.replaceExcept(text, u'º([CF])', u'°' + r'\1',
                                  exceptions, site=self.site)
     return text
 def apply(self, text, page):
     """Pass every [[...]] link of text through self.replacement.

     A link may be wrapped in ''' markers; the match and the title of
     page are handed to self.replacement.
     """
     # own exceptions plus three more, but never 'startspace'
     skipped = set(self.exceptions + ['imagemap', 'includeonly', 'timeline'])
     skipped.discard('startspace')
     page_title = page.title()

     def rewrite(found):
         return self.replacement(found, page_title)

     link = r"(?P<before>''')?\[\[(?P<inside>[^]]+)\]\](?P<after>''')?"
     return textlib.replaceExcept(text, link, rewrite, list(skipped),
                                  site=page.site)
    def __iter__(self):
        """Yield a page for every parsed entry that a replacement changes."""
        try:
            for entry in self.parser:
                if self.skipping:
                    # still looking for the entry to start from
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if (self.isTitleExcepted(entry.title)
                        or self.isTextExcepted(entry.text)):
                    continue
                # The replaced text is thrown away afterwards; it only
                # serves to find out whether any replacement applies.
                changed_text = entry.text
                for fix in self.replacements:
                    changed_text = textlib.replaceExcept(
                        changed_text,
                        fix.old_regex,
                        fix.new,
                        self.excsInside + fix.get_inside_exceptions(),
                        site=self.site,
                    )
                if changed_text == entry.text:
                    continue
                yield pywikibot.Page(self.site, entry.title)

        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    pywikibot.output('To resume, use "-xmlstart:%s" on the command line.' % entry.title)
            except NameError:
                # "entry" is not bound when no entry has been read yet
                pass
Beispiel #28
0
    def translateMagicWords(self, text):
        """Use localized magic words.

        In file links, every parameter that is an alias of one of the image
        magic words is replaced by the first alias the site reports for
        that magic word. Nothing is done on sites with the code "arz",
        "en" or "ru".

        :param text: wikitext to process
        :return: the processed wikitext
        """
        # not wanted at ru
        # arz uses english stylish codes
        # no need to run on English wikis
        if self.site.code in ['arz', 'en', 'ru']:
            return text

        def replace_magicword(match):
            """Return the matched file link with preferred magic words."""
            split = match.group().split('|')
            # push ']]' out and re-add below
            split[-1] = split[-1][:-2]
            for magicword in ['img_thumbnail', 'img_left', 'img_center',
                              'img_right', 'img_none', 'img_framed',
                              'img_frameless', 'img_border', 'img_upright',
                              ]:
                aliases = list(self.site.getmagicwords(magicword) or ())
                # Without a preferred form plus at least one other alias
                # there is nothing to replace; this also avoids popping
                # from an empty list for a magic word without aliases.
                if len(aliases) < 2:
                    continue
                preferred = aliases.pop(0)
                # split[0] is the link target and is never touched
                split[1:] = [preferred if part.strip() in aliases else part
                             for part in split[1:]]
            return '|'.join(split) + ']]'

        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source']
        regex = re.compile(
            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
            flags=re.X)
        return textlib.replaceExcept(text, regex, replace_magicword,
                                     exceptions)
Beispiel #29
0
 def test_case_sensitive(self):
     """Test replacing with different case sensitivity."""
     cases = (
         # (pattern, caseInsensitive, expected result)
         ('x', False, 'AyB'),
         ('X', False, 'AxB'),
         ('x', True, 'AyB'),
         ('X', True, 'AyB'),
     )
     for needle, ignore_case, wanted in cases:
         replaced = textlib.replaceExcept('AxB', needle, 'y', [],
                                          caseInsensitive=ignore_case,
                                          site=self.site)
         self.assertEqual(replaced, wanted)
 def removeUselessSpaces(self, text):
     """Collapse runs of spaces and strip spaces at line ends."""
     protected = ['comment', 'math', 'nowiki', 'pre', 'startspace', 'table']
     on_cs_wikipedia = self.site.sitename == 'wikipedia:cs'
     if not on_cs_wikipedia:
         # everywhere else templates are left alone, too
         protected.append('template')
     cleaned = textlib.replaceExcept(text, r'(?m) +( |$)', r'\1', protected,
                                     site=self.site)
     return cleaned
Beispiel #31
0
 def removeUselessSpaces(self, text):
     """Collapse runs of blanks and strip blanks at line ends."""
     skip = ['comment', 'math', 'nowiki', 'pre', 'syntaxhighlight',
             'startspace', 'table']
     # templates are only processed on wikipedia:cs
     if self.site.sitename != 'wikipedia:cs':
         skip.append('template')
     # tabs or spaces followed by a space or by a line end are dropped
     blanks = r'(?m)[\t ]+( |$)'
     return textlib.replaceExcept(text, blanks, r'\1', skip, site=self.site)
Beispiel #32
0
 def fixTypo(self, text):
     """Fix the notation of some units.

     Rewrites "<number> ccm" as "<number>&nbsp;cm³" and normalizes the
     degree sign in front of C and F.

     :param text: wikitext to process
     :return: the processed wikitext
     """
     exceptions = [
         'nowiki', 'comment', 'math', 'pre', 'source', 'startspace',
         'gallery', 'hyperlink', 'interwiki', 'link'
     ]
     # change <number> ccm -> <number> cm³
     text = textlib.replaceExcept(text,
                                  r'(\d)\s*(?:&nbsp;)?ccm',
                                  r'\1&nbsp;cm³',
                                  exceptions,
                                  site=self.site)
     # Solve wrong Nº sign with °C or °F
     # additional exception requested on fr-wiki for this stuff
     pattern = re.compile(u'«.*?»', re.UNICODE)
     exceptions.append(pattern)
     text = textlib.replaceExcept(text,
                                  r'(\d)\s*(?:&nbsp;)?[º°]([CF])',
                                  r'\1&nbsp;°\2',
                                  exceptions,
                                  site=self.site)
     # The site is handed over here as well: 'interwiki' is among the
     # exceptions, so this pass should use the same site as the two above.
     text = textlib.replaceExcept(text,
                                  u'º([CF])',
                                  u'°' + r'\1',
                                  exceptions,
                                  site=self.site)
     return text
Beispiel #33
0
    def cleanUpSectionHeaders(self, text):
        """
        Put a space between the equal signs and the section title.

        Example: ==Section title== becomes == Section title ==

        NOTE: The syntax help of the English and German Wikipedia
        recommends this space. Other wikis might not want it. If there are
        any complaints, please file a bug report.
        """
        skip = ['comment', 'math', 'nowiki', 'pre']
        # a whole header line, including its line break
        header_line = r'(?m)^(={1,7}) *(?P<title>[^=]+?) *\1 *\r?\n'
        # the matched line break is replaced by config.LS
        spaced_header = r'\1 \g<title> \1%s' % config.LS
        return textlib.replaceExcept(text, header_line, spaced_header, skip)
Beispiel #34
0
    def fixTypo(self, text: str) -> str:
        """Correct unit notation: "ccm" and the sign in front of C/F."""
        skip = [
            'comment',
            'gallery',
            'hyperlink',
            'interwiki',
            'link',
            'nowiki',
            'math',
            'pre',
            'startspace',
            'syntaxhighlight',
        ]  # type: List[Union[str, Pattern[str]]]

        # <number> ccm  ->  <number>&nbsp;cm³
        text = textlib.replaceExcept(
            text, r'(\d)\s*(?:&nbsp;)?ccm', r'\1&nbsp;cm³', skip,
            site=self.site)

        # The degree fixes additionally leave text between guillemets
        # alone (exception requested on fr-wiki).
        skip.append(re.compile('«.*?»'))
        degree_fixes = (
            # wrong Nº sign (or unspaced degree sign) after a number
            (r'(\d)\s*(?:&nbsp;)?[º°]([CF])', r'\1&nbsp;°\2'),
            # any remaining Nº sign directly in front of C or F
            ('º([CF])', '°' + r'\1'),
        )
        for wrong, right in degree_fixes:
            text = textlib.replaceExcept(text, wrong, right, skip,
                                         site=self.site)
        return text
Beispiel #35
0
    def test_replace_template(self):
        """Check that text inside templates is not replaced.

        Every sample starts with a lone 'a' outside the template; only that
        one may become 'X' when 'template' is given as exception.
        """
        nested_template = (r'a {{templatename '
                           r'    | accessdate={{Fecha|1993}} '
                           r'    |atitle=The [[real title]] }}')
        repeated_templates = (r'a {{templatename '
                              r'    | 1={{a}}2{{a}} '
                              r'    | 2={{a}}1{{a}} }}')
        triple_braces = (r'a {{templatename '
                         r'    | 1={{{a}}}2{{{a}}} '
                         r'    | 2={{{a}}}1{{{a}}} }}')
        # sf.net bug 1575: unclosed template
        unclosed_template = triple_braces[:-2]

        samples = (nested_template, repeated_templates, triple_braces,
                   unclosed_template)
        for sample in samples:
            replaced = textlib.replaceExcept(sample, 'a', 'X', ['template'],
                                             site=self.site)
            self.assertEqual(replaced, 'X' + sample[1:])
Beispiel #36
0
 def apply(self, text, replaced=None):
     """Apply this typo rule to the text and time the replacement.

     The duration of the slowest run so far is kept in ``self.longest``; a
     warning is issued when a single run takes more than five seconds.

     @param text: text to apply the rule to
     @param replaced: list handed to ``self.summary_hook`` together with
         every match; a new list is created when it is omitted
     @return: the text after the replacement
     """
     if replaced is None:
         # A list literal as default would be created once and then be
         # shared (and grown) by every call that omits the argument.
         replaced = []

     def hook(match):
         return self.summary_hook(match, replaced)

     # time.clock() was removed in Python 3.8; perf_counter() is the
     # documented replacement for measuring elapsed time.
     start = time.perf_counter()
     text = textlib.replaceExcept(text,
                                  self.find,
                                  hook,
                                  self.exceptions,
                                  site=self.site)
     delta = time.perf_counter() - start
     self.longest = max(delta, self.longest)
     if delta > 5:
         pywikibot.warning('Slow typo rule "%s" (%f)' %
                           (self.find.pattern, delta))
     return text
Beispiel #37
0
    def apply_replacements(self, original_text, applied, page=None):
        """
        Run every configured replacement over the text, in order.

        A replacement is skipped when the page title is excepted for it; if
        it belongs to a container, the container name is remembered and all
        later replacements of that container are skipped too. Every
        replacement that changed the text is added to C{applied}.

        @param original_text: text to work on
        @param applied: collection updated in place via its add() method
        @param page: page the text belongs to; a DeprecationWarning is
            issued when it is omitted and no title check is done then
        @return: the text after all replacements
        """
        if page is None:
            pywikibot.warn(
                'You must pass the target page as the "page" parameter to '
                'apply_replacements().',
                DeprecationWarning,
                stacklevel=2)
        inside_exceptions = _get_text_exceptions(self.exceptions)
        ignored_fixes = set()
        current = original_text
        for repl in self.replacements:
            if self.sleep is not None:
                time.sleep(self.sleep)

            fix = repl.container
            if fix and fix.name in ignored_fixes:
                continue

            if page is not None and self.isTitleExcepted(page.title(),
                                                         repl.exceptions):
                if fix:
                    message = ('Skipping fix "{0}" on {1} because the title '
                               'is on the exceptions list.'.format(
                                   fix.name, page.title(as_link=True)))
                    pywikibot.output(message)
                    ignored_fixes.add(fix.name)
                else:
                    message = ('Skipping unnamed replacement ({0}) on {1} '
                               'because the title is on the exceptions list.'
                               .format(repl.description,
                                       page.title(as_link=True)))
                    pywikibot.output(message)
                continue

            before = current
            current = textlib.replaceExcept(
                current,
                repl.old_regex,
                repl.new,
                inside_exceptions + repl.get_inside_exceptions(),
                allowoverlap=self.allowoverlap,
                site=self.site)
            if before != current:
                applied.add(repl)

        return current
Beispiel #38
0
 def treat_page(self):
     """Rewrite ISBN/PMID/RFC occurrences of the current page and save it.

     The page text is processed section by section; for every identifier
     whose option is set, that option value is used as the replacement.
     """
     self.check_disabled()
     converted = []
     for chunk in split_into_sections(self.current_page.text):
         for key in ('ISBN', 'PMID', 'RFC'):
             if not self.getOption(key):
                 # this identifier is not enabled
                 continue
             chunk = replaceExcept(
                 chunk,
                 _regexes[key],
                 self.getOption(key),
                 self.replace_exceptions,
                 site=self.site,
             )
         converted.append(chunk)
     self.put_current(''.join(converted), summary=self.getOption('summary'))
Beispiel #39
0
    def cleanUpSectionHeaders(self, text):
        """
        Put one space between the equal signs and the heading title.

        Example: ==Section title== becomes == Section title ==

        NOTE: This spacing is recommended in the syntax help on the English
        and German Wikipedia. It is not wanted on Lojban and English
        Wiktionary (T168399, T169064), where the text is returned unchanged.
        Other wikis may not want it either; please file a bug report if
        there are complaints.
        """
        opted_out = ['wiktionary:jbo', 'wiktionary:en']
        if self.site.sitename in opted_out:
            return text

        heading = (r'(?m)^(={1,6})[ \t]*(?P<title>.*[^\s=])'
                   r'[ \t]*\1[ \t]*\r?\n')
        protected = ['comment', 'math', 'nowiki', 'pre']
        return textlib.replaceExcept(
            text, heading, r'\1 \g<title> \1\n', protected)
Beispiel #40
0
def usersToCheck():
    """Return the user names found on the opt-in page.

    The page titled wpOptInList is fetched from the default site and
    scanned with wpOptInListRegEx. In each 'username' group, underscores
    become spaces and the first character is capitalized.
    """
    opt_in_page = pywikibot.Page(pywikibot.Site(), wpOptInList)
    raw_text = opt_in_page.get()

    matcher = re.compile(wpOptInListRegEx, re.UNICODE)
    names = []
    for hit in matcher.finditer(raw_text):
        # "_" is the same as " " for Wikipedia URLs
        name = textlib.replaceExcept(hit.group('username'), u"_", u" ",
                                     [])
        # Only the first character is touched; slicing keeps an empty
        # name empty instead of raising.
        names.append(name[:1].capitalize() + name[1:])
    return names
Beispiel #41
0
    def putSpacesInLists(self, text):
        """
        Insert a space between a list bullet (* or #) and the item text.

        Nothing is changed when self.template is set.

        NOTE: This spacing is recommended in the syntax help on the English,
        German, and French Wikipedia. Other wikis may not want it; please
        file a bug report if there are complaints.
        """
        if self.template:
            return text

        protected = ['comment', 'math', 'nowiki', 'pre', 'source',
                     'template', 'timeline', self.site.redirectRegex()]
        bullet_and_text = (r'(?m)^(?P<bullet>[:;]*(\*+|#+)[:;\*#]*)'
                           r'(?P<char>[^\s\*#:;].+?)')
        return textlib.replaceExcept(
            text, bullet_and_text, r'\g<bullet> \g<char>', protected)
Beispiel #42
0
 def doReplacements(self, original_text):
     """
     Return the text produced by applying all replacements in order.

     The 'inside-tags' and 'inside' entries of self.exceptions, where
     present, are combined and passed to textlib.replaceExcept. If
     self.sleep is set, the bot sleeps that long before each replacement.
     """
     skip = []
     for key in ('inside-tags', 'inside'):
         if key in self.exceptions:
             skip += self.exceptions[key]

     result = original_text
     for pattern, substitute in self.replacements:
         if self.sleep is not None:
             time.sleep(self.sleep)
         result = textlib.replaceExcept(result, pattern, substitute, skip,
                                        allowoverlap=self.allowoverlap,
                                        site=self.site)
     return result
Beispiel #43
0
 def replaceDeprecatedTemplates(self, text):
     """Replace deprecated templates by their successors.

     The (old, new) pairs are read from
     deprecatedTemplates[family name][site code]; a pair whose new name is
     None removes the template call altogether. An optional "msg:" prefix
     and the template parameters are matched along with the old name, but
     the parameters are not carried over to the replacement.

     @param text: wikitext to process
     @return: the processed wikitext (unchanged when nothing is configured
         for this site)
     """
     exceptions = ['comment', 'math', 'nowiki', 'pre']
     if self.site.family.name in deprecatedTemplates and \
        self.site.code in deprecatedTemplates[self.site.family.name]:
         for template in deprecatedTemplates[
                 self.site.family.name][self.site.code]:
             old = template[0]
             new = template[1]
             if new is None:
                 new = ''
             else:
                 new = '{{%s}}' % new
             if self.site.namespaces[10].case == 'first-letter':
                 # accept the first letter in either case
                 old = '[' + old[0].upper() + old[0].lower() + ']' + old[1:]
             text = textlib.replaceExcept(
                 text,
                 r'\{\{([mM][sS][gG]:)?%s(?P<parameters>\|[^}]+|)}}' % old,
                 new, exceptions)
     # Hand the result back; without this the method returned None and
     # every replacement above was lost.
     return text
Beispiel #44
0
 def translateMagicWords(self, text):
     """Replace image option aliases by the first alias the site reports.

     Nothing is done on 'ru' (not wanted there) and on 'arz' (uses english
     stylish codes).
     """
     if self.site.code in ['arz', 'ru']:
         return text

     protected = ['nowiki', 'comment', 'math', 'pre']
     image_options = ['img_thumbnail', 'img_left', 'img_center',
                      'img_right', 'img_none', 'img_framed',
                      'img_frameless', 'img_border', 'img_upright', ]
     for option in image_options:
         aliases = self.site.getmagicwords(option)
         if not aliases:
             continue
         any_alias = (r'\[\[(?P<left>.+?:.+?\..+?\|) *(' + '|'.join(aliases) +
                      r') *(?P<right>(\|.*?)?\]\])')
         preferred = r'[[\g<left>' + aliases[0] + r'\g<right>'
         text = textlib.replaceExcept(text, any_alias, preferred, protected)
     return text
Beispiel #45
0
    def translateMagicWords(self, text: str) -> str:
        """Replace image option aliases by the site's first alias.

        Skipped on 'ru' (not wanted), 'arz' (uses English stylish codes) and
        'en' (no need to run on English wikis). The alias table is built
        lazily, on the first file link that has at least one option.
        """
        if self.site.code in ['arz', 'en', 'ru']:
            return text

        # alias -> preferred alias; the key False marks "nothing to replace"
        cache = {}  # type: Dict[Union[bool, str], Any]

        def init_cache() -> None:
            magicwords = ('img_thumbnail', 'img_left', 'img_center',
                          'img_right', 'img_none', 'img_framed',
                          'img_frameless', 'img_border', 'img_upright',
                          'img_baseline', 'img_sub', 'img_super',
                          'img_top', 'img_text_top', 'img_middle',
                          'img_bottom', 'img_text_bottom')
            for magicword in magicwords:
                aliases = self.site.getmagicwords(magicword)
                if len(aliases) <= 1:
                    continue
                for alias in aliases[1:]:
                    # aliases containing the '$1' placeholder are left out
                    if '$1' not in alias:
                        cache[alias] = aliases[0]
            if not cache:
                cache[False] = True  # signal there is nothing to replace

        def replace_magicword(match: Match[str]) -> str:
            whole = match.group()
            if cache.get(False):
                return whole
            parts = whole.split('|')
            if len(parts) == 1:
                # no options at all
                return whole

            if not cache:
                init_cache()

            head, *options = parts
            # push ']]' out and re-add below
            options[-1] = options[-1][:-2]
            translated = [cache.get(opt.strip(), opt) for opt in options]
            return '{}|{}]]'.format(head, '|'.join(translated))

        exceptions = ['comment', 'nowiki', 'pre', 'syntaxhighlight']
        file_link = re.compile(
            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
            flags=re.X)
        return textlib.replaceExcept(
            text, file_link, replace_magicword, exceptions)
Beispiel #46
0
    def translateAndCapitalizeNamespaces(self, text):
        """Rewrite namespace prefixes of wikilinks to the first local name.

        For every namespace except main (0), user (2) and user talk (3) the
        alternative names, in either first-letter case and with space or
        underscore, are replaced by the first name the site reports.
        """
        # arz uses english stylish codes
        if self.site.sitename() == 'wikipedia:arz':
            return text
        family = self.site.family
        # wiki links aren't parsed here.
        exceptions = ['nowiki', 'comment', 'math', 'pre']

        for nsNumber in self.site.namespaces():
            # skip main (article) namespace
            # skip user namespaces, maybe gender is used
            if nsNumber in (0, 2, 3):
                continue
            # work on a copy so the site's namespace data stays untouched
            aliases = list(self.site.namespace(nsNumber, all=True))
            thisNs = aliases.pop(0)
            if nsNumber == 6 and family.name == 'wikipedia':
                recent_enough = None
                if self.site.code in ('en', 'fr'):
                    recent_enough = (MediaWikiVersion(self.site.version())
                                     >= MediaWikiVersion('1.14'))
                if recent_enough:
                    # do not change "Image" on en-wiki and fr-wiki
                    assert u'Image' in aliases
                    aliases.remove(u'Image')
                if self.site.code == 'hu':
                    # do not change "Kép" on hu-wiki
                    assert u'Kép' in aliases
                    aliases.remove(u'Kép')
                elif self.site.code == 'pt':
                    # bug 55242 should be implemented
                    continue
            # accept both cases of the first letter, and space or underscore
            variants = []
            for alias in aliases:
                alias = alias.replace(' ', '[ _]')
                variants.append(
                    u'[%s%s]' % (alias[0], alias[0].lower()) + alias[1:])
            variants.append(thisNs[0].lower() + thisNs[1:])
            if thisNs and variants:
                text = textlib.replaceExcept(
                    text,
                    r'\[\[\s*(%s) *:(?P<nameAndLabel>.*?)\]\]'
                    % '|'.join(variants),
                    r'[[%s:\g<nameAndLabel>]]' % thisNs,
                    exceptions)
        return text
Beispiel #47
0
 def treat_page(self) -> None:
     """Swap the 'old_cat' category link of the current page for 'new_cats'.

     The page is only logged and left alone when no link to the old
     category is found. With exactly one new category that is not yet on
     the page, the existing link is retitled; otherwise the missing new
     categories are inserted after the old link and the old link is removed.
     """
     found_cats = []
     old_link = None
     parsed = mwparserfromhell.parse(self.current_page.text,
                                     skip_style_tags=True)
     for wikilink in parsed.ifilter_wikilinks():
         # links written with a leading colon are ignored
         if wikilink.title.strip().startswith(':'):
             continue
         try:
             category = pywikibot.Category(
                 pywikibot.Page(self.site, str(wikilink.title)))
         except (ValueError, pywikibot.Error):
             # not usable as a category link
             continue
         found_cats.append(category)
         if category == self.getOption('old_cat'):
             old_link = wikilink

     if not old_link:
         pywikibot.log('Did not find {} in {}.'.format(
             self.getOption('old_cat'), self.current_page))
         return

     replacements = self.getOption('new_cats')
     if len(replacements) != 1 or replacements[0] in found_cats:
         for category in replacements:
             if category not in found_cats:
                 parsed.insert_after(old_link, '\n' + category.aslink())
         # drop the old link together with a directly preceding newline
         old_link_regex = re.compile(r'\n?' + re.escape(str(old_link)),
                                     re.M)
         new_text = replaceExcept(str(parsed),
                                  old_link_regex,
                                  '',
                                  EXCEPTIONS,
                                  site=self.site)
     else:
         # Update the title to keep the sort key.
         old_link.title = replacements[0].title()
         new_text = str(parsed)

     self.put_current(
         new_text,
         summary=self.getOption('summary'),
         asynchronous=False,
         nocreate=True,
     )
    def replaceDeprecatedTemplates(self, text):
        """Replace deprecated templates by their successors.

        The (old, new) pairs come from
        deprecatedTemplates[family name][site code]; a pair whose new name
        is None replaces the old template by the empty string.
        """
        exceptions = ['comment', 'math', 'nowiki', 'pre']
        builder = _MultiTemplateMatchBuilder(self.site)

        family_name = self.site.family.name
        if family_name not in deprecatedTemplates:
            return text
        if self.site.code not in deprecatedTemplates[family_name]:
            return text

        for old, new in deprecatedTemplates[family_name][self.site.code]:
            substitute = '' if new is None else '{{%s}}' % new
            text = textlib.replaceExcept(text, builder.pattern(old),
                                         substitute, exceptions)
        return text
    def translateMagicWords(self, text):
        """Replace image option aliases by the site's first alias.

        Skipped on 'ru' (not wanted), 'arz' (uses english stylish codes) and
        'en' (no need to run on English wikis).
        """
        if self.site.code in ['arz', 'en', 'ru']:
            return text

        magicwords = [
            'img_thumbnail',
            'img_left',
            'img_center',
            'img_right',
            'img_none',
            'img_framed',
            'img_frameless',
            'img_border',
            'img_upright',
        ]

        def replace_magicword(match):
            parts = match.group().split('|')
            # push ']]' out and re-add below
            parts[-1] = parts[-1][:-2]
            for magicword in magicwords:
                aliases = list(self.site.getmagicwords(magicword))
                preferred = aliases.pop(0)
                if not aliases:
                    continue
                # the link target in parts[0] is never touched
                parts[1:] = [preferred if part.strip() in aliases else part
                             for part in parts[1:]]
            return '|'.join(parts) + ']]'

        exceptions = ['nowiki', 'comment', 'math', 'pre', 'source']
        file_link = re.compile(
            FILE_LINK_REGEX % '|'.join(self.site.namespaces[6]),
            flags=re.X)
        return textlib.replaceExcept(text, file_link, replace_magicword,
                                     exceptions)
Beispiel #50
0
 def __iter__(self):
     """Yield a page for every dump entry that a replacement would change.

     While self.skipping is set, entries are skipped until the one titled
     self.xmlStart is reached. Entries whose title or text is excepted are
     ignored. The replacements are only applied to a copy of the entry
     text to find out whether anything would change.

     On KeyboardInterrupt the command line option for resuming at the
     current entry is printed and the iteration ends.
     """
     try:
         for entry in self.parser:
             if self.skipping:
                 if entry.title != self.xmlStart:
                     continue
                 self.skipping = False
             if self.isTitleExcepted(entry.title) \
                     or self.isTextExcepted(entry.text):
                 continue
             new_text = entry.text
             for old, new in self.replacements:
                 # The site must go in by keyword: the fifth positional
                 # parameter of replaceExcept is caseInsensitive, so a
                 # positional site silently made matching case-insensitive.
                 new_text = textlib.replaceExcept(
                     new_text, old, new, self.excsInside, site=self.site)
             if new_text != entry.text:
                 yield pywikibot.Page(self.site, entry.title)
     except KeyboardInterrupt:
         try:
             if not self.skipping:
                 pywikibot.output(
                     u'To resume, use "-xmlstart:%s" on the command line.'
                     % entry.title)
         except NameError:
             # interrupted before the first entry was read: no title to show
             pass
Beispiel #51
0
    def fixArabicLetters(self, text: str) -> str:
        """Normalise Arabic-script letters and commas on ckb and fa sites.

        The text is returned unchanged on every other site.
        """
        if self.site.code not in ['ckb', 'fa']:
            return text

        exceptions = [
            'file',
            'gallery',
            'hyperlink',
            'interwiki',
            'inputbox',
            # FIXME: but changes letters inside wikilinks
            # 'link',
            'math',
            'pre',
            'template',
            'timeline',
            'ref',
            'startspace',
            'syntaxhighlight',
        ]  # type: List[Union[str, Pattern[str]]]

        digits = textlib.NON_LATIN_DIGITS
        faChrs = 'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + digits['fa']

        # not to let bot edits in latin content
        latin_comma = re.compile('[^{fa}] *?"*? *?, *?[^{fa}]'
                                 .format(fa=faChrs))
        exceptions.append(latin_comma)

        # (pattern, replacement) pairs, applied in this order
        fixes = [(',', '،')]
        if self.site.code == 'ckb':
            fixes += [
                ('\u0647([.\u060c_<\\]\\s])', '\u06d5\\1'),
                ('ه\u200c', 'ە'),
                ('ه', 'ھ'),
            ]
        fixes += [
            ('ك', 'ک'),
            ('[ىي]', 'ی'),
        ]
        for wrong, right in fixes:
            text = textlib.replaceExcept(text, wrong, right, exceptions,
                                         site=self.site)

        return text
Beispiel #52
0
    def removeEmptySections(self, text):
        """Remove sections that consist of a heading only.

        Empty sections are looked for in a copy of the text from which
        comments (and, on cs, the stub-section template) were removed, so a
        section holding nothing else counts as empty. A heading counts as
        empty when it is directly followed by another heading with the same
        number of equal signs. Each such section is then removed from the
        real text via textlib.replaceExcept.

        @param text: wikitext to clean up
        @return: the cleaned wikitext
        """
        exceptions = [
            'comment', 'pre', 'source', 'nowiki', 'code', 'startspace'
        ]

        skippings = ['comment']
        skip_regexes = _get_regexes(skippings, self.site)
        skip_templates = {
            'cs': ('Pahýl[ _]část', ),  # stub section
        }
        if self.site.code in skip_templates:
            for template in skip_templates[self.site.code]:
                # template names are regex fragments on purpose
                skip_regexes.append(
                    re.compile(r'\{\{\s*%s\s*\}\}' % template, re.I))
        stripped_text = str(text)
        for reg in skip_regexes:
            stripped_text = reg.sub(r'', stripped_text)

        stripped_pattern = re.compile(
            r'\n((=+) *[^\n=]+? *\2) *\n\s*(?=(\2 *[^\n=]+? *\2))')
        # finditer walks the text strictly forwards. The former manual loop
        # searched a slice but stored the slice-relative match end as the
        # next absolute start, which could move backwards and never end.
        for match in stripped_pattern.finditer(stripped_text):
            # The headings are literal text, so escape them: a title such
            # as "C++" or "Foo (bar)" must not be read as regex syntax.
            pattern = re.compile(
                r'\n{}.+?(?={})'.format(re.escape(match.group(1)),
                                        re.escape(match.group(3))),
                re.DOTALL)
            text = textlib.replaceExcept(text,
                                         pattern,
                                         r'\n',
                                         exceptions=exceptions)

        return text
Beispiel #53
0
    def fixArabicLetters(self, text):
        """Normalise Arabic-script letters and commas on ckb and fa sites.

        The text is returned unchanged on every other site.

        Replacing persian/arabic digits is not done here: that part was
        deactivated due to bug 55185 and should become a function of its
        own if it is ever revived.

        @param text: wikitext to fix
        @return: the fixed wikitext
        """
        if self.site.code not in ['ckb', 'fa']:
            return text
        exceptions = [
            'gallery',
            'file',
            'hyperlink',
            'interwiki',
            # FIXME: but changes letters inside wikilinks
            # 'link',
            'math',
            'pre',
            'template',
            'timeline',
            'ref',
            'source',
            'startspace',
            'inputbox',
        ]
        # FIXME: use textlib.NON_LATIN_DIGITS
        fa_digits = u'۰۱۲۳۴۵۶۷۸۹'
        faChrs = u'ءاآأإئؤبپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیةيك' + fa_digits
        # not to let bot edits in latin content
        exceptions.append(
            re.compile(u"[^%(fa)s] *?\"*? *?, *?[^%(fa)s]" % {'fa': faChrs}))
        text = textlib.replaceExcept(text,
                                     ',',
                                     '،',
                                     exceptions,
                                     site=self.site)
        if self.site.code == 'ckb':
            text = textlib.replaceExcept(text,
                                         '\u0647([.\u060c_<\\]\\s])',
                                         '\u06d5\\1',
                                         exceptions,
                                         site=self.site)
            text = textlib.replaceExcept(text,
                                         'ه\u200c',
                                         'ە',
                                         exceptions,
                                         site=self.site)
            text = textlib.replaceExcept(text,
                                         'ه',
                                         'ھ',
                                         exceptions,
                                         site=self.site)
        text = textlib.replaceExcept(text,
                                     'ك',
                                     'ک',
                                     exceptions,
                                     site=self.site)
        text = textlib.replaceExcept(text,
                                     '[ىي]',
                                     'ی',
                                     exceptions,
                                     site=self.site)

        return text
Beispiel #54
0
    def fixSyntaxSave(self, text):
        """Turn weblinks to this wiki into wikilinks and fix link syntax."""
        def replace_link(match):
            """Build the wikilink that replaces one matched weblink."""
            target = match.group('link')
            label = match.group('title')
            if label:
                return '[[' + target + '|' + label + ']]'
            return '[[' + target + ']]'

        exceptions = [
            'nowiki', 'comment', 'math', 'pre', 'source', 'startspace'
        ]
        # link to the wiki working on
        # Only use suffixes for article paths
        for suffix in self.site._interwiki_urls(True):
            http_url = self.site.base_url(suffix, 'http')
            https_url = (None if self.site.protocol() == 'http'
                         else self.site.base_url(suffix, 'https'))
            # compare strings without the protocol, if they are empty support
            # also no prefix (//en.wikipedia.org/…)
            if https_url is not None and http_url[4:] == https_url[5:]:
                urls = ['(?:https?:)?' + re.escape(http_url[5:])]
            else:
                urls = [
                    re.escape(url) for url in (http_url, https_url)
                    if url is not None
                ]
            # Only include links which don't include the separator as
            # the wikilink won't support additional parameters
            separator = '?&' if '?' in suffix else '?'
            for url in urls:
                # Match first a non space in the title to prevent that multiple
                # spaces at the end without title will be matched by it
                weblink = (r'\[\[?' + url + r'(?P<link>[^' + separator
                           + r']+?)'
                           r'(\s+(?P<title>[^\s].*?))?\s*\]\]?')
                text = textlib.replaceExcept(
                    text, weblink, replace_link, exceptions, site=self.site)

        # external link in/starting with double brackets
        text = textlib.replaceExcept(text,
                                     r'\[\[(?P<url>https?://[^\]]+?)\]\]?',
                                     r'[\g<url>]',
                                     exceptions,
                                     site=self.site)

        # external link and description separated by a pipe, with
        # whitespace in front of the pipe, so that it is clear that
        # the pipe is not a legitimate part of the URL.
        piped_after_space = (
            r'\[(?P<url>https?://[^\|\] \r\n]+?) +\| *(?P<label>[^\|\]]+?)\]')
        text = textlib.replaceExcept(
            text, piped_after_space, r'[\g<url> \g<label>]', exceptions)

        # pipe in external link, where the correct end of the URL can
        # be detected from the file extension. It is very unlikely that
        # this will cause mistakes.
        extensions = [
            r'\.{0}'.format(ext)
            for ext in ['pdf', 'html?', 'php', 'aspx?', 'jsp']
        ]
        piped_after_extension = (
            r'\[(?P<url>https?://[^\|\] ]+?(' + '|'.join(extensions) + r')) *'
            r'\| *(?P<label>[^\|\]]+?)\]')
        text = textlib.replaceExcept(
            text, piped_after_extension, r'[\g<url> \g<label>]', exceptions)
        return text
Beispiel #55
0
    def cleanUpLinks(self, text):
        """Tidy up wikilinks found in a string.

        This function will:
        * Replace underscores with spaces

        * Move leading and trailing spaces out of the wikilink and into the
          surrounding text

        * Convert URL-encoded characters into Unicode-encoded characters

        * Move trailing characters out of the link and make the link without
          using a pipe, if possible

        * Capitalize the article title of the link, if appropriate
          (currently only on wikipedia:de, and only for links that keep
          their pipe)

        Only links whose target resolves to namespace 0 are rewritten.
        Interwiki links, links with an invalid title, links into other
        namespaces and links whose title becomes empty are left unchanged.
        Links inside comments, math, nowiki, pre and lines starting with a
        space are excluded from the replacement.

        @param text: string to perform the clean-up on
        @type text: str
        @return: text with tidied wikilinks
        @rtype: str
        """

        # helper function which works on one link and either returns it
        # unmodified, or returns a replacement.
        def handleOneLink(match):
            titleWithSection = match.group('titleWithSection')
            label = match.group('label')
            trailingChars = match.group('linktrail')
            newline = match.group('newline')

            try:
                is_interwiki = self.site.isInterwikiLink(titleWithSection)
            except ValueError:  # T111513
                # Treat the link as interwiki so that it is left untouched.
                is_interwiki = True

            if not is_interwiki:
                # The link looks like this:
                # [[page_title|link_text]]trailing_chars
                # We only work on namespace 0 because pipes and linktrails work
                # differently for images and categories.
                page = pywikibot.Page(
                    pywikibot.Link(titleWithSection, self.site))
                try:
                    namespace = page.namespace()
                except pywikibot.InvalidTitle:
                    # Unparsable title: keep the original link text.
                    return match.group()
                if namespace == 0:
                    # Replace underlines by spaces, also multiple underlines
                    titleWithSection = re.sub('_+', ' ', titleWithSection)
                    # Remove double spaces
                    titleWithSection = re.sub('  +', ' ', titleWithSection)
                    # Remove unnecessary leading spaces from title,
                    # but remember if we did this because we eventually want
                    # to re-add it outside of the link later.
                    titleLength = len(titleWithSection)
                    titleWithSection = titleWithSection.lstrip()
                    hadLeadingSpaces = (len(titleWithSection) != titleLength)
                    hadTrailingSpaces = False
                    # Remove unnecessary trailing spaces from title,
                    # but remember if we did this because it may affect
                    # the linktrail and because we eventually want to
                    # re-add it outside of the link later.
                    if not trailingChars:
                        titleLength = len(titleWithSection)
                        titleWithSection = titleWithSection.rstrip()
                        hadTrailingSpaces = (len(titleWithSection) !=
                                             titleLength)

                    # Convert URL-encoded characters to unicode
                    from pywikibot.page import url2unicode
                    titleWithSection = url2unicode(titleWithSection,
                                                   encodings=self.site)

                    if titleWithSection == '':
                        # just skip empty links.
                        return match.group()

                    # Remove unnecessary initial and final spaces from label.
                    # Please note that some editors prefer spaces around pipes.
                    # (See [[en:Wikipedia:Semi-bots]]). We remove them anyway.
                    if label is not None:
                        # Remove unnecessary leading spaces from label,
                        # but remember if we did this because we want
                        # to re-add it outside of the link later.
                        # Note: for piped links this overwrites the flags
                        # computed from the title above, so only spaces
                        # stripped from the label are re-added outside.
                        labelLength = len(label)
                        label = label.lstrip()
                        hadLeadingSpaces = (len(label) != labelLength)
                        # Remove unnecessary trailing spaces from label,
                        # but remember if we did this because it affects
                        # the linktrail.
                        if not trailingChars:
                            labelLength = len(label)
                            label = label.rstrip()
                            hadTrailingSpaces = (len(label) != labelLength)
                    else:
                        # Unpiped link: the visible text is the title itself.
                        label = titleWithSection
                    if trailingChars:
                        # The linktrail is displayed as part of the label.
                        label += trailingChars

                    # On 'first-letter' sites the comparison below ignores
                    # the case of the first character of title and label.
                    if self.site.siteinfo['case'] == 'first-letter':
                        firstcase_title = first_lower(titleWithSection)
                        firstcase_label = first_lower(label)
                    else:
                        firstcase_title = titleWithSection
                        firstcase_label = label

                    if firstcase_label == firstcase_title:
                        # Label and title agree: no pipe is needed.
                        newLink = '[[%s]]' % label
                    # Check if we can create a link with trailing characters
                    # instead of a pipelink
                    # (the part of the label after the title must be fully
                    # consumed by the site's linktrail pattern)
                    elif (firstcase_label.startswith(firstcase_title) and
                          trailR.sub('', label[len(titleWithSection):]) == ''):
                        newLink = '[[%s]]%s' % (label[:len(titleWithSection)],
                                                label[len(titleWithSection):])

                    else:
                        # Try to capitalize the first letter of the title.
                        # Not useful for languages that don't capitalize nouns.
                        # TODO: Add a configuration variable for each site,
                        # which determines if the link target is written in
                        # uppercase
                        if self.site.sitename == 'wikipedia:de':
                            titleWithSection = first_upper(titleWithSection)
                        newLink = "[[%s|%s]]" % (titleWithSection, label)
                    # re-add spaces that were pulled out of the link.
                    # Examples:
                    #   text[[ title ]]text        -> text [[title]] text
                    #   text[[ title | name ]]text -> text [[title|name]] text
                    #   text[[ title |name]]text   -> text[[title|name]]text
                    #   text[[title| name]]text    -> text [[title|name]]text
                    if hadLeadingSpaces and not newline:
                        newLink = ' ' + newLink
                    if hadTrailingSpaces:
                        newLink = newLink + ' '
                    if newline:
                        # The newlines captured before the link are part of
                        # the match and must be put back in front of it.
                        newLink = newline + newLink
                    return newLink
            # don't change anything
            return match.group()

        # Pattern of the characters that count as linktrail on this site;
        # also used inside handleOneLink.
        trailR = re.compile(self.site.linktrail())
        # The regular expression which finds links. Results consist of four groups:
        # group <newline> depends whether the links starts with a new line.
        # group <titleWithSection> is the page title and section, that is,
        # everything before | or ]. It'll include the # to make life easier for us.
        # group <label> is the alternative link title between | and ].
        # group <linktrail> is the link trail after ]] which are part of the word.
        # note that the definition of 'letter' varies from language to language.
        linkR = re.compile(
            r'(?P<newline>[\n]*)\[\[(?P<titleWithSection>[^\]\|]+)'
            r'(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' +
            self.site.linktrail() + ')')

        text = textlib.replaceExcept(
            text, linkR, handleOneLink,
            ['comment', 'math', 'nowiki', 'pre', 'startspace'])
        return text
    def transferImage(self, sourceImagePage):
        """
        Copy a file and its description page to the target site.

        The description is read from the source page, known license
        templates are swapped for their target-site counterparts, the
        text is wrapped in the translated 'imagetransfer-file_page_message'
        and the file version history table is appended. The file is then
        uploaded through an UploadRobot. After a successful upload to
        Commons the source file is either deleted (if a sysop account is
        configured) or tagged with the nowCommons template.

        @param sourceImagePage: file page on the source wiki
        @return: None on every path; the uploaded file name is only used
            internally and is not handed back to the caller
        """
        source_site = sourceImagePage.site
        file_url = sourceImagePage.fileUrl().encode('utf-8')
        pywikibot.output(u"URL should be: %s" % file_url)

        # Build the localized text for the new description page. Without a
        # usable description page nothing is uploaded.
        try:
            page_text = sourceImagePage.get()
            # try to translate license templates
            site_pair = (source_site.sitename, self.targetSite.sitename)
            if site_pair in licenseTemplates:
                for old_name, new_name in licenseTemplates[site_pair].items():
                    replacement = '{{%s}}' % new_name
                    pattern = re.compile('{{%s}}' % old_name)
                    page_text = textlib.replaceExcept(
                        page_text, pattern, replacement,
                        ['comment', 'math', 'nowiki', 'pre'])

            page_text = i18n.twtranslate(
                self.targetSite,
                'imagetransfer-file_page_message',
                {'site': source_site, 'description': page_text})
            page_text += '\n\n'
            page_text += sourceImagePage.getFileVersionHistoryTable()
            # add interwiki link
            if source_site.family == self.targetSite.family:
                page_text += u'\r\n\r\n{0}'.format(sourceImagePage)
        except pywikibot.NoPage:
            pywikibot.output(
                'Image does not exist or description page is empty.')
            return
        except pywikibot.IsRedirectPage:
            pywikibot.output('Image description page is redirect.')
            return

        uploader = UploadRobot(url=file_url,
                               description=page_text,
                               targetSite=self.targetSite,
                               urlEncoding=source_site.encoding(),
                               keepFilename=self.keep_name,
                               verifyDescription=not self.keep_name,
                               ignoreWarning=self.ignore_warning)
        # try to upload
        uploaded_name = uploader.run()

        # Follow-up work is only needed after a successful upload to Commons.
        if not (uploaded_name
                and self.targetSite.family.name == 'commons'
                and self.targetSite.code == 'commons'):
            return

        reason = i18n.twtranslate(source_site,
                                  'imagetransfer-nowcommons_notice')
        family_name = source_site.family.name
        # try to delete the original image if we have a sysop account
        if (family_name in config.sysopnames
                and source_site.lang in config.sysopnames[family_name]
                and sourceImagePage.delete(reason)):
            return

        # Deletion was not possible: tag the source page instead, provided a
        # template and a user account exist for this wiki.
        can_tag = (source_site.lang in nowCommonsTemplate
                   and family_name in config.usernames
                   and source_site.lang in config.usernames[family_name])
        if can_tag:
            pywikibot.output(u'Adding nowCommons template to %s' %
                             sourceImagePage.title())
            tagged_text = (sourceImagePage.get() + '\n\n' +
                           nowCommonsTemplate[source_site.lang]
                           % uploaded_name)
            sourceImagePage.put(tagged_text, summary=reason)
def check_titles(site, report_page_name, replacements):
    """
    Protect page titles from being broken by the given replacements.

    Every page title of namespace 0 is run through each replacement. When a
    replacement would turn an existing title into a different one (and the
    page is not a disambiguation page), the title is appended to the
    replacement's 'inside' exceptions so later runs leave it alone. The
    affected titles are finally written to a report page.

    :param site: site where the bot will run
    :param report_page_name: title of the page the report is saved to
    :param replacements: dictionary of replacements; each value must offer
        ``old_regex``, ``new`` and a writable ``exceptions`` attribute, and
        each key must be convertible with ``int()``
    """
    from pywikibot import textlib
    from pywikibot.tools import itergroup

    # Titles are joined into one text so that one replaceExcept call per
    # replacement handles a whole batch; the separator splits it back.
    separator = ' \n '
    all_pages = site.allpages(namespace=0, filterredir=False, content=False)
    evaluation_progress = 0
    exceptions_dict = {}
    for titles_group in itergroup(all_pages, all_pages.query_limit):
        old_titles = [
            p.title(as_link=True, with_section=False) for p in titles_group
        ]
        evaluation_progress += len(old_titles)
        if evaluation_progress % 20000 == 0:
            print('\r%i page titles processed' % evaluation_progress)
        old_text = separator.join(old_titles)
        for replacement_key, replacement in replacements.items():
            replacement_exceptions = replacement.exceptions or {}
            replacement_exceptions_inside = replacement_exceptions.get(
                'inside', [])
            new_text = textlib.replaceExcept(old_text,
                                             replacement.old_regex,
                                             replacement.new,
                                             replacement_exceptions_inside,
                                             site=site)

            changed_titles = []
            for old_title, new_title in zip(old_titles,
                                            new_text.split(separator)):
                if old_title == new_title:
                    continue
                # A change that only affects the case of the first letter
                # is skipped ("breaks link" check of the original code).
                if old_title == '[[%s' % pywikibot.tools.first_upper(
                        new_title[2:]):
                    continue
                # Strip the surrounding [[ ]] and test the bare title as a
                # plain word: no special treatment for the link syntax.
                bare_title = old_title[2:-2]
                padded_title = ' %s ' % bare_title
                if replacement.old_regex.sub(replacement.new,
                                             padded_title) == padded_title:
                    continue
                # valid title is not disambig
                if pywikibot.Page(site, bare_title).isDisambig():
                    continue
                changed_titles.append(bare_title)

            if not changed_titles:
                continue

            # Protect the title both as a piped link target and as plain
            # text. Raw string: '\[' and '\|' are not valid string escapes.
            changed_titles_exceptions = [
                re.compile(
                    r'\[\[%s\|.+?\]\]|%s' %
                    (re.escape(title), re.escape(title)), re.U)
                for title in changed_titles
            ]
            replacement_exceptions['inside'] = (
                replacement_exceptions_inside + changed_titles_exceptions)
            replacement.exceptions = replacement_exceptions
            exceptions_dict.setdefault(replacement_key, []).extend(
                changed_titles)

    # Report the protected titles ordered by numeric replacement key.
    exceptions_dict = OrderedDict(
        sorted((int(k), v) for k, v in exceptions_dict.items()))
    report_page = pywikibot.Page(site, report_page_name)
    exception_report = ''.join(
        '\n* %i\n%s' % (replace_key, '\n'.join(
            '** [[%s]]' % t for t in replaced_titles))
        for replace_key, replaced_titles in exceptions_dict.items())
    # The edit summary is Hebrew for "update".
    report_page.put(exception_report, summary='עדכון')
Beispiel #58
0
    def commonsfiledesc(self, text):
        """
        Clean up file descriptions on the Wikimedia Commons.

        It is working according to [1] and works only on pages in the file
        namespace on the Wikimedia Commons.

        [1]:
        https://commons.wikimedia.org/wiki/Commons:Tools/pywiki_file_description_cleanup

        @param text: wikitext of the page
        @return: the cleaned-up text; the unchanged input when the page is
            skipped (so callers can always rebind ``text`` to the result)
        """
        # NOTE(review): the docstring says the clean-up applies to the file
        # namespace only, yet this guard *skips* namespace 6. The condition
        # is kept as found because flipping it changes which pages get
        # edited - confirm the intended behavior.
        if self.site.sitename != 'commons:commons' or self.namespace == 6:
            # Return the input instead of None: every other path returns str.
            return text
        # section headers to {{int:}} versions
        # (the trailing positional True passed to replaceExcept is presumably
        # its case-insensitivity switch - TODO confirm against textlib)
        exceptions = ['comment', 'includeonly', 'math', 'noinclude', 'nowiki',
                      'pre', 'source', 'ref', 'timeline']
        text = textlib.replaceExcept(text,
                                     r"([\r\n]|^)\=\= *Summary *\=\=",
                                     r"\1== {{int:filedesc}} ==",
                                     exceptions, True)
        text = textlib.replaceExcept(
            text,
            r"([\r\n])\=\= *\[\[Commons:Copyright tags\|Licensing\]\]: *\=\=",
            r"\1== {{int:license-header}} ==", exceptions, True)
        text = textlib.replaceExcept(
            text,
            r'([\r\n])'
            r'\=\= *(Licensing|License information|{{int:license}}) *\=\=',
            r"\1== {{int:license-header}} ==", exceptions, True)

        # frequent field values to {{int:}} versions
        text = textlib.replaceExcept(
            text,
            r'([\r\n]\|[Ss]ource *\= *)'
            r'(?:[Oo]wn work by uploader|[Oo]wn work|[Ee]igene [Aa]rbeit) *'
            r'([\r\n])',
            r'\1{{own}}\2', exceptions, True)
        # drop "see below" placeholders from the Permission field
        text = textlib.replaceExcept(
            text,
            r'(\| *Permission *\=) *(?:[Ss]ee below|[Ss]iehe unten) *([\r\n])',
            r'\1\2', exceptions, True)

        # added to transwikied pages
        text = textlib.replaceExcept(text, r'__NOTOC__', '', exceptions, True)

        # tracker element for js upload form
        # The first pattern is itself an HTML comment, so 'comment' (the
        # first entry of ``exceptions``) is left out of the exception list.
        text = textlib.replaceExcept(
            text,
            r'<!-- *{{ImageUpload\|(?:full|basic)}} *-->',
            '', exceptions[1:], True)
        text = textlib.replaceExcept(text, r'{{ImageUpload\|(?:basic|full)}}',
                                     '', exceptions, True)

        # duplicated section headers
        text = textlib.replaceExcept(
            text,
            r'([\r\n]|^)\=\= *{{int:filedesc}} *\=\=(?:[\r\n ]*)\=\= *'
            r'{{int:filedesc}} *\=\=',
            r'\1== {{int:filedesc}} ==', exceptions, True)
        text = textlib.replaceExcept(
            text,
            r'([\r\n]|^)\=\= *{{int:license-header}} *\=\=(?:[\r\n ]*)'
            r'\=\= *{{int:license-header}} *\=\=',
            r'\1== {{int:license-header}} ==', exceptions, True)
        return text
Beispiel #59
0
    def transfer_image(self, sourceImagePage):
        """
        Copy a file and its description page to the target site.

        The description is read from the source page, known license
        templates are swapped for their target-site counterparts, the
        text is wrapped in the translated 'imagetransfer-file_page_message'
        and the file version history table is appended. The file is then
        uploaded through an UploadRobot unless the robot asks to skip the
        run. After a successful upload to Commons the source file is either
        deleted (if the account has the 'delete' right) or tagged with the
        nowCommons template.

        @param sourceImagePage: file page on the source wiki
        @return: None on every path; the uploaded file name is only used
            internally and is not handed back to the caller
        """
        source_site = sourceImagePage.site
        banner = '\n>>> Transfer {source} from {source.site} to {target}\n'
        pywikibot.output(
            banner.format(source=sourceImagePage, target=self.opt.target))
        file_url = sourceImagePage.get_file_url()
        pywikibot.output('URL should be: ' + file_url)

        # Build the localized text for the new description page. Without a
        # usable description page nothing is uploaded.
        try:
            page_text = sourceImagePage.get()
            # try to translate license templates
            site_pair = (source_site.sitename, self.opt.target.sitename)
            if site_pair in licenseTemplates:
                for old_name, new_name in licenseTemplates[site_pair].items():
                    replacement = '{{%s}}' % new_name
                    pattern = re.compile('{{%s}}' % old_name)
                    page_text = textlib.replaceExcept(
                        page_text, pattern, replacement,
                        ['comment', 'math', 'nowiki', 'pre'])

            page_text = i18n.twtranslate(
                self.opt.target,
                'imagetransfer-file_page_message',
                {'site': source_site, 'description': page_text})
            page_text += '\n\n'
            page_text += sourceImagePage.getFileVersionHistoryTable()
            # add interwiki link
            if source_site.family == self.opt.target.family:
                page_text += '\n\n{}'.format(sourceImagePage)
        except NoPageError:
            pywikibot.output(
                'Image does not exist or description page is empty.')
            return
        except IsRedirectPageError:
            pywikibot.output('Image description page is redirect.')
            return

        uploader = UploadRobot(url=file_url,
                               description=page_text,
                               target_site=self.opt.target,
                               url_encoding=source_site.encoding(),
                               keep_filename=self.opt.keepname,
                               verify_description=not self.opt.keepname,
                               ignore_warning=self.opt.ignore_warning)

        # try to upload
        if uploader.skip_run():
            return
        uploaded_name = uploader.upload_file(file_url)

        # Follow-up work is only needed after a successful upload to Commons.
        if not uploaded_name \
           or self.opt.target.sitename != 'commons:commons':
            return

        reason = i18n.twtranslate(source_site,
                                  'imagetransfer-nowcommons_notice')
        # try to delete the original image if we have a sysop account
        if source_site.has_right('delete') \
           and sourceImagePage.delete(reason):
            return

        # Deletion was not possible: tag the source page instead, provided a
        # template and a user account exist for this wiki.
        can_tag = (source_site.lang in nowCommonsTemplate
                   and source_site.family.name in config.usernames
                   and source_site.lang
                   in config.usernames[source_site.family.name])
        if can_tag:
            pywikibot.output('Adding nowCommons template to ' +
                             sourceImagePage.title())
            tagged_text = (sourceImagePage.get() + '\n\n' +
                           nowCommonsTemplate[source_site.lang]
                           % uploaded_name)
            sourceImagePage.put(tagged_text, summary=reason)
Beispiel #60
0
    def process(self, text):
        """Merge duplicated ``<ref>`` tags in *text* into named references.

        All non-empty references matched by ``self.REFS`` are collected per
        reference content. For every content that occurs more than once
        (or whose name is already taken by other content) the first
        occurrence becomes ``<ref name=...>content</ref>`` and the later
        ones become ``<ref name=... />``. References without a usable name
        get one built from ``self.autogen`` plus the lowest unused number.
        Finally, self-closing references that used a superseded name are
        pointed at the surviving name.

        @param text: wikitext to process
        @return: the text with duplicated references merged
        """
        # keys are ref groups
        # values are a dict where :
        #   keys are ref content
        #   values are [name, [list of full ref matches],
        #               quoted, need_to_change]
        found_refs = {}
        # every ref name met so far
        found_ref_names = set()
        # Replace key by [value, quoted]
        named_repl = {}

        for match in self.REFS.finditer(text):
            content = match.group('content')
            if not content.strip():
                continue

            params = match.group('params')
            # NOTE(review): the bucket key is the match object itself (or
            # None). Match objects compare by identity, so two refs carrying
            # a group attribute never land in the same bucket and are
            # therefore never merged. Keying by the group *name* needs the
            # GROUPS pattern, which is not visible here - left unchanged.
            group = self.GROUPS.match(params)
            groupdict = found_refs.setdefault(group, {})
            if content in groupdict:
                v = groupdict[content]
                v[1].append(match.group())
            else:
                v = [None, [match.group()], False, False]

            found = self.NAMES.match(params)
            if found:
                quoted = found.group('quote') in ['"', "'"]
                name = found.group('name')
                if v[0]:
                    if v[0] != name:
                        # same content under a second name: remember to
                        # redirect that name to the first one
                        named_repl[name] = [v[0], v[2]]
                elif name not in found_ref_names:
                    # First name associated with this content, and the
                    # first time ever we meet this name
                    v[2] = quoted
                    v[0] = name
                else:
                    # this name is already used with another content.
                    # We'll need to change it
                    v[3] = True
                found_ref_names.add(name)
            groupdict[content] = v

        # numbers already consumed by auto-generated names
        used_numbers = set()
        for name in found_ref_names:
            number = removeprefix(name, self.autogen)
            with suppress(ValueError):
                used_numbers.add(int(number))

        # iterator to give the next free number, lowest first. A generator
        # (not a set) keeps the order ascending and reproducible.
        free_number = (str(i) for i in range(1, 1000)  # should be enough
                       if i not in used_numbers)

        for (g, d) in found_refs.items():
            group = ''
            if g:
                # Assumes GROUPS exposes the group name as a named group
                # 'group' (mirroring NAMES' 'name') - TODO confirm. Without
                # it the attribute stays empty, as it always was before.
                group_name = g.groupdict().get('group') or ''
                group = 'group="{}" '.format(group_name)

            for (k, v) in d.items():
                # a single occurrence with a free name needs no change
                if len(v[1]) == 1 and not v[3]:
                    continue

                name = v[0]
                if not name:
                    name = '"{}{}"'.format(self.autogen, next(free_number))
                elif v[2]:
                    name = '"{}"'.format(name)

                named = '<ref {}name={}>{}</ref>'.format(group, name, k)
                text = text.replace(v[1][0], named, 1)

                # make sure that the first (named ref) is not
                # removed later :
                pos = text.index(named) + len(named)
                header = text[:pos]
                end = text[pos:]

                unnamed = '<ref {}name={} />'.format(group, name)
                for ref in v[1][1:]:
                    # Don't replace inside templates (T266411)
                    end = replaceExcept(end, re.escape(ref), unnamed,
                                        exceptions=['template'])
                text = header + end

        for (k, v) in named_repl.items():
            # TODO : Support ref groups
            name = v[0]
            if v[1]:
                name = '"{}"'.format(name)

            # The old name is literal text, so it is escaped before it goes
            # into the pattern; the function replacement keeps backslashes
            # in the new name from being read as group references.
            new_ref = '<ref name={} />'.format(name)
            text = re.sub(
                r'<ref name\s*=\s*(?P<quote>["\']?)\s*{}\s*(?P=quote)\s*/>'
                .format(re.escape(k)),
                lambda _match, new_ref=new_ref: new_ref, text)
        return text