Example no. 1
def weblinksIn(text, withoutBracketed=False, onlyBracketed=False):
    text = textlib.removeDisabledParts(text)

    # MediaWiki parses templates before parsing external links. Thus, there
    # might be a | or a } directly after a URL which does not belong to
    # the URL itself.

    # First, remove the curly braces of inner templates:
    nestedTemplateR = re.compile(r'{{([^}]*?){{(.*?)}}(.*?)}}')
    while nestedTemplateR.search(text):
        text = nestedTemplateR.sub(r'{{\1 \2 \3}}', text)

    # Then blow up the templates with spaces so that the | and }} will not
    # be regarded as part of the link.
    templateWithParamsR = re.compile(r'{{([^}]*?[^ ])\|([^ ][^}]*?)}}',
                                     re.DOTALL)
    while templateWithParamsR.search(text):
        text = templateWithParamsR.sub(r'{{ \1 | \2 }}', text)

    # Add a blank before the closing braces of each template; otherwise a URL
    # that is the last parameter of a multiline template would not be parsed
    # correctly.
    text = text.replace('}}', ' }}')

    # Remove HTML comments in URLs as well as URLs in HTML comments.
    # Also remove text inside nowiki links etc.
    text = textlib.removeDisabledParts(text)
    linkR = textlib.compileLinkR(withoutBracketed, onlyBracketed)
    for m in linkR.finditer(text):
        if m.group('url'):
            yield m.group('url')
        else:
            yield m.group('urlb')
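A minimal driver for the generator above; this is only a sketch, assuming re and pywikibot's textlib are imported as in the original module, and the sample wikitext is made up:

sample = 'See [http://example.org/docs the docs] and http://example.org/faq for details.'
for url in weblinksIn(sample):
    print(url)
# expected to yield roughly 'http://example.org/docs' and 'http://example.org/faq'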
Example no. 3
    def loadTypos(self):
        pywikibot.output('Loading typo rules')
        self.typoRules = []

        if self.typos_page_name is None:
            self.typos_page_name = 'Wikipedie:WPCleaner/Typo'
        typos_page = pywikibot.Page(self.site, self.typos_page_name)
        if not typos_page.exists():
            # todo: feedback
            return

        text = textlib.removeDisabledParts(
            typos_page.text, include=['nowiki'], site=self.site)
        load_all = self.load_all is True
        for template, fielddict in textlib.extract_templates_and_params(
                text, remove_disabled_parts=False, strip=False):
            if template.lower() == 'typo':
                try:
                    rule = TypoRule.newFromParameters(fielddict, self.site)
                except IncompleteTypoRuleException as exc:
                    pywikibot.warning(exc.message)  # pwb.exception?
                except InvalidExpressionException as exc:
                    if 'fixed-width' not in exc.message:
                        pywikibot.warning('Invalid %s %s: %s' % (
                            exc.aspect, fielddict['1'], exc.message))
                else:
                    rule.id = self.top_id
                    # fixme: cvar or ivar?
                    self.top_id += 1
                    if load_all or not rule.needs_decision():
                        self.typoRules.append(rule)

        pywikibot.output('%d typo rules loaded' % len(self.typoRules))
        return self.typoRules
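For orientation, a rough illustration of what textlib.extract_templates_and_params returns to the loop above; the wikitext here is hypothetical:

from pywikibot import textlib

pairs = textlib.extract_templates_and_params(
    '{{Typo|1=foo|2=bar}}', remove_disabled_parts=False, strip=False)
# pairs is roughly [('Typo', {'1': 'foo', '2': 'bar'})]: the code above matches
# on the template name and builds each rule from the parameter dict.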
Example no. 4
    def find_discussion(self, category: pywikibot.Category) -> 'CfdPage':
        """
        Return the relevant discussion.

        @param category: The category being discussed
        """
        if self.section():
            return self
        text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
        wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
        for section in wikicode.get_sections(levels=[4]):
            heading = section.filter_headings()[0]
            section_title = str(heading.title).strip()
            discussion = self.__class__(
                self.site, '{}#{}'.format(self.title(), section_title))
            if category.title() == section_title:
                return discussion
            # Split approximately into close, nom, and others.
            parts = str(section).split('(UTC)')
            if len(parts) < 3:
                continue
            # Parse the nom for category links.
            nom = mwparserfromhell.parse(parts[1], skip_style_tags=True)
            for node in nom.ifilter():
                page = self._cat_from_node(node)
                if page and category == page:
                    return discussion
        return self
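A small sketch of the mwparserfromhell calls this method relies on, using made-up wikitext:

import mwparserfromhell

code = mwparserfromhell.parse('==== Category:Foo ====\nDiscussion text ... 12:00 (UTC)')
for section in code.get_sections(levels=[4]):
    heading = section.filter_headings()[0]
    print(str(heading.title).strip())  # 'Category:Foo'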
Example no. 5
 def find_discussion(self, category):
     """Find the section with the relevant discussion."""
     if self.section():
         return self.title(as_link=True)
     text = removeDisabledParts(self.text, site=self.site)
     wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
     for section in wikicode.get_sections(levels=[4]):
         heading = section.filter(forcetype=Heading)[0]
         section_title = str(heading.title).strip()
         discussion = '[[{}#{}]]'.format(self.title(), section_title)
         if category.title() == section_title:
             return discussion
         # Split approximately into close, nom, and others
         parts = str(section).split('(UTC)')
         if len(parts) < 3:
             continue
         # Parse the nom for links
         for wikilink in pywikibot.link_regex.finditer(parts[1]):
             title = wikilink.group('title').strip().split('#')[0]
             if not title:
                 continue
             title = pywikibot.Page(self.site, title).title()
             if category.title() == title:
                 return discussion
     return self.title(as_link=True)
Example no. 6
 def __iter__(self):
     from pywikibot import xmlreader
     dump = xmlreader.XmlDump(self.xmlFilename)
     for entry in dump.parse():
         text = textlib.removeDisabledParts(entry.text)
         if self.refR.search(text) and not self.referencesR.search(text):
             yield pywikibot.Page(pywikibot.Site(), entry.title)
Example no. 7
    def replace_gallery_files(
            self, wikicode: mwparserfromhell.wikicode.Wikicode) -> None:
        """
        Replace files in <gallery>.

        :param wikicode: Parsed wikitext
        """
        for tag in wikicode.ifilter_tags():
            if tag.tag.lower() != "gallery":
                continue
            lines = str(tag.contents).splitlines()
            for i, line in enumerate(lines):
                title, sep, caption = removeDisabledParts(line).partition("|")
                if not title:
                    continue
                try:
                    current_icon = BSiconPage(self.current_page.site, title)
                    current_icon.title()
                except (pywikibot.exceptions.Error, ValueError):
                    continue
                new_icon = self.opt.bsicons_map.get(current_icon, None)
                if new_icon:
                    lines[i] = f"{new_icon.title()}{sep}{caption}"
                    self.current_page.replacements.add(
                        Replacement(current_icon, new_icon))
            if self.current_page.replacements:
                tag.contents = "\n".join(lines) + "\n"
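For reference, what the partition('|') call above produces for a typical <gallery> line; the file name is hypothetical:

line = 'File:BSicon KBHFa.svg|caption text'
title, sep, caption = line.partition('|')
print((title, sep, caption))  # ('File:BSicon KBHFa.svg', '|', 'caption text')
# A line without '|' gives (whole line, '', ''), so sep and caption stay empty.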
Example no. 8
 def lacksReferences(self, text):
     """Check whether or not the page is lacking a references tag."""
     oldTextCleaned = textlib.removeDisabledParts(text)
     if self.referencesR.search(oldTextCleaned) or \
        self.referencesTagR.search(oldTextCleaned):
         if self.getOption('verbose'):
             pywikibot.output(
                 u'No changes necessary: references tag found.')
         return False
     elif self.referencesTemplates:
         templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
         if re.search(templateR, oldTextCleaned,
                      re.IGNORECASE | re.UNICODE):
             if self.getOption('verbose'):
                 pywikibot.output(
                     u'No changes necessary: references template found.')
             return False
     if not self.refR.search(oldTextCleaned):
         if self.getOption('verbose'):
             pywikibot.output(u'No changes necessary: no ref tags found.')
         return False
     else:
         if self.getOption('verbose'):
             pywikibot.output(u'Found ref without references.')
         return True
Example no. 9
 def treat_page(self):
     """Process one page."""
     self.check_disabled()
     target = self.current_page.getCategoryRedirectTarget()
     seen = {self.current_page, target}
     while target.isCategoryRedirect():
         target = target.getCategoryRedirectTarget()
         if target in seen:
             pywikibot.error(
                 'Skipping {} due to possible circular redirect at {}.'.
                 format(self.current_page, target))
             return
         seen.add(target)
     wikicode = mwparserfromhell.parse(self.current_page.text,
                                       skip_style_tags=True)
     for tpl in wikicode.ifilter_templates():
         try:
             template = pywikibot.Page(
                 self.site,
                 removeDisabledParts(str(tpl.name), site=self.site),
                 ns=10,
             )
             template.title()
         except pywikibot.InvalidTitle:
             continue
         if template in self.templates:
             tpl.add('1', target.title(with_ns=False))
             break
     self.put_current(str(wikicode), summary=self.getOption('summary'))
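The circular-redirect guard above, reduced to a toy loop over a plain dict standing in for the redirect chain (hypothetical data):

redirects = {'A': 'B', 'B': 'C', 'C': 'A'}
target = redirects['A']
seen = {'A', target}
while target in redirects:
    target = redirects[target]
    if target in seen:
        print('possible circular redirect at', target)
        break
    seen.add(target)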
Example no. 10
    def lacksReferences(self, text) -> bool:
        """Check whether or not the page is lacking a references tag."""
        oldTextCleaned = textlib.removeDisabledParts(text)
        if self.referencesR.search(oldTextCleaned) \
           or self.referencesTagR.search(oldTextCleaned):
            if self.opt.verbose:
                pywikibot.output('No changes necessary: references tag found.')
            return False

        if self.referencesTemplates:
            templateR = '{{(' + '|'.join(self.referencesTemplates) + ')'
            if re.search(templateR, oldTextCleaned, re.IGNORECASE):
                if self.opt.verbose:
                    pywikibot.output(
                        'No changes necessary: references template found.')
                return False

        if not self.refR.search(oldTextCleaned):
            if self.opt.verbose:
                pywikibot.output('No changes necessary: no ref tags found.')
            return False

        if self.opt.verbose:
            pywikibot.output('Found ref without references.')
        return True
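A standalone sketch of the same check; the two regexes below are simplified stand-ins, not the patterns the real bot compiles:

import re
from pywikibot import textlib

refR = re.compile(r'<ref\b', re.IGNORECASE)                # stand-in
referencesR = re.compile(r'<references\b', re.IGNORECASE)  # stand-in

def lacks_references(text):
    cleaned = textlib.removeDisabledParts(text)
    return bool(refR.search(cleaned)) and not referencesR.search(cleaned)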
Example no. 11
    def get_action(self, category: pywikibot.Category) -> str:
        """
        Return the discussion action.

        @param category: The category being discussed
        """
        if not self.section():
            return ''
        text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
        wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
        for section in wikicode.get_sections(levels=[4]):
            heading = section.filter_headings()[0]
            if str(heading.title).strip() == self.section():
                break
        else:
            section = None  # Trick pylint.
            return ''
        # Parse the discussion for category links and action.
        for line in str(section).splitlines():
            found = False
            line_wc = mwparserfromhell.parse(line, skip_style_tags=True)
            for node in line_wc.ifilter():
                page = self._cat_from_node(node)
                if page and category == page:
                    found = True
                    break
            matches = re.findall(r"'''Propose (.+?)'''", line)
            if found and matches:
                return matches[0]
        return ''
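An example of the kind of nomination line the "Propose" regex above is meant to match; the line itself is hypothetical:

import re

line = "* '''Propose merging''' [[:Category:Foo]] to [[:Category:Bar]]"
print(re.findall(r"'''Propose (.+?)'''", line))  # ['merging']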
Example no. 12
    def treat(self, page):
        # get all linkedPages
        # check for disambigs
        linksR = re.compile(
            r'\[\[(?P<short>[^\]]*)\]\] *\|\| *\[\[(?P<long>[^\]]*)\]\]')
        res = []
        counter = 0
        if self.opt.test:
            pywikibot.output('Treat(%s)' % page.title(as_link=True))
        for p in linksR.finditer(textlib.removeDisabledParts(page.text)):
            counter += 1
            longn = p.group('long')
            shortn = p.group('short')
            if self.opt.testlinks:
                pywikibot.output('[%s][#%i] S:%s L:%s' % (
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), counter, shortn, longn))
            rpl = pywikibot.Page(pywikibot.Site(), longn)
            rplcount = len(list(rpl.getReferences(namespaces=0)))
            if self.opt.testlinks:
                pywikibot.output('L:%s #%i In %s checking:%s - referenced by %i' %
                                 (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), counter,
                                  page.title(as_link=True), rpl.title(as_link=True), rplcount))
            rps = pywikibot.Page(pywikibot.Site(), shortn)
            rpscount = len(list(rps.getReferences(namespaces=0)))
            if self.opt.testlinks:
                pywikibot.output('S:%s #%i In %s checking:%s - referenced by %i' %
                                 (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), counter,
                                  page.title(as_link=True), rps.title(as_link=True), rpscount))

            res.append({"long": longn, "refl": rplcount, "short": shortn, "refs": rpscount})

        print(res)
        return res
Example no. 13
 def __iter__(self):
     import xmlreader
     dump = xmlreader.XmlDump(self.xmlFilename)
     for entry in dump.parse():
         text = textlib.removeDisabledParts(entry.text)
         if self.refR.search(text) and not self.referencesR.search(text):
             yield pywikibot.Page(pywikibot.Site(), entry.title)
Example no. 14
	def getWordCount(self, text):
		text = textlib.removeDisabledParts(text)
		text = textlib.removeHTMLParts(text)
		text = textlib.removeLanguageLinks(text)
		text = textlib.removeCategoryLinks(text)
		word_list = re.findall(r"[\w']+", text)

		return len(word_list)
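A quick illustration of the first cleanup step, removeDisabledParts, on made-up text (requires pywikibot):

from pywikibot import textlib

sample = "Counted words <!-- not counted --> <nowiki>not counted either</nowiki>"
print(textlib.removeDisabledParts(sample))  # roughly 'Counted words  '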
Example no. 15
 def checkUserPage(self,page):
     text = page.text
     if self.catsPresent(textlib.removeDisabledParts(text)):
         text = textlib.replaceExcept(
             text, r'\[\[kategoria', '[[:Kategoria',
             ['comment', 'pre', 'nowiki'], caseInsensitive=True)
         #text = re.sub('\[\[kategoria', '[[:Kategoria', text, flags=re.I)
         pywikibot.output(u'Kategorie usunięte')
         page.text = text
         #page.save(summary=u'Bot usuwa stronę użytkownika z kategorii', apply_cosmetic_changes=False)
     else:
         pywikibot.output(u'Strona użytkownika OK')
     return
Example no. 16
 def get_pages_with_descriptions(self, text):
     tags = {'category', 'comment', 'file', 'header', 'hyperlink',
             'interwiki', 'nowiki', 'pre', 'ref', 'source', 'timeline',
             'template'}
     text = textlib.removeDisabledParts(text, tags, site=self.site)
     data = {}
     for match in self.regex.finditer(text):
         title, desc = match.groups()
         page = pywikibot.Page(self.site, title)
         data[page] = self.parse_description(desc)
     return data
Example no. 17
def check(text, languages):
    tags = ['comments', 'nowiki', 'pre', 'source']
    text = textlib.removeDisabledParts(text, tags)
    interwikiR = re.compile(r'\[\[([a-zA-Z\-]+)\s?:([^\[\]\n]*)\]\]')
    for lang, pagetitle in interwikiR.findall(text):
        lang = lang.lower()
        # Check if it really is in fact an interwiki link to a known
        # language, or if it's e.g. a category tag or an internal link
        if lang in languages:
            return True
    return False
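A direct call of the function above with made-up text; the second argument is whatever collection of known language codes the caller maintains:

print(check('Some text.\n[[de:Beispiel]]\n[[Category:Foo]]', {'de', 'fr'}))  # True
print(check('Only [[Category:Foo]] here.', {'de', 'fr'}))                    # False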
Example no. 18
 def treat(self, page):
     match = self.current_rule.find.search(page.text)
     if not match:
         return
     text = textlib.removeDisabledParts(page.text,
                                        TypoRule.exceptions,
                                        site=self.site)
     match = self.current_rule.find.search(text)
     if match:
         text = self.pattern.format(page.title(as_link=True),
                                    match.group(0))
         pywikibot.stdout(text)
         self.data.append(text)
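Why the rule is matched twice above: a hit that disappears after removeDisabledParts lies inside one of the excepted parts (a comment, nowiki and the like), so it is not reported. A toy check with a hypothetical rule:

import re
from pywikibot import textlib

rule = re.compile('teh')
raw = 'Fixed text <!-- teh old typo -->'
print(bool(rule.search(raw)))                               # True
print(bool(rule.search(textlib.removeDisabledParts(raw))))  # False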
Example no. 19
def processArticle(page):
	text = page.get()
	text = textlib.removeDisabledParts(text)
	# pywikibot.output(u'Working on "%s"' % title)
	global codeRegexp
	global templateRegexp
	result  = re.findall(codeRegexp, text)
	template = re.findall(templateRegexp, text)
	if len(result) > 0 and len(template) == 0:
		msg = u"* [[%s]]: " % page.title()
		for res in result:
			msg += str(res)
		log(msg)
		pywikibot.output(msg)
Example no. 20
 def lacksReferences(self, text):
     """Check whether or not the page is lacking a references tag."""
     oldTextCleaned = textlib.removeDisabledParts(text)
     if self.referencesR.search(oldTextCleaned) or \
        self.referencesTagR.search(oldTextCleaned):
         return False
     elif self.referencesTemplates:
         templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
         if re.search(templateR, oldTextCleaned,
                      re.IGNORECASE | re.UNICODE):
             return False
     if not self.refR.search(oldTextCleaned):
         return False
     else:
         return True
Example no. 21
    def treat_property_and_talk(self, prop, page):
        self.current_talk_page = page
        # todo: skip sandbox properties
        # todo: removeDisabledParts now?
        code = mwparserfromhell.parse(page.text, skip_style_tags=True)
        for template in code.ifilter_templates():
            if not template.name.matches(self.template_metadata):
                continue
            params = OrderedDict()
            for param in template.params:
                params[str(param.name).strip()] = str(param.value).strip()
            break
        else:
            pywikibot.output('Template "{}" not found'.format(
                self.template_metadata))
            return

        keys = set(self.func_dict.keys()) & set(params.keys())
        # formatter URL must go before example
        if {'formatter URL', 'example'} <= keys:
            keys.remove('formatter URL')
            keys = ['formatter URL'] + list(keys)

        clear_params = []
        for key in keys:
            param = textlib.removeDisabledParts(params[key])
            if param == '-':
                continue
            if param != '':
                pywikibot.output('Found param "{}"'.format(key))
                try:
                    remove = self.func_dict[key](param)
                except pywikibot.data.api.APIError as exc:
                    remove = False
                if remove:
                    clear_params.append(key)
        if self.getOption('importonly'):
            return

        for par in clear_params:
            template.remove(par, keep_field=True)
        for par in set(params.keys()) & set(self.obsolete_params):
            template.remove(par)

        self.current_page = self.current_talk_page
        self.put_current(str(code),
                         show_diff=True,
                         summary='removing migrated/obsolete parameters')
Example no. 22
    def treat_property_and_talk(self, prop, page):
        self.current_talk_page = page
        # todo: skip sandbox properties
        # todo: removeDisabledParts now?
        code = mwparserfromhell.parse(page.text, skip_style_tags=True)
        for template in code.ifilter_templates():
            if not template.name.matches(self.template_metadata):
                continue
            params = OrderedDict()
            for param in template.params:
                params[str(param.name).strip()] = str(param.value).strip()
            break
        else:
            pywikibot.output('Template "{}" not found'.format(
                self.template_metadata))
            return

        keys = set(self.func_dict.keys()) & set(params.keys())
        # formatter URL must go before example
        if {'formatter URL', 'example'} <= keys:
            keys.remove('formatter URL')
            keys = ['formatter URL'] + list(keys)

        clear_params = []
        for key in keys:
            param = textlib.removeDisabledParts(params[key])
            if param == '-':
                continue
            if param != '':
                pywikibot.output('Found param "{}"'.format(key))
                try:
                    remove = self.func_dict[key](param)
                except pywikibot.data.api.APIError as exc:
                    remove = False
                if remove:
                    clear_params.append(key)
        if self.getOption('importonly'):
            return

        for par in clear_params:
            template.remove(par, keep_field=True)
        for par in set(params.keys()) & set(self.obsolete_params):
            template.remove(par)

        self.current_page = self.current_talk_page
        self.put_current(str(code), show_diff=True,
                         summary='removing migrated/obsolete parameters')
Example no. 23
def parse_page(page):
    """Parse a CFD working page."""
    text = removeDisabledParts(page.text, site=page.site)
    wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
    for section in wikicode.get_sections(flat=True, include_lead=False):
        heading = section.filter(forcetype=Heading)[0]
        section_title = str(heading.title).lower()
        print(section_title)
        if 'move' in section_title:
            mode = 'move'
            edit_summary = 'Moving {old_cat} to {new_cats} per {cfd}'
        elif 'empty' in section_title:
            mode = 'empty'
            edit_summary = 'Removing {old_cat} per {cfd}'
        else:
            continue
        parse_section(section, page.site, mode, edit_summary)
Example no. 24
 def parse(self) -> None:
     """Parse the page."""
     text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
     wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
     for section in wikicode.get_sections(flat=True, include_lead=False):
         heading = section.filter_headings()[0]
         section_title = str(heading.title).lower()
         for mode in self.MODES:
             if mode in section_title:
                 self.mode = mode
                 break
         else:
             continue
         try:
             self._parse_section(str(section))
         except (ValueError, pywikibot.Error):
             pywikibot.exception(tb=True)
     self._check_run()
Example no. 25
    def sectionList(self, page):
        sections = []
        sectionR = re.compile(r'(?im)^=+(?P<section>[^<]*?)(<ref.*?)?=+$')

        text = page.text

        # expand templates
        etext = page.expand_text()
        etext = textlib.removeDisabledParts(etext)

        #if self.getOption('test'):
        #    pywikibot.output(etext)
        for s in sectionR.finditer(etext):
            if self.getOption('test'):
                pywikibot.output(u'>>>%s<<<' % s.group('section').strip())
            sections.append(s.group('section').strip())

        return sections
Example no. 26
 def parse_description(self, text):
     desc = textlib.removeDisabledParts(text, [
         'comment', 'file', 'nowiki', 'template', self.FORMATTING_REGEX,
         self.REF_REGEX
     ])
     desc = LINK_REGEX.sub(self.handle_link, desc)
     desc = desc.replace('&nbsp;', ' ').strip()
     desc = re.sub(r' *\([^)]+\)$', '', desc)
     desc = desc.partition(';')[0]
     desc = re.sub(r'^.*\) [-–] +', '', desc)
     desc = re.sub(r'^\([^)]+\) +', '', desc)
     while ' ' * 2 in desc:
         desc = desc.replace(' ' * 2, ' ')
     if re.search(r'[^IVX]\.$', desc) or desc.endswith(tuple(',:')):
         desc = desc[:-1].rstrip()
     if desc.startswith(('a ', 'an ')):
         desc = desc.partition(' ')[2]
     return desc
Example no. 27
 def get_result(self) -> str:
     """Return the discussion result."""
     if not self.section():
         return ''
     text = removeDisabledParts(self.text, tags=EXCEPTIONS, site=self.site)
     wikicode = mwparserfromhell.parse(text, skip_style_tags=True)
     for section in wikicode.get_sections(levels=[4]):
         heading = section.filter_headings()[0]
         if str(heading.title).strip() == self.section():
             break
     else:
         section = None  # Trick pylint.
         return ''
     for line in str(section).splitlines():
         matches = re.findall(
             r"''The result of the discussion was:''\s+'''(.+?)'''", line)
         if matches:
             return matches[0]
     return ''
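The closing line such a discussion section typically carries, and what the regex above extracts from it; the result shown is hypothetical:

import re

line = "''The result of the discussion was:'' '''delete'''."
print(re.findall(r"''The result of the discussion was:''\s+'''(.+?)'''", line))  # ['delete']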
Example no. 28
    def replace_file_links(self, text: str) -> str:
        """
        Return text with file links replaced.

        :param text: Article text
        """
        assert self.site_config is not None
        for match in self.site_config.file_regex.finditer(
                removeDisabledParts(text)):
            try:
                current_icon = BSiconPage(self.current_page.site,
                                          match.group("filename"))
                current_icon.title()
            except (pywikibot.exceptions.Error, ValueError):
                continue
            new_icon = self.opt.bsicons_map.get(current_icon, None)
            if new_icon:
                text = text.replace(match.group("filename"),
                                    new_icon.title(with_ns=False))
                self.current_page.replacements.add(
                    Replacement(current_icon, new_icon))
        return text
Example no. 29
 def treat(self, page):
     #get all linkedPages
     # check for disambigs
     linkR = re.compile(
          r'\[\[(?P<title>.*?)(#(?P<section>.*?))?(\|(?P<label>.*?))?\]\]')
     counter = 0
     reqcounter = 0
     checkedpages = []
     for p in linkR.finditer(textlib.removeDisabledParts(page.text)):
         counter += 1
         t = p.group('title')
         if t in checkedpages or t == '':
             continue
         try:
             rp = pywikibot.Page(pywikibot.Site(), t)
             if not rp.namespace() == 0:
                 continue
             if self.getOption('testlinks'):
                 pywikibot.output(
                     u'%s #%i (%i) In %s checking:%s' %
                     (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                      counter, reqcounter, page.title(asLink=True),
                      rp.title(asLink=True)))
             if not rp.exists():
                 reqcounter += 1
                 self.addResult(page.title(), rp.title())
             checkedpages.append(t)
         except KeyboardInterrupt:
             pywikibot.output(
                 'PICKLING at %s' %
                 datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
             with open('masti/reqlinks.dat', 'wb') as f:
                 pickle.dump(self.results,
                             f,
                             protocol=config.pickle_protocol)
             return ('STOP')
         except:
             continue
     return (reqcounter)
Example no. 30
 def lacksReferences(self, text):
     """Check whether or not the page is lacking a references tag."""
     oldTextCleaned = textlib.removeDisabledParts(text)
     if self.referencesR.search(oldTextCleaned) or \
        self.referencesTagR.search(oldTextCleaned):
         if self.getOption('verbose'):
             pywikibot.output(u'No changes necessary: references tag found.')
         return False
     elif self.referencesTemplates:
         templateR = u'{{(' + u'|'.join(self.referencesTemplates) + ')'
         if re.search(templateR, oldTextCleaned, re.IGNORECASE | re.UNICODE):
             if self.getOption('verbose'):
                 pywikibot.output(
                     u'No changes necessary: references template found.')
             return False
     if not self.refR.search(oldTextCleaned):
         if self.getOption('verbose'):
             pywikibot.output(u'No changes necessary: no ref tags found.')
         return False
     else:
         if self.getOption('verbose'):
             pywikibot.output(u'Found ref without references.')
         return True
Example no. 31
    def replace_template_files(
            self, wikicode: mwparserfromhell.wikicode.Wikicode) -> None:
        """
        Replace files in templates.

        :param wikicode: Parsed wikitext
        """
        assert self.site_config is not None
        for tpl in wikicode.ifilter_templates():
            try:
                template = Page(
                    self.current_page.site,
                    removeDisabledParts(str(tpl.name)),
                    ns=self.current_page.site.namespaces.TEMPLATE,
                )
                template.title()
            except (pywikibot.exceptions.Error, ValueError):
                continue
            if template in self.site_config.routemap_templates:
                self._replace_routemap_files(tpl)
            elif template in self.site_config.railway_track_templates:
                self._replace_rt_template_files(tpl)
            else:
                self._replace_bs_template_files(tpl, template)
Example no. 32
 def treat_page(self) -> None:
     """Process one page."""
     self.check_disabled()
     try:
         errors = self.validate_svg()
     except (AssertionError, RequestException, RuntimeError):
         pywikibot.exception()
         return
     if errors:
         n_errors = len(errors)
         new_tpl = Template('Invalid SVG')
         new_tpl.add('1', n_errors)
         summary = 'W3C invalid SVG: {} error{}'.format(
             n_errors, 's' if n_errors > 1 else '')
     else:
         new_tpl = Template('Valid SVG')
         summary = 'W3C valid SVG'
     wikicode = mwparserfromhell.parse(self.current_page.text,
                                       skip_style_tags=True)
     for tpl in wikicode.ifilter_templates():
         try:
             template = pywikibot.Page(
                 self.site,
                 removeDisabledParts(str(tpl.name), site=self.site).strip(),
                 ns=10,
             )
             template.title()
         except pywikibot.InvalidTitle:
             continue
         if template in self.templates:
             wikicode.replace(tpl, new_tpl)
             break
     else:
         wikicode.insert(0, '\n')
         wikicode.insert(0, new_tpl)
     self.put_current(str(wikicode), summary=summary, minor=not errors)
Example no. 33
    def treat(self, page):
        """
        Return the page title and any Wikimedia project links found in it, or None.
        """

        if self.getOption('wikipedia'):
            resultR = re.compile(
                r'(?i)(?P<result>https?://(?P<lang>[^\.]*?)\.(?P<project>wikipedia)\.org/wiki/[^\s\|<\]\}]*)'
            )
        else:
            resultR = re.compile(
                r'(?i)(?P<result>https?://(?P<lang>[^\.]*?)\.(?P<project>wikipedia|wikisource|wiktionary|wikivoyage)\.org/wiki/[^\s\|<\]\}]*)'
            )
        # allowed file types: svg, png, jpeg, tiff, gif, xcf
        imageR = re.compile(r'(?i).*\.(svg|png|jpeg|jpg|tiff|tif|gif|xcf)$')

        source = textlib.removeDisabledParts(page.text)

        #return all found results
        resultslist = []
        found = False
        for r in re.finditer(resultR, source):
            if self.getOption('test'):
                pywikibot.output(u'R:%s' % r.group('result'))
            img = imageR.search(r.group('result'))
            if not img:
                resultslist.append({
                    'link': r.group('result'),
                    'lang': r.group('lang'),
                    'project': r.group('project')
                })
                found = True
        if found:
            return ({'page': page.title(), 'links': resultslist})
        else:
            return (None)
Example no. 34
    def find_and_replace(self, text, init):
        new_params = []
        old_params = []
        unknown_params = []
        removed_params = []
        changed = False
        for template, fielddict in textlib.extract_templates_and_params(
                text, remove_disabled_parts=False, strip=False):
            if self.normalize(template) not in (self.template,
                                                self.new_template):
                continue

            changed = self.normalize(template) != self.new_template
            start_match = re.search(r'\{\{\s*((%s)\s*:\s*)?%s\s*' % (
                '|'.join(self.site.namespaces[10]), re.escape(template)), text)
            if not start_match:
                if not init:
                    pywikibot.error("Couldn't find the template")
                return text, 0

            start = start_match.start()
            if len(fielddict) > 0:
                end = text.index('|', start)
            else:
                end = text.index('}}', start)

            unnamed = {}
            for name, value in chain(fielddict.items(), IterUnnamed(unnamed)):
                end += len('|%s=%s' % (name, value))

                name = name.strip()
                value = (value
                         .replace('\n<!-- Zastaralé parametry -->', '')
                         .replace('\n<!-- Neznámé parametry -->', '')
                         .strip())

                try:
                    new_name = self.handle_param(name)
                except OldParamException:
                    if textlib.removeDisabledParts(value, ['comments']).strip():
                        old_params.append(
                            (name, value)
                        )
                except RemoveParamException:
                    changed = True
                    if textlib.removeDisabledParts(value, ['comments']).strip():
                        removed_params.append(
                            (name, value)
                        )
                except UnknownParamException:
                    if textlib.removeDisabledParts(value, ['comments']).strip():
                        unknown_params.append(
                            (name, value)
                        )
                except AssertionError:
                    pywikibot.error('Couldn\'t handle parameter "%s"' % name)
                    return text, 0
                except UnnamedParamException:
                    unnamed[value] = ''
                else:
                    new_params.append(
                        (new_name, value)
                    )
                    if new_name != name:
                        changed = True

            end += len('}}')

            while text[start:end].count('{{') < text[start:end].count('}}'):
                end = text[:end].rindex('}}') + len('}}')

            if text[start:end].count('{{') > text[start:end].count('}}'):
                balance = 1
                end = start
                while balance > 0:
                    next_close = text.index('}}', end)
                    balance += text[end:next_close].count('{{') - 1
                    end = next_close + len('}}')

            if not text[start:end].endswith('}}'):  # elif?
                end = text[:end].rindex('}}') + len('}}')

            if (end < start or not text[start:end].endswith('}}') or
                    text[start:end].count('{{') != text[start:end].count('}}')):
                pywikibot.error("Couldn't parse the template")
                return text, 0
            break

        else:
            pywikibot.error("Couldn't parse the template")
            return text, 0

        if not changed:
            pywikibot.output('No parameters changed')
            return text, 0

        while end < len(text) and text[end].isspace():  # todo: also before
            end += 1

        lines = []
        nested = 0
        for line in text[start:end].splitlines():
            if nested == 1 and re.match(r' *\|', line):
                lines.append(line)
            nested += line.count('{{') - line.count('}}')

        space_before = ''
        if len(lines) > 0 and choice(lines).startswith(' '):
            space_before = ' '

        self.handle_params(new_params, old_params, removed_params, unknown_params)
        self.deduplicate(new_params)
        new_params.sort(key=self.key_for_sort)

        new_template = '{{%s' % self.new_template
        if len(new_params) > 0:
            new_template += '\n'
            for param, value in new_params:
                new_template += '%s| %s = %s\n' % (space_before, param, value)

        if len(old_params) > 0:
            new_template += '<!-- Zastaralé parametry -->\n'
            for param, value in old_params:
                new_template += '%s| %s = %s\n' % (space_before, param, value)

        if len(unknown_params) > 0:
            new_template += '<!-- Neznámé parametry -->\n'
            for param, value in unknown_params:
                new_template += '%s| %s = %s\n' % (space_before, param, value)

        new_template += '}}\n'

        return text[:start] + new_template + text[end:], end
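The brace-balancing idea used above, reduced to a toy: keep extending end until the slice contains as many '{{' as '}}'; the template text is made up:

text = '{{Infobox|a={{small|x}}|b=y}} trailing text'
start = text.index('{{')
end = text.index('}}', start) + len('}}')
while text[start:end].count('{{') > text[start:end].count('}}'):
    end = text.index('}}', end) + len('}}')
print(text[start:end])  # '{{Infobox|a={{small|x}}|b=y}}'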
Example no. 35
    def run(self):
        """Run the Bot."""
        try:
            deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
        except IOError:
            raise NotImplementedError(
                '404-links.txt is required for reflinks.py\n'
                'You need to download\n'
                'http://www.twoevils.org/files/wikipedia/404-links.txt.gz\n'
                'and to unzip it in the same directory')

        editedpages = 0
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                new_text = page.get()
                if not page.canBeEdited():
                    pywikibot.output(u"You can't edit page %s"
                                     % page.title(asLink=True))
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found' % page.title(asLink=True))
                continue
            except pywikibot.IsRedirectPage:
                pywikibot.output(u'Page %s is a redirect'
                                 % page.title(asLink=True))
                continue

            # for each link to change
            for match in linksInRef.finditer(
                    textlib.removeDisabledParts(page.get())):

                link = match.group(u'url')
                # debugging purpose
                # print link
                if u'jstor.org' in link:
                    # TODO: Clean URL blacklist
                    continue

                ref = RefLink(link, match.group('name'))
                f = None

                try:
                    f = comms.http.fetch(
                        ref.url, use_fake_user_agent=self._use_fake_user_agent)

                    # Try to get Content-Type from server
                    contentType = f.response_headers.get('content-type')
                    if contentType and not self.MIME.search(contentType):
                        if ref.link.lower().endswith('.pdf') and \
                           not self.getOption('ignorepdf'):
                            # If file has a PDF suffix
                            self.getPDFTitle(ref, f)
                        else:
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'media : {0} ', ref.link))
                        if ref.title:
                            if not re.match(
                                    u'(?i) *microsoft (word|excel|visio)',
                                    ref.title):
                                ref.transform(ispdf=True)
                                repl = ref.refTitle()
                            else:
                                pywikibot.output(color_format(
                                    '{lightyellow}WARNING{default} : '
                                    'PDF title blacklisted : {0} ', ref.title))
                                repl = ref.refLink()
                        else:
                            repl = ref.refLink()
                        new_text = new_text.replace(match.group(), repl)
                        continue

                    # Get the real url where we end (http redirects !)
                    redir = f.data.url
                    if redir != ref.link and \
                       domain.findall(redir) == domain.findall(link):
                        if soft404.search(redir) and \
                           not soft404.search(ref.link):
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'Redirect 404 : {0} ', ref.link))
                            continue
                        if dirIndex.match(redir) and \
                           not dirIndex.match(ref.link):
                            pywikibot.output(color_format(
                                u'{lightyellow}WARNING{default} : '
                                u'Redirect to root : {0} ', ref.link))
                            continue

                    if f.status != requests.codes.ok:
                        pywikibot.output(u'HTTP error (%s) for %s on %s'
                                         % (f.status, ref.url,
                                            page.title(asLink=True)),
                                         toStdout=True)
                        # 410 Gone, indicates that the resource has been purposely
                        # removed
                        if f.status == 410 or \
                           (f.status == 404 and (u'\t%s\t' % ref.url in deadLinks)):
                            repl = ref.refDead()
                            new_text = new_text.replace(match.group(), repl)
                        continue

                    linkedpagetext = f.raw
                except UnicodeError:
                    # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                    # in [[fr:Cyanure]]
                    pywikibot.output(color_format(
                        '{lightred}Bad link{default} : {0} in {1}',
                        ref.url, page.title(asLink=True)))
                    continue
                except (URLError,
                        socket.error,
                        IOError,
                        httplib.error) as e:
                    pywikibot.output(u'Can\'t retrieve page %s : %s'
                                     % (ref.url, e))
                    continue

                # remove <script>/<style>/comments/CDATA tags
                linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)

                meta_content = self.META_CONTENT.search(linkedpagetext)
                enc = []
                s = None
                if contentType:
                    # use charset from http header
                    s = self.CHARSET.search(contentType)
                if meta_content:
                    tag = meta_content.group()
                    # Prefer the contentType from the HTTP header :
                    if not contentType:
                        contentType = tag
                    if not s:
                        # use charset from html
                        s = self.CHARSET.search(str(tag))
                if s:
                    tmp = s.group('enc').strip("\"' ").lower()
                    naked = re.sub(r'[ _\-]', '', tmp)
                    # Convert to python correct encoding names
                    if naked == "gb2312":
                        enc.append("gbk")
                    elif naked == "shiftjis":
                        enc.append("shift jis 2004")
                        enc.append("cp932")
                    elif naked == "xeucjp":
                        enc.append("euc-jp")
                    else:
                        enc.append(tmp)
                else:
                    pywikibot.output(u'No charset found for %s' % ref.link)
                if not contentType:
                    pywikibot.output(u'No content-type found for %s' % ref.link)
                    continue
                elif not self.MIME.search(contentType):
                    pywikibot.output(color_format(
                        '{lightyellow}WARNING{default} : media : {0} ',
                        ref.link))
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Ugly hacks to try to survive when both server and page
                # return no encoding.
                # Uses most used encodings for each national suffix
                if u'.ru' in ref.link or u'.su' in ref.link:
                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                    # encoding, no page encoding
                    enc = enc + ['koi8-r', 'windows-1251']
                elif u'.jp' in ref.link:
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif u'.kr' in ref.link:
                    enc.append("euc-kr")
                    enc.append("cp949")
                elif u'.zh' in ref.link:
                    enc.append("gbk")

                if 'utf-8' not in enc:
                    enc.append('utf-8')
                try:
                    u = linkedpagetext.decode(enc[0])   # Bug T69410
                except (UnicodeDecodeError, LookupError) as e:
                    pywikibot.output(u'%s : Decoding error - %s' % (ref.link, e))
                    continue

                # Retrieves the first non empty string inside <title> tags
                for m in self.TITLE.finditer(u):
                    t = m.group()
                    if t:
                        ref.title = t
                        ref.transform()
                        if ref.title:
                            break

                if not ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : No title found...' % ref.link)
                    continue

                # XXX Ugly hack
                if u'é' in ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                    continue

                if self.titleBlackList.match(ref.title):
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(color_format(
                        '{lightred}WARNING{default} {0} : '
                        'Blacklisted title ({1})', ref.link, ref.title))
                    continue

                # Truncate long titles. 175 is arbitrary
                if len(ref.title) > 175:
                    ref.title = ref.title[:175] + "..."

                repl = ref.refTitle()
                new_text = new_text.replace(match.group(), repl)

            # Add <references/> when needed, but ignore templates !
            if page.namespace != 10:
                if self.norefbot.lacksReferences(new_text):
                    new_text = self.norefbot.addReferences(new_text)

            new_text = self.deduplicator.process(new_text)
            old_text = page.text

            self.userPut(page, old_text, new_text, summary=self.msg,
                         ignore_save_related_errors=True,
                         ignore_server_errors=True)

            if new_text == old_text:
                continue
            else:
                editedpages += 1

            if self.getOption('limit') and editedpages >= self.getOption('limit'):
                pywikibot.output('Edited %s pages, stopping.' % self.getOption('limit'))
                return

            if self.site_stop_page and editedpages % 20 == 0:
                self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
                if self.stop_page.exists():
                    pywikibot.output(color_format(
                        '{lightgreen}Checking stop page...{default}'))
                    actual_rev = self.stop_page.latest_revision_id
                    if actual_rev != self.stop_page_rev_id:
                        pywikibot.output(
                            '%s has been edited : Someone wants us to stop.'
                            % self.stop_page.title(asLink=True))
                        return
Example no. 36
def add_text(page,
             addText,
             summary=None,
             regexSkip=None,
             regexSkipUrl=None,
             always=False,
             up=False,
             putText=True,
             oldTextGiven=None,
             reorderEnabled=True,
             create=False):
    """
    Add text to a page.

    @rtype: tuple of (text, newtext, always)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    # When a page is tagged as "really well written" it has a star in the
    # interwiki links. This is a list of all the templates used (in regex
    # format) to make the stars appear.

    errorCount = 0

    if putText:
        pywikibot.output(u'Loading %s...' % page.title())
    if oldTextGiven is None:
        try:
            text = page.get()
        except pywikibot.NoPage:
            if create:
                pywikibot.output(u"%s doesn't exist, creating it!" %
                                 page.title())
                text = u''
            else:
                pywikibot.output(u"%s doesn't exist, skip!" % page.title())
                return (False, False, always)
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"%s is a redirect, skip!" % page.title())
            return (False, False, always)
    else:
        text = oldTextGiven
    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -exceptUrl '
                             'is in the page. Skip!\n'
                             'Match was: %s' % result)
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output('Exception! regex (or word) used with -except '
                             'is in the page. Skip!\n'
                             'Match was: %s' % result)
            return (False, False, always)
    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', config.line_separator)
        if reorderEnabled:
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)

            # Adding the text
            newtext += u"%s%s" % (config.line_separator, addText)
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext, categoriesInside,
                                                   site, True)
            # Dealing the stars' issue
            allstars = []
            starstext = textlib.removeDisabledParts(text)
            for star in starsList:
                regex = re.compile(
                    r'(\{\{(?:template:|)%s\|.*?\}\}[\s]*)' % star, re.I)
                found = regex.findall(starstext)
                if found != []:
                    newtext = regex.sub('', newtext)
                    allstars += found
            if allstars != []:
                newtext = newtext.strip() + config.line_separator * 2
                allstars.sort()
                for element in allstars:
                    newtext += '%s%s' % (element.strip(), config.LS)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += u"%s%s" % (config.line_separator, addText)
    else:
        newtext = addText + config.line_separator + text
    if putText and text != newtext:
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" %
                         page.title())
        pywikibot.showDiff(text, newtext)
    # Let's put the changes.
    while True:
        # If someone load it as module, maybe it's not so useful to put the
        # text in the page
        if putText:
            if not always:
                choice = pywikibot.input_choice(
                    u'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')],
                    'n',
                    automatic_quit=False)
                if choice == 'a':
                    always = True
                elif choice == 'n':
                    return (False, False, always)
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            if always or choice == 'y':
                try:
                    if always:
                        page.put(newtext,
                                 summary,
                                 minorEdit=page.namespace() != 3)
                    else:
                        page.put_async(newtext,
                                       summary,
                                       minorEdit=page.namespace() != 3)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Edit conflict! skip!')
                    return (False, False, always)
                except pywikibot.ServerError:
                    errorCount += 1
                    if errorCount < config.max_retries:
                        pywikibot.output(u'Server Error! Wait..')
                        time.sleep(config.retry_wait)
                        continue
                    else:
                        raise pywikibot.ServerError(u'Fifth Server Error!')
                except pywikibot.SpamfilterError as e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s' %
                        (page.title(), e.url))
                    return (False, False, always)
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)' %
                                     page.title())
                    return (False, False, always)
                except pywikibot.PageNotSaved as error:
                    pywikibot.output(u'Error putting page: %s' % error.args)
                    return (False, False, always)
                else:
                    # Break only if the errors are one after the other...
                    errorCount = 0
                    return (True, True, always)
        else:
            return (text, newtext, always)
Example no. 37
def processRE(param, rx):
    cleaned_text = textlib.removeDisabledParts(unicode(param.value.strip()))
    relist = re.findall(rx, cleaned_text)
    return relist
Esempio n. 38
0
    def remove_cats_and_comments(self, text):
        """Remove categories, comments and trailing spaces from wikitext."""
        text = textlib.removeCategoryLinks(text, site=self.site)
        text = textlib.removeDisabledParts(text, tags=['comments'])
        return text.strip()
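A minimal standalone sketch of the same two calls outside of a class; the sample text and the default Site() are illustrative assumptions:

import pywikibot
from pywikibot import textlib

site = pywikibot.Site()
sample = 'Some text. [[Category:Example]] <!-- hidden note -->  '
# Same steps as the method above: drop category links, drop comments,
# then strip the trailing whitespace that is left behind.
sample = textlib.removeCategoryLinks(sample, site=site)
sample = textlib.removeDisabledParts(sample, tags=['comments'])
pywikibot.output(sample.strip())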
Esempio n. 39
0
    def run(self):
        """Run the Bot."""
        try:
            deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
        except IOError:
            pywikibot.output(
                'You need to download '
                'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
                'and to ungzip it in the same directory')
            raise
        socket.setdefaulttimeout(30)
        editedpages = 0
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                new_text = page.get()
                if not page.canBeEdited():
                    pywikibot.output(u"You can't edit page %s"
                                     % page.title(asLink=True))
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found' % page.title(asLink=True))
                continue
            except pywikibot.IsRedirectPage:
                pywikibot.output(u'Page %s is a redirect'
                                 % page.title(asLink=True))
                continue

            # for each link to change
            for match in linksInRef.finditer(
                    textlib.removeDisabledParts(page.get())):

                link = match.group(u'url')
                # debugging purpose
                # print link
                if u'jstor.org' in link:
                    # TODO: Clean URL blacklist
                    continue

                ref = RefLink(link, match.group('name'))
                f = None
                try:
                    socket.setdefaulttimeout(20)
                    try:
                        f = urlopen(ref.url.decode("utf8"))
                    except UnicodeError:
                        ref.url = quote(ref.url.encode("utf8"), "://")
                        f = urlopen(ref.url)
                    # Try to get Content-Type from server
                    headers = f.info()
                    contentType = headers.getheader('Content-Type')
                    if contentType and not self.MIME.search(contentType):
                        if ref.link.lower().endswith('.pdf') and \
                           not self.getOption('ignorepdf'):
                            # If file has a PDF suffix
                            self.getPDFTitle(ref, f)
                        else:
                            pywikibot.output(
                                u'\03{lightyellow}WARNING\03{default} : '
                                u'media : %s ' % ref.link)
                        if ref.title:
                            if not re.match(
                                    u'(?i) *microsoft (word|excel|visio)',
                                    ref.title):
                                ref.transform(ispdf=True)
                                repl = ref.refTitle()
                            else:
                                pywikibot.output(
                                    u'\03{lightyellow}WARNING\03{default} : '
                                    u'PDF title blacklisted : %s ' % ref.title)
                                repl = ref.refLink()
                        else:
                            repl = ref.refLink()
                        new_text = new_text.replace(match.group(), repl)
                        continue
                    # Get the real url where we end (http redirects !)
                    redir = f.geturl()
                    if redir != ref.link and \
                       domain.findall(redir) == domain.findall(link):
                        if soft404.search(redir) and \
                           not soft404.search(ref.link):
                            pywikibot.output(
                                u'\03{lightyellow}WARNING\03{default} : '
                                u'Redirect 404 : %s ' % ref.link)
                            continue
                        if dirIndex.match(redir) and \
                           not dirIndex.match(ref.link):
                            pywikibot.output(
                                u'\03{lightyellow}WARNING\03{default} : '
                                u'Redirect to root : %s ' % ref.link)
                            continue

                    # uncompress if necessary
                    if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
                        # XXX: small issue here: the whole page is downloaded
                        # through f.read(). It might fetch big files/pages.
                        # However, truncating an encoded gzipped stream is not
                        # an option, or unzipping will fail.
                        compressed = io.BytesIO(f.read())
                        f = gzip.GzipFile(fileobj=compressed)

                    # Read the first 1,000,000 bytes (0.95 MB)
                    linkedpagetext = f.read(1000000)
                    socket.setdefaulttimeout(None)

                except UnicodeError:
                    # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                    # in [[fr:Cyanure]]
                    pywikibot.output(
                        u'\03{lightred}Bad link\03{default} : %s in %s'
                        % (ref.url, page.title(asLink=True)))
                    continue
                except HTTPError as e:
                    pywikibot.output(u'HTTP error (%s) for %s on %s'
                                     % (e.code, ref.url,
                                        page.title(asLink=True)),
                                     toStdout=True)
                    # 410 Gone, indicates that the resource has been purposely
                    # removed
                    if e.code == 410 or \
                       (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
                        repl = ref.refDead()
                        new_text = new_text.replace(match.group(), repl)
                    continue
                except (URLError,
                        socket.error,
                        IOError,
                        httplib.error) as e:
                    pywikibot.output(u'Can\'t retrieve page %s : %s'
                                     % (ref.url, e))
                    continue
                except ValueError:
                    # Known bug of httplib, google for :
                    # "httplib raises ValueError reading chunked content"
                    continue
                finally:
                    if f:
                        f.close()

                # remove <script>/<style>/comments/CDATA tags
                linkedpagetext = self.NON_HTML.sub('', linkedpagetext)

                meta_content = self.META_CONTENT.search(linkedpagetext)
                enc = []
                s = None
                if contentType:
                    # use charset from http header
                    s = self.CHARSET.search(contentType)
                if meta_content:
                    tag = meta_content.group()
                    # Prefer the contentType from the HTTP header :
                    if not contentType:
                        contentType = tag
                    if not s:
                        # use charset from html
                        s = self.CHARSET.search(tag)
                if s:
                    tmp = s.group('enc').strip("\"' ").lower()
                    naked = re.sub(r'[ _\-]', '', tmp)
                    # Convert to python correct encoding names
                    if naked == "gb2312":
                        enc.append("gbk")
                    elif naked == "shiftjis":
                        enc.append("shift jis 2004")
                        enc.append("cp932")
                    elif naked == "xeucjp":
                        enc.append("euc-jp")
                    else:
                        enc.append(tmp)
                else:
                    pywikibot.output(u'No charset found for %s' % ref.link)
                if not contentType:
                    pywikibot.output(u'No content-type found for %s' % ref.link)
                    continue
                elif not self.MIME.search(contentType):
                    pywikibot.output(
                        u'\03{lightyellow}WARNING\03{default} : media : %s '
                        % ref.link)
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Ugly hacks to try to survive when both server and page
                # return no encoding.
                # Uses most used encodings for each national suffix
                if u'.ru' in ref.link or u'.su' in ref.link:
                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                    # encoding, no page encoding
                    enc = enc + ['koi8-r', 'windows-1251']
                elif u'.jp' in ref.link:
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif u'.kr' in ref.link:
                    enc.append("euc-kr")
                    enc.append("cp949")
                elif u'.zh' in ref.link:
                    enc.append("gbk")

                if 'utf-8' not in enc:
                    enc.append('utf-8')
                try:
                    u = linkedpagetext.decode(enc[0])   # Bug 67410
                except (UnicodeDecodeError, LookupError) as e:
                    pywikibot.output(u'%s : Decoding error - %s' % (ref.link, e))
                    continue

                # Retrieves the first non empty string inside <title> tags
                for m in self.TITLE.finditer(u):
                    t = m.group()
                    if t:
                        ref.title = t
                        ref.transform()
                        if ref.title:
                            break

                if not ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : No title found...' % ref.link)
                    continue

                # XXX Ugly hack
                if u'Ã©' in ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                    continue

                if self.titleBlackList.match(ref.title):
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'\03{lightred}WARNING\03{default} %s : '
                                     u'Blacklisted title (%s)'
                                     % (ref.link, ref.title))
                    continue

                # Truncate long titles. 175 is arbitrary
                if len(ref.title) > 175:
                    ref.title = ref.title[:175] + "..."

                repl = ref.refTitle()
                new_text = new_text.replace(match.group(), repl)

            # Add <references/> when needed, but ignore templates !
            if page.namespace() != 10:
                if self.norefbot.lacksReferences(new_text):
                    new_text = self.norefbot.addReferences(new_text)

            new_text = self.deduplicator.process(new_text)

            self.userPut(page, page.text, new_text, comment=self.msg,
                         ignore_save_related_errors=True,
                         ignore_server_errors=True)

            if new_text == page.text:
                continue
            else:
                editedpages += 1

            if self.getOption('limit') and editedpages >= self.getOption('limit'):
                pywikibot.output('Edited %s pages, stopping.' % self.getOption('limit'))
                return

            if editedpages % 20 == 0:
                pywikibot.output(
                    '\03{lightgreen}Checking stop page...\03{default}')
                actualRev = self.stopPage.latest_revision_id
                if actualRev != self.stopPageRevId:
                    pywikibot.output(
                        u'[[%s]] has been edited : Someone wants us to stop.'
                        % self.stopPage)
                    return
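The run() above guesses candidate encodings from the link's national suffix and then decodes with the first candidate. A standalone sketch of the same fallback idea (hypothetical helpers, not part of the bot; unlike the snippet, this tries every candidate in turn):

def guess_encodings(link, header_charset=None):
    """Collect candidate encodings, mirroring the suffix heuristic above."""
    candidates = []
    if header_charset:
        candidates.append(header_charset)
    if '.ru' in link or '.su' in link:
        candidates += ['koi8-r', 'windows-1251']
    elif '.jp' in link:
        candidates += ['shift jis 2004', 'cp932']
    elif '.kr' in link:
        candidates += ['euc-kr', 'cp949']
    elif '.zh' in link:
        candidates.append('gbk')
    if 'utf-8' not in candidates:
        candidates.append('utf-8')
    return candidates


def decode_page(raw_bytes, link, header_charset=None):
    """Try each candidate encoding; return text or None if all of them fail."""
    for enc in guess_encodings(link, header_charset):
        try:
            return raw_bytes.decode(enc)
        except (UnicodeDecodeError, LookupError):
            continue
    return None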
Esempio n. 40
0
    def run(self):
        """Run the Bot."""
        try:
            deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
        except IOError:
            pywikibot.output(
                'You need to download '
                'http://www.twoevils.org/files/wikipedia/404-links.txt.gz '
                'and to ungzip it in the same directory')
            raise

        editedpages = 0
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                new_text = page.get()
                if not page.canBeEdited():
                    pywikibot.output(u"You can't edit page %s"
                                     % page.title(asLink=True))
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found' % page.title(asLink=True))
                continue
            except pywikibot.IsRedirectPage:
                pywikibot.output(u'Page %s is a redirect'
                                 % page.title(asLink=True))
                continue

            # for each link to change
            for match in linksInRef.finditer(
                    textlib.removeDisabledParts(page.get())):

                link = match.group(u'url')
                # debugging purpose
                # print link
                if u'jstor.org' in link:
                    # TODO: Clean URL blacklist
                    continue

                ref = RefLink(link, match.group('name'))
                f = None

                try:
                    f = comms.http.fetch(
                        ref.url, use_fake_user_agent=self._use_fake_user_agent)

                    # Try to get Content-Type from server
                    contentType = f.response_headers.get('content-type')
                    if contentType and not self.MIME.search(contentType):
                        if ref.link.lower().endswith('.pdf') and \
                           not self.getOption('ignorepdf'):
                            # If file has a PDF suffix
                            self.getPDFTitle(ref, f)
                        else:
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'media : {0} ', ref.link))
                        if ref.title:
                            if not re.match(
                                    u'(?i) *microsoft (word|excel|visio)',
                                    ref.title):
                                ref.transform(ispdf=True)
                                repl = ref.refTitle()
                            else:
                                pywikibot.output(color_format(
                                    '{lightyellow}WARNING{default} : '
                                    'PDF title blacklisted : {0} ', ref.title))
                                repl = ref.refLink()
                        else:
                            repl = ref.refLink()
                        new_text = new_text.replace(match.group(), repl)
                        continue

                    # Get the real url where we end (http redirects !)
                    redir = f.data.url
                    if redir != ref.link and \
                       domain.findall(redir) == domain.findall(link):
                        if soft404.search(redir) and \
                           not soft404.search(ref.link):
                            pywikibot.output(color_format(
                                '{lightyellow}WARNING{default} : '
                                'Redirect 404 : {0} ', ref.link))
                            continue
                        if dirIndex.match(redir) and \
                           not dirIndex.match(ref.link):
                            pywikibot.output(color_format(
                                u'{lightyellow}WARNING{default} : '
                                u'Redirect to root : {0} ', ref.link))
                            continue

                    if f.status != requests.codes.ok:
                        pywikibot.output(u'HTTP error (%s) for %s on %s'
                                         % (f.status, ref.url,
                                            page.title(asLink=True)),
                                         toStdout=True)
                        # 410 Gone, indicates that the resource has been purposely
                        # removed
                        if f.status == 410 or \
                           (f.status == 404 and (u'\t%s\t' % ref.url in deadLinks)):
                            repl = ref.refDead()
                            new_text = new_text.replace(match.group(), repl)
                        continue

                    linkedpagetext = f.content
                except UnicodeError:
                    # example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html
                    # in [[fr:Cyanure]]
                    pywikibot.output(color_format(
                        '{lightred}Bad link{default} : {0} in {1}',
                        ref.url, page.title(asLink=True)))
                    continue
                except (URLError,
                        socket.error,
                        IOError,
                        httplib.error) as e:
                    pywikibot.output(u'Can\'t retrieve page %s : %s'
                                     % (ref.url, e))
                    continue

                # remove <script>/<style>/comments/CDATA tags
                linkedpagetext = self.NON_HTML.sub(b'', linkedpagetext)

                meta_content = self.META_CONTENT.search(linkedpagetext)
                enc = []
                s = None
                if contentType:
                    # use charset from http header
                    s = self.CHARSET.search(contentType)
                if meta_content:
                    tag = meta_content.group()
                    # Prefer the contentType from the HTTP header :
                    if not contentType:
                        contentType = tag
                    if not s:
                        # use charset from html
                        s = self.CHARSET.search(str(tag))
                if s:
                    tmp = s.group('enc').strip("\"' ").lower()
                    naked = re.sub(r'[ _\-]', '', tmp)
                    # Convert to python correct encoding names
                    if naked == "gb2312":
                        enc.append("gbk")
                    elif naked == "shiftjis":
                        enc.append("shift jis 2004")
                        enc.append("cp932")
                    elif naked == "xeucjp":
                        enc.append("euc-jp")
                    else:
                        enc.append(tmp)
                else:
                    pywikibot.output(u'No charset found for %s' % ref.link)
                if not contentType:
                    pywikibot.output(u'No content-type found for %s' % ref.link)
                    continue
                elif not self.MIME.search(contentType):
                    pywikibot.output(color_format(
                        '{lightyellow}WARNING{default} : media : {0} ',
                        ref.link))
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    continue

                # Ugly hacks to try to survive when both server and page
                # return no encoding.
                # Uses most used encodings for each national suffix
                if u'.ru' in ref.link or u'.su' in ref.link:
                    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
                    # encoding, no page encoding
                    enc = enc + ['koi8-r', 'windows-1251']
                elif u'.jp' in ref.link:
                    enc.append("shift jis 2004")
                    enc.append("cp932")
                elif u'.kr' in ref.link:
                    enc.append("euc-kr")
                    enc.append("cp949")
                elif u'.zh' in ref.link:
                    enc.append("gbk")

                if 'utf-8' not in enc:
                    enc.append('utf-8')
                try:
                    u = linkedpagetext.decode(enc[0])   # Bug T69410
                except (UnicodeDecodeError, LookupError) as e:
                    pywikibot.output(u'%s : Decoding error - %s' % (ref.link, e))
                    continue

                # Retrieves the first non empty string inside <title> tags
                for m in self.TITLE.finditer(u):
                    t = m.group()
                    if t:
                        ref.title = t
                        ref.transform()
                        if ref.title:
                            break

                if not ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : No title found...' % ref.link)
                    continue

                # XXX Ugly hack
                if u'Ã©' in ref.title:
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(u'%s : Hybrid encoding...' % ref.link)
                    continue

                if self.titleBlackList.match(ref.title):
                    repl = ref.refLink()
                    new_text = new_text.replace(match.group(), repl)
                    pywikibot.output(color_format(
                        '{lightred}WARNING{default} {0} : '
                        'Blacklisted title ({1})', ref.link, ref.title))
                    continue

                # Truncate long titles. 175 is arbitrary
                if len(ref.title) > 175:
                    ref.title = ref.title[:175] + "..."

                repl = ref.refTitle()
                new_text = new_text.replace(match.group(), repl)

            # Add <references/> when needed, but ignore templates !
            if page.namespace() != 10:
                if self.norefbot.lacksReferences(new_text):
                    new_text = self.norefbot.addReferences(new_text)

            new_text = self.deduplicator.process(new_text)

            self.userPut(page, page.text, new_text, summary=self.msg,
                         ignore_save_related_errors=True,
                         ignore_server_errors=True)

            if new_text == page.text:
                continue
            else:
                editedpages += 1

            if self.getOption('limit') and editedpages >= self.getOption('limit'):
                pywikibot.output('Edited %s pages, stopping.' % self.getOption('limit'))
                return

            if editedpages % 20 == 0:
                pywikibot.output(color_format(
                    '{lightgreen}Checking stop page...{default}'))
                actualRev = self.stopPage.latest_revision_id
                if actualRev != self.stopPageRevId:
                    pywikibot.output(
                        u'[[%s]] has been edited : Someone wants us to stop.'
                        % self.stopPage)
                    return
Esempio n. 41
0
def _match_xml_page_text(text):
    """Match page text."""
    text = textlib.removeDisabledParts(text)
    return _ref_regex.search(text) and not _references_regex.search(text)
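The helper relies on two module-level regexes that the snippet does not show; the definitions below are assumptions modeled on typical noreferences-style patterns, included only so the sketch runs (it also assumes the helper and textlib live in the same module):

import re

_ref_regex = re.compile('</ref>', re.IGNORECASE)
_references_regex = re.compile('<references.*?/>', re.IGNORECASE)

sample = 'A claim.<ref>a source</ref>\n<!-- <references /> -->'
# The HTML comment is stripped by removeDisabledParts(), so the page still
# lacks a real <references /> tag and the helper reports a match.
print(bool(_match_xml_page_text(sample)))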
Esempio n. 42
0
def linkedImages(page):
    """Return a list of Pages that this Page links to.

    Only returns pages reached through "normal" internal links, and keeps
    only those that are image (file) pages. Category links are omitted
    unless prefixed with ":". Embedded templates are omitted (but links
    within them are returned). All interwiki and external links are omitted.

    @param page: the Page whose wikitext is scanned for image links
    @return: a list of Page objects.
    """

    Rlink = re.compile(r'\[\[(?P<title>[^\]\|\[]*)(\|[^\]]*)?\]\]')
    result = []
    try:
        thistxt = textlib.removeLanguageLinks(page.get(get_redirect=True),
                                              page.site)
    except pywikibot.NoPage:
        raise
    except pywikibot.IsRedirectPage:
        raise
    except pywikibot.SectionError:
        return []
    thistxt = textlib.removeCategoryLinks(thistxt, page.site)

    # remove HTML comments, pre, nowiki, and includeonly sections
    # from text before processing
    thistxt = textlib.removeDisabledParts(thistxt)

    # resolve {{ns:-1}} or {{ns:Help}}
    # thistxt = page.site.resolvemagicwords(thistxt)

    for match in Rlink.finditer(thistxt):
        try:
            #print(match.group(0))
            title = match.group('title')
            title = title.replace("_", " ").strip(" ")
            # print title
            if title == "":
                # empty link - problem in the page
                continue
            # convert relative link to absolute link
            if title.startswith(".."):
                parts = page.title().split('/')
                parts.pop()
                title = '/'.join(parts) + title[2:]
            elif title.startswith("/"):
                title = '%s/%s' % (page.title(), title[1:])
            if title.startswith("#"):
                # this is an internal section link
                continue
            if not page.site.isInterwikiLink(title):
                page2 = pywikibot.Page(page.site, title)
                try:
                    hash(str(page2))
                except Exception:
                    pywikibot.output("Page %s contains invalid link to [[%s]]."
                                     % (page.title(), title))
                    continue
                if not page2.isImage():
                    continue
                if page2.title(withSection=False) and page2 not in result:
                    result.append(page2)
        except pywikibot.NoUsername:
            continue
        except:
            raise
    return result
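A minimal usage sketch of linkedImages() above; the site code and page title are placeholders and a configured user-config is assumed:

import pywikibot

site = pywikibot.Site('en', 'wikipedia')
page = pywikibot.Page(site, 'Example')
# linkedImages() yields only the file pages linked from the article.
for image in linkedImages(page):
    pywikibot.output(image.title())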
Esempio n. 43
0
    def remove_cats_and_comments(self, text):
        """Remove categories, comments and trailing spaces from wikitext."""
        text = textlib.removeCategoryLinks(text, site=self.site)
        text = textlib.removeDisabledParts(text, tags=['comments'])
        return text.strip()
Esempio n. 44
0
def _match_xml_page_text(text):
    """Match page text."""
    text = textlib.removeDisabledParts(text)
    return _ref_regex.search(text) and not _references_regex.search(text)
Esempio n. 45
0
def add_text(page, addText, summary=None, regexSkip=None,
             regexSkipUrl=None, always=False, up=False, putText=True,
             oldTextGiven=None, reorderEnabled=True, create=False):
    """
    Add text to a page.

    @rtype: tuple of (text, newtext, always)
    """
    site = page.site
    if not summary:
        summary = i18n.twtranslate(site, 'add_text-adding',
                                   {'adding': addText[:200]})

    # When a page is tagged as "really well written" it has a star in the
    # interwiki links. This is a list of all the templates used (in regex
    # format) to make the stars appear.

    errorCount = 0

    if putText:
        pywikibot.output(u'Loading %s...' % page.title())
    if oldTextGiven is None:
        try:
            text = page.get()
        except pywikibot.NoPage:
            if create:
                pywikibot.output(u"%s doesn't exist, creating it!"
                                 % page.title())
                text = u''
            else:
                pywikibot.output(u"%s doesn't exist, skip!" % page.title())
                return (False, False, always)
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"%s is a redirect, skip!" % page.title())
            return (False, False, always)
    else:
        text = oldTextGiven
    # Understand if the bot has to skip the page or not
    # In this way you can use both -except and -excepturl
    if regexSkipUrl is not None:
        url = page.full_url()
        result = re.findall(regexSkipUrl, site.getUrl(url))
        if result != []:
            pywikibot.output(
                'Exception! regex (or word) used with -exceptUrl '
                'is in the page. Skip!\n'
                'Match was: %s' % result)
            return (False, False, always)
    if regexSkip is not None:
        result = re.findall(regexSkip, text)
        if result != []:
            pywikibot.output(
                'Exception! regex (or word) used with -except '
                'is in the page. Skip!\n'
                'Match was: %s' % result)
            return (False, False, always)
    # If not up, text put below
    if not up:
        newtext = text
        # Translating the \\n into binary \n
        addText = addText.replace('\\n', config.line_separator)
        if (reorderEnabled):
            # Getting the categories
            categoriesInside = textlib.getCategoryLinks(newtext, site)
            # Deleting the categories
            newtext = textlib.removeCategoryLinks(newtext, site)
            # Getting the interwiki
            interwikiInside = textlib.getLanguageLinks(newtext, site)
            # Removing the interwiki
            newtext = textlib.removeLanguageLinks(newtext, site)

            # Adding the text
            newtext += u"%s%s" % (config.line_separator, addText)
            # Reputting the categories
            newtext = textlib.replaceCategoryLinks(newtext,
                                                   categoriesInside, site,
                                                   True)
            # Dealing with the stars' issue
            allstars = []
            starstext = textlib.removeDisabledParts(text)
            for star in starsList:
                regex = re.compile(r'(\{\{(?:template:|)%s\|.*?\}\}[\s]*)'
                                   % star, re.I)
                found = regex.findall(starstext)
                if found != []:
                    newtext = regex.sub('', newtext)
                    allstars += found
            if allstars != []:
                newtext = newtext.strip() + config.line_separator * 2
                allstars.sort()
                for element in allstars:
                    newtext += '%s%s' % (element.strip(), config.LS)
            # Adding the interwiki
            newtext = textlib.replaceLanguageLinks(newtext, interwikiInside,
                                                   site)
        else:
            newtext += u"%s%s" % (config.line_separator, addText)
    else:
        newtext = addText + config.line_separator + text
    if putText and text != newtext:
        pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                         % page.title())
        pywikibot.showDiff(text, newtext)
    # Let's put the changes.
    while True:
        # If someone loads this as a module, it may not be useful to put the
        # text in the page
        if putText:
            if not always:
                choice = pywikibot.input_choice(
                    u'Do you want to accept these changes?',
                    [('Yes', 'y'), ('No', 'n'), ('All', 'a'),
                     ('open in Browser', 'b')], 'n', automatic_quit=False)
                if choice == 'a':
                    always = True
                elif choice == 'n':
                    return (False, False, always)
                elif choice == 'b':
                    pywikibot.bot.open_webbrowser(page)
            if always or choice == 'y':
                try:
                    if always:
                        page.put(newtext, summary,
                                 minorEdit=page.namespace() != 3)
                    else:
                        page.put_async(newtext, summary,
                                       minorEdit=page.namespace() != 3)
                except pywikibot.EditConflict:
                    pywikibot.output(u'Edit conflict! skip!')
                    return (False, False, always)
                except pywikibot.ServerError:
                    errorCount += 1
                    if errorCount < config.max_retries:
                        pywikibot.output(u'Server Error! Wait..')
                        time.sleep(config.retry_wait)
                        continue
                    else:
                        raise pywikibot.ServerError(u'Fifth Server Error!')
                except pywikibot.SpamfilterError as e:
                    pywikibot.output(
                        u'Cannot change %s because of blacklist entry %s'
                        % (page.title(), e.url))
                    return (False, False, always)
                except pywikibot.LockedPage:
                    pywikibot.output(u'Skipping %s (locked page)'
                                     % page.title())
                    return (False, False, always)
                except pywikibot.PageNotSaved as error:
                    pywikibot.output(u'Error putting page: %s' % error.args)
                    return (False, False, always)
                else:
                    # Break only if the errors are one after the other...
                    errorCount = 0
                    return (True, True, always)
        else:
            return (text, newtext, always)
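A minimal usage sketch of add_text() above. It passes reorderEnabled=False because the reorder branch depends on a module-level starsList that the snippet does not define, and putText=False so the function returns (text, newtext, always) instead of saving; the page title and added text are placeholders:

import pywikibot

site = pywikibot.Site()
page = pywikibot.Page(site, 'Project:Sandbox')
old, new, always = add_text(page, '{{subst:test}}',
                            summary='Bot: adding a test template',
                            putText=False, reorderEnabled=False)
# The caller decides what to do with the proposed change.
if new and old != new:
    pywikibot.showDiff(old, new)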
Esempio n. 46
0
    def standardizePageFooter(self, text):
        """
        Standardize page footer.

        Makes sure that interwiki links, categories and star templates are
        put into the correct position and order. This combines the old
        instances of standardizeInterwiki and standardizeCategories.
        The page footer has the following sections in that sequence:
        1. categories
        2. ## TODO: template beyond categories ##
        3. additional information depending on local site policy
        4. stars templates for featured and good articles
        5. interwiki links

        """
        starsList = [
            u'bueno',
            u'bom interwiki',
            u'cyswllt[ _]erthygl[ _]ddethol', u'dolen[ _]ed',
            u'destacado', u'destaca[tu]',
            u'enllaç[ _]ad',
            u'enllaz[ _]ad',
            u'leam[ _]vdc',
            u'legătură[ _]a[bcf]',
            u'liamm[ _]pub',
            u'lien[ _]adq',
            u'lien[ _]ba',
            u'liên[ _]kết[ _]bài[ _]chất[ _]lượng[ _]tốt',
            u'liên[ _]kết[ _]chọn[ _]lọc',
            u'ligam[ _]adq',
            u'ligazón[ _]a[bd]',
            u'ligoelstara',
            u'ligoleginda',
            u'link[ _][afgu]a', u'link[ _]adq', u'link[ _]f[lm]', u'link[ _]km',
            u'link[ _]sm', u'linkfa',
            u'na[ _]lotura',
            u'nasc[ _]ar',
            u'tengill[ _][úg]g',
            u'ua',
            u'yüm yg',
            u'רא',
            u'وصلة مقالة جيدة',
            u'وصلة مقالة مختارة',
        ]

        categories = None
        interwikiLinks = None
        allstars = []

        # The PyWikipediaBot is no longer allowed to touch categories on the
        # German Wikipedia. See
        # https://de.wikipedia.org/wiki/Hilfe_Diskussion:Personendaten/Archiv/1#Position_der_Personendaten_am_.22Artikelende.22
        # ignoring nn-wiki because of the comment line above the iw section
        if not self.template and '{{Personendaten' not in text and \
           '{{SORTIERUNG' not in text and '{{DEFAULTSORT' not in text and \
           self.site.code not in ('et', 'it', 'bg', 'ru'):
            categories = textlib.getCategoryLinks(text, site=self.site)

        if not self.talkpage:  # and pywikibot.calledModuleName() <> 'interwiki':
            subpage = False
            if self.template:
                loc = None
                try:
                    tmpl, loc = moved_links[self.site.code]
                    del tmpl
                except KeyError:
                    pass
                if loc is not None and loc in self.title:
                    subpage = True
            interwikiLinks = textlib.getLanguageLinks(
                text, insite=self.site, template_subpage=subpage)

            # Removing the interwiki
            text = textlib.removeLanguageLinks(text, site=self.site)
            # Removing the stars' issue
            starstext = textlib.removeDisabledParts(text)
            for star in starsList:
                regex = re.compile(r'(\{\{(?:template:|)%s\|.*?\}\}[\s]*)'
                                   % star, re.I)
                found = regex.findall(starstext)
                if found != []:
                    text = regex.sub('', text)
                    allstars += found

        # Adding categories
        if categories:
            # TODO: Sorting categories in alphabetic order.
            # e.g. using categories.sort()

            # TODO: Taking main cats to top
            #   for name in categories:
            #       if re.search(u"(.+?)\|(.{,1}?)",name.title()) or name.title()==name.title().split(":")[0]+title:
            #            categories.remove(name)
            #            categories.insert(0, name)
            text = textlib.replaceCategoryLinks(text, categories,
                                                site=self.site)
        # Adding stars templates
        if allstars:
            text = text.strip() + self.site.family.interwiki_text_separator
            allstars.sort()
            for element in allstars:
                text += '%s%s' % (element.strip(), config.line_separator)
                pywikibot.log(u'%s' % element.strip())
        # Adding the interwiki
        if interwikiLinks:
            text = textlib.replaceLanguageLinks(text, interwikiLinks,
                                                site=self.site,
                                                template=self.template,
                                                template_subpage=subpage)
        return text
    def find_and_replace(self, text, init):
        new_params = []
        old_params = []
        unknown_params = []
        removed_params = []
        changed = False
        for template, fielddict in textlib.extract_templates_and_params(
                text, remove_disabled_parts=False, strip=False):
            if self.normalize(template) not in (self.template,
                                                self.new_template):
                continue

            changed = self.normalize(template) != self.new_template
            start_match = re.search(r'\{\{\s*((%s)\s*:\s*)?%s\s*' % (
                '|'.join(self.site.namespaces[10]), re.escape(template)), text)
            if not start_match:
                if not init:
                    pywikibot.error("Couldn't find the template")
                return text, 0

            start = start_match.start()
            if len(fielddict) > 0:
                end = text.index('|', start)
            else:
                end = text.index('}}', start)

            unnamed = {}
            for name, value in chain(fielddict.items(), IterUnnamed(unnamed)):
                end += len('|%s=%s' % (name, value))

                name = name.strip()
                value = (value
                         .replace('\n<!-- Zastaralé parametry -->', '')
                         .replace('\n<!-- Neznámé parametry -->', '')
                         .strip())

                try:
                    new_name = self.handle_param(name)
                except OldParamException:
                    if textlib.removeDisabledParts(value, ['comments']).strip():
                        old_params.append(
                            (name, value)
                        )
                except RemoveParamException:
                    changed = True
                    if textlib.removeDisabledParts(value, ['comments']).strip():
                        removed_params.append(
                            (name, value)
                        )
                except UnknownParamException:
                    if textlib.removeDisabledParts(value, ['comments']).strip():
                        unknown_params.append(
                            (name, value)
                        )
                except AssertionError:
                    pywikibot.error('Couldn\'t handle parameter "%s"' % name)
                    return text, 0
                except UnnamedParamException:
                    unnamed[value] = ''
                else:
                    new_params.append(
                        (new_name, value)
                    )
                    if new_name != name:
                        changed = True

            end += len('}}')

            while text[start:end].count('{{') < text[start:end].count('}}'):
                end = text[:end].rindex('}}') + len('}}')

            if text[start:end].count('{{') > text[start:end].count('}}'):
                balance = 1
                end = start
                while balance > 0:
                    next_close = text.index('}}', end)
                    balance += text[end:next_close].count('{{') - 1
                    end = next_close + len('}}')

            if not text[start:end].endswith('}}'): # elif?
                end = text[:end].rindex('}}') + len('}}')

            if (end < start or not text[start:end].endswith('}}') or
                    text[start:end].count('{{') != text[start:end].count('}}')):
                pywikibot.error("Couldn't parse the template")
                return text, 0
            break

        else:
            pywikibot.error("Couldn't parse the template")
            return text, 0

        if not changed:
            pywikibot.output('No parameters changed')
            return text, 0

        while end < len(text) and text[end].isspace(): # todo: also before
            end += 1

        lines = []
        nested = 0
        for line in text[start:end].splitlines():
            if nested == 1 and re.match(r' *\|', line):
                lines.append(line)
            nested += line.count('{{') - line.count('}}')

        space_before = ''
        if len(lines) > 0 and choice(lines).startswith(' '):
            space_before = ' '

        self.handle_params(new_params, old_params, removed_params, unknown_params)
        self.deduplicate(new_params)
        new_params.sort(key=self.key_for_sort)

        new_template = '{{%s' % self.new_template
        if len(new_params) > 0:
            new_template += '\n'
            for param, value in new_params:
                new_template += '%s| %s = %s\n' % (space_before, param, value)

        if len(old_params) > 0:
            new_template += '<!-- Zastaralé parametry -->\n'
            for param, value in old_params:
                new_template += '%s| %s = %s\n' % (space_before, param, value)

        if len(unknown_params) > 0:
            new_template += '<!-- Neznámé parametry -->\n'
            for param, value in unknown_params:
                new_template += '%s| %s = %s\n' % (space_before, param, value)

        new_template += '}}\n'

        return text[:start] + new_template + text[end:], end
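The end-of-template search above balances '{{' against '}}' in several passes; a standalone single-scan sketch of the same idea (hypothetical helper, not part of the snippet):

def find_template_end(text, start):
    """Return the index just past the '}}' that closes the template at start."""
    assert text.startswith('{{', start)
    depth = 0
    i = start
    while i < len(text) - 1:
        if text[i:i + 2] == '{{':
            depth += 1
            i += 2
        elif text[i:i + 2] == '}}':
            depth -= 1
            i += 2
            if depth == 0:
                return i
        else:
            i += 1
    raise ValueError('Unbalanced template starting at %d' % start)


sample = 'x {{Infobox|name={{small|y}}|other=z}} tail'
print(sample[2:find_template_end(sample, 2)])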