Esempio n. 1
0
    def __init__(self,
                 generator,
                 acceptall=False,
                 limit=None,
                 ignorepdf=False):
        """
        Set up the bot: site, edit summary, stop page, title filter, regexes.

        - generator : Page generator
        - acceptall : boolean, whether the -always option is on
        - limit : int, stop after n modified pages
        - ignorepdf : boolean, stored unchanged on the instance
          (NOTE(review): presumably makes the bot skip PDF links; confirm
          against the code that reads self.ignorepdf)

        Raises pywikibot.NoPage if the localized stop page does not exist.

        """
        self.generator = generator
        self.acceptall = acceptall
        self.limit = limit
        self.ignorepdf = ignorepdf
        self.site = pywikibot.getSite()
        # Build the link to the bot manual. If the site language, or one of
        # its fallback languages, has an entry in localized_msg, link to that
        # language's subpage of the manual instead of the base page.
        manual = 'mw:Manual:Pywikibot/refLinks'
        code = None
        for alt in [self.site.code] + i18n._altlang(self.site.code):
            if alt in localized_msg:
                # First match wins: the site's own code is tried before the
                # alternatives returned by i18n._altlang().
                code = alt
                break
        if code:
            manual += '/%s' % code
        # NOTE: locals() hands every local name defined so far (self, the
        # constructor arguments, manual, code, alt) to twtranslate. Presumably
        # the 'reflinks-msg' text substitutes %(manual)s -- confirm before
        # renaming, adding or removing any local variable above this line.
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
        # Page whose title is the site-specific translation of stopPage.
        self.stopPage = pywikibot.Page(
            self.site, pywikibot.translate(self.site, stopPage))

        # Title blacklist: the global pattern alone, or an alternation of the
        # global pattern and the site-local one when a local one exists.
        local = pywikibot.translate(self.site, badtitles)
        if local:
            bad = '(' + globalbadtitles + '|' + local + ')'
        else:
            bad = globalbadtitles
        # Compiled case-insensitive, with '.' matching newlines and in
        # verbose mode (re.X: unescaped whitespace in the pattern is ignored).
        self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
        self.norefbot = noreferences.NoReferencesBot(None)
        self.deduplicator = DuplicateReferences()
        # Remember the current revision of the stop page. A missing stop page
        # is reported to the user and the exception is re-raised, so
        # construction fails.
        try:
            self.stopPageRevId = self.stopPage.latestRevision()
        except pywikibot.NoPage:
            pywikibot.output(u'The stop page %s does not exist' %
                             self.stopPage.title(asLink=True))
            raise

        # Regex to grasp content-type meta HTML tag in HTML source
        self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
        # Extract the encoding from a charset property (from content-type !)
        # The value is captured in the named group 'enc' and ends at the
        # first quote, ';', '>' or '/'.
        self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
        # Extract html title from page: the lookbehind/lookahead keep the
        # tags out of the match, so only the text between a literal <title>
        # and the next </title> is returned.
        self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
        # Matches whole <script> and <style> elements, HTML comments and
        # CDATA sections, delimiters included
        self.NON_HTML = re.compile(
            ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>'
        )

        # Authorized mime types for HTML pages: application/xhtml+xml,
        # application/xml, text/html and text/xml
        self.MIME = re.compile(
            ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
Esempio n. 2
0
        elif cols:
            text = text.replace(m.group(), '{{reflist|%s}}' % cols.group(2))
        else:
            text = text.replace(m.group(), '{{reflist}}')

    # If more than 30 refs, make sure the reference section is multi column
    if text.count('</ref>') > 30:
        text = re.sub(
            r'(?is)(=\s+(<!--.*?-->)*\s*)(<references />|\{\{reflist\|?3?\}\})',
            r'\1{{reflist|colwidth=30em}}', text)
    elif text.count('</ref>') < 5:
        text = re.sub(r'(?is)(=\s+)\{\{reflist\|(\d+|colwidth=\d+\w+)\}\}',
                      r'\1{{reflist}}', text)

    if noreferences:
        norefbot = noreferences.NoReferencesBot(None)
        if norefbot.lacksReferences(text, verbose=False):
            text = norefbot.addReferences(text)

    return text


def test():
    tests = ("""
see,  <ref  /> after,\t<ref  > class,
<ref />
sdf her
<ref  />
<ref  />
,  \t but would...
""", )