def __init__(self, generator, acceptall=False, limit=None, ignorepdf=False):
    """Initializer.

    - generator : Page generator supplying the pages to work on
    - acceptall : boolean, is -always on ? (stored, used elsewhere)
    - limit : int, stop after n modified pages
    - ignorepdf : boolean, skip PDF documents when True

    NOTE(review): this variant uses Python 2-only ``ur''`` string
    literals (invalid syntax under Python 3) and the deprecated
    ``pywikibot.getSite()`` API.
    """
    self.generator = generator
    self.acceptall = acceptall
    self.limit = limit
    self.ignorepdf = ignorepdf
    self.site = pywikibot.getSite()
    # Build a link to the refLinks manual page, localized when an
    # alternate language code has a translated message available.
    manual = 'mw:Manual:Pywikibot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/%s' % code
    # locals() is handed to twtranslate as the substitution parameters;
    # presumably the 'reflinks-msg' entry references %(manual)s — so the
    # local names above must not be renamed.  TODO confirm against i18n.
    self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
    self.stopPage = pywikibot.Page(
        self.site, pywikibot.translate(self.site, stopPage))
    # Title blacklist: global pattern, optionally OR-ed with a per-wiki
    # localized pattern; compiled verbose/ignore-case/dot-all.
    local = pywikibot.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None)
    self.deduplicator = DuplicateReferences()
    # Record the stop page's current revision; a missing stop page is
    # fatal (reported, then the NoPage exception is re-raised).
    try:
        self.stopPageRevId = self.stopPage.latestRevision()
    except pywikibot.NoPage:
        pywikibot.output(u'The stop page %s does not exist'
                         % self.stopPage.title(asLink=True))
        raise
    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(ur'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
    # Extract html title from page
    self.TITLE = re.compile(ur'(?is)(?<=<title>).*?(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        ur'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def __init__(self, generator, **kwargs):
    """Initializer.

    - generator : Page generator supplying the pages to work on
    - kwargs    : bot options; this variant adds 'ignorepdf', 'limit'
                  and 'summary' to availableOptions before delegating
                  to the base class.
    """
    self.availableOptions.update({
        'ignorepdf': False,  # boolean
        'limit': None,  # int, stop after n modified pages
        'summary': None,
    })
    super(ReferencesRobot, self).__init__(**kwargs)
    self.generator = generator
    self.site = pywikibot.Site()
    # Whether HTTP fetches should fake the user agent (per config).
    self._use_fake_user_agent = config.fake_user_agent_default.get(
        'reflinks', False)
    # Build a link to the refLinks manual page, localized when an
    # alternate language code has a translated message available.
    manual = 'mw:Manual:Pywikibot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/{0}'.format(code)
    # Explicit -summary wins; otherwise use the i18n message.
    # locals() is handed to twtranslate as the substitution parameters;
    # presumably it references %(manual)s — do not rename locals above.
    if self.getOption('summary') is None:
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
    else:
        self.msg = self.getOption('summary')
    # Title blacklist: global pattern, optionally OR-ed with a per-wiki
    # localized pattern; compiled verbose/ignore-case/dot-all.
    local = i18n.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
    self.deduplicator = DuplicateReferences(self.site)
    # Remember the stop page's current revision id so a later change to
    # that page can be detected; a missing stop page only warns here.
    self.site_stop_page = i18n.translate(self.site, stop_page)
    if self.site_stop_page:
        self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
        if self.stop_page.exists():
            self.stop_page_rev_id = self.stop_page.latest_revision_id
        else:
            pywikibot.warning('The stop page {0} does not exist'
                              .format(self.stop_page.title(as_link=True)))
    # Regex to grasp content-type meta HTML tag in HTML source
    # (bytes pattern: matched against the raw, undecoded response body)
    self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
    # Extract html title from page
    self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
        br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')
    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def __init__(self, generator, **kwargs):
    """Initializer.

    - generator : Page generator supplying the pages to work on
    - kwargs    : bot options; 'ignorepdf', 'limit' and 'summary' are
                  registered in availableOptions before delegating to
                  the base class.
    """
    self.availableOptions.update({
        'ignorepdf': False,  # boolean
        'limit': None,  # int, stop after n modified pages
        'summary': None,
    })
    super(ReferencesRobot, self).__init__(**kwargs)
    self.generator = generator
    self.site = pywikibot.Site()
    # Build a link to the refLinks manual page, localized when an
    # alternate language code has a translated message available.
    manual = 'mw:Manual:Pywikibot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/%s' % code
    # Explicit -summary wins; otherwise use the i18n message.
    # locals() is handed to twtranslate as the substitution parameters;
    # presumably it references %(manual)s — do not rename locals above.
    if self.getOption('summary') is None:
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
    else:
        self.msg = self.getOption('summary')
    self.stopPage = pywikibot.Page(self.site,
                                   pywikibot.translate(self.site, stopPage))
    # Title blacklist: global pattern, optionally OR-ed with a per-wiki
    # localized pattern; compiled verbose/ignore-case/dot-all.
    local = pywikibot.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
    self.deduplicator = DuplicateReferences()
    # Record the stop page's current revision; a missing stop page is
    # fatal (reported, then the NoPage exception is re-raised).
    try:
        self.stopPageRevId = self.stopPage.latestRevision()
    except pywikibot.NoPage:
        pywikibot.output(u'The stop page %s does not exist'
                         % self.stopPage.title(asLink=True))
        raise
    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(r'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
    # Extract html title from page
    self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        r'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def __init__(self, **kwargs):
    """Initializer.

    Sets the edit summary, the bad-title blacklist, the stop-page
    bookkeeping and the pre-compiled HTML-scraping regexes used later
    in the bot run.  ``self.site`` and ``self.opt`` come from the base
    class initialized via ``super()``.
    """
    super().__init__(**kwargs)

    # Whether HTTP fetches should fake the user agent (per config).
    self._use_fake_user_agent = config.fake_user_agent_default.get(
        'reflinks', False)

    # Build a link to the refLinks manual page, localized when an
    # alternate language code has a translated message available.
    # NOTE: 'manual', 'code' and 'alt' keep these exact names because
    # locals() below is handed to twtranslate for message substitution.
    manual = 'mw:Manual:Pywikibot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt not in localized_msg:
            continue
        code = alt
        break
    if code:
        manual += '/{}'.format(code)

    # An explicit -summary option wins over the i18n default message.
    self.msg = self.opt.summary or i18n.twtranslate(
        self.site, 'reflinks-msg', locals())

    # Title blacklist: global pattern, optionally OR-ed with a
    # per-wiki localized pattern; verbose/ignore-case/dot-all.
    bad = globalbadtitles
    local = i18n.translate(self.site, badtitles)
    if local:
        bad = '({}|{})'.format(globalbadtitles, local)
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)

    self.norefbot = noreferences.NoReferencesBot(verbose=False)
    self.deduplicator = DuplicateReferences(self.site)

    # Remember the stop page's current revision id so a later change
    # to that page can be detected; a missing stop page only warns.
    self.site_stop_page = i18n.translate(self.site, stop_page)
    if self.site_stop_page:
        self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
        if not self.stop_page.exists():
            pywikibot.warning('The stop page {} does not exist'.format(
                self.stop_page.title(as_link=True)))
        else:
            self.stop_page_rev_id = self.stop_page.latest_revision_id

    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
    # Extract html title from page
    self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    # Regex to grasp content-type meta HTML tag in HTML source
    # (bytes pattern: applied to the raw, undecoded response body)
    self.META_CONTENT = re.compile(
        br'(?i)<meta[^>]*(?:content\-type|charset)[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(
        r'(?i)charset\s*=\s*(?P<enc>(?P<q>[\'"]?)[^\'",;>/]*(?P=q))')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
        br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')
def __init__(self, generator, acceptall=False, limit=None, ignorepdf=False):
    """Initializer.

    - generator : Page generator supplying the pages to work on
    - acceptall : boolean, is -always on ? (stored, used elsewhere)
    - limit : int, stop after n modified pages
    - ignorepdf : boolean, skip PDF documents when True

    NOTE(review): this variant uses Python 2-only ``ur""`` string
    literals (invalid syntax under Python 3) and the deprecated
    ``pywikibot.getSite()`` API.
    """
    self.generator = generator
    self.acceptall = acceptall
    self.limit = limit
    self.ignorepdf = ignorepdf
    self.site = pywikibot.getSite()
    # Build a link to the refLinks manual page, localized when an
    # alternate language code has a translated message available.
    manual = "mw:Manual:Pywikibot/refLinks"
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += "/%s" % code
    # locals() is handed to twtranslate as the substitution parameters;
    # presumably the 'reflinks-msg' entry references %(manual)s — so the
    # local names above must not be renamed.  TODO confirm against i18n.
    self.msg = i18n.twtranslate(self.site, "reflinks-msg", locals())
    self.stopPage = pywikibot.Page(self.site,
                                   pywikibot.translate(self.site, stopPage))
    # Title blacklist: global pattern, optionally OR-ed with a per-wiki
    # localized pattern; compiled verbose/ignore-case/dot-all.
    local = pywikibot.translate(self.site, badtitles)
    if local:
        bad = "(" + globalbadtitles + "|" + local + ")"
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None)
    self.deduplicator = DuplicateReferences()
    # Record the stop page's current revision; a missing stop page is
    # fatal (reported, then the NoPage exception is re-raised).
    try:
        self.stopPageRevId = self.stopPage.latestRevision()
    except pywikibot.NoPage:
        pywikibot.output(u"The stop page %s does not exist"
                         % self.stopPage.title(asLink=True))
        raise
    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(ur"(?i)<meta[^>]*content\-type[^>]*>")
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
    # Extract html title from page
    self.TITLE = re.compile(ur"(?is)(?<=<title>).*?(?=</title>)")
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        ur"(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>")
    # Authorized mime types for HTML pages
    self.MIME = re.compile(ur"application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml")
def __init__(self, generator, **kwargs):
    """Initializer.

    - generator : Page generator supplying the pages to work on
    - kwargs    : bot options; 'ignorepdf', 'repair', 'limit' and
                  'summary' are registered in availableOptions before
                  delegating to the base class.

    NOTE(review): this fork mixes Python 2-only ``ur''`` literals with
    ``br''`` patterns, and passes ``asLink=`` to ``title()`` while also
    using new-style ``latest_revision_id`` — verify against the
    pywikibot version this fork targets.
    """
    self.availableOptions.update({
        'ignorepdf': False,  # boolean
        'repair': False,  # boolean
        'limit': None,  # int, stop after n modified pages
        'summary': None,
    })
    super(ReferencesRobot, self).__init__(**kwargs)
    self.generator = generator
    self.site = pywikibot.Site()
    # Whether HTTP fetches should fake the user agent (per config).
    self._use_fake_user_agent = config.fake_user_agent_default.get(
        'reflinks', False)
    # Build a link to the bot's manual page, localized when an
    # alternate language code has a translated message available.
    # Fork override: points at the MastiBot user page, not mw:Manual.
    #manual = 'mw:Manual:Pywikibot/refLinks'
    manual = 'Wikipedysta:MastiBot/refLinks'
    code = None
    for alt in [self.site.code] + i18n._altlang(self.site.code):
        if alt in localized_msg:
            code = alt
            break
    if code:
        manual += '/%s' % code
    # Explicit -summary wins; otherwise use the i18n message.
    # locals() is handed to twtranslate as the substitution parameters;
    # presumably it references %(manual)s — do not rename locals above.
    if self.getOption('summary') is None:
        self.msg = i18n.twtranslate(self.site, 'reflinks-msg', locals())
    else:
        self.msg = self.getOption('summary')
    # Title blacklist: global pattern, optionally OR-ed with a per-wiki
    # localized pattern; compiled verbose/ignore-case/dot-all.
    local = i18n.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None, verbose=False)
    # Deduplication disabled in this fork.
    #self.deduplicator = DuplicateReferences()
    # Remember the stop page's current revision id so a later change to
    # that page can be detected; a missing stop page only warns here.
    self.site_stop_page = i18n.translate(self.site, stop_page)
    if self.site_stop_page:
        self.stop_page = pywikibot.Page(self.site, self.site_stop_page)
        if self.stop_page.exists():
            self.stop_page_rev_id = self.stop_page.latest_revision_id
        else:
            pywikibot.warning('The stop page %s does not exist'
                              % self.stop_page.title(asLink=True))
    # Regex that match bare references.
    # In -repair mode the pattern additionally requires the marker
    # comment (Polish: "title generated by bot") inside the ref.
    if self.getOption('repair'):
        self.linksInRef = re.compile(
            ur'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)(\??[^\s<]*?)[^\]\.])(\]|\]\.)?( [^<]*?<!-- Tytuł wygenerowany przez bota -->[ \t]*\])[ \t]*<\/ref>')
    else:
        self.linksInRef = re.compile(
            ur'(?i)<ref(?P<name>[^>]*)>\.?\[?(?P<url>http[s]?:(\/\/[^:\s\?]+?)(\??[^\s<]*?)[^\]\.])(\]|\]\.)?[ \t]*<\/ref>')
    # Regex to grasp content-type meta HTML tag in HTML source
    # (bytes pattern: matched against the raw, undecoded response body)
    self.META_CONTENT = re.compile(br'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'",;>/]*)')
    # Extract html title from page.
    # Fork change: captures the title in a named group instead of a
    # pure lookbehind/lookahead match.
    #self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    self.TITLE = re.compile(
        r'(?is)(<title[^>]*?>)(?P<title>.*?)(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments
    self.NON_HTML = re.compile(
        br'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|'
        br'<!--.*?-->|<!\[CDATA\[.*?\]\]>')
    # Extract html language from page (html lang=, Content-Language
    # meta or og:locale meta; named group 'lang' holds the bare code)
    self.LANG = re.compile(
        r'(?i)(<html[^>]*?lang\s*?=\s*?|<meta\s*?HTTP-EQUIV\s*?=\s*?\"Content-Language\"\s*?CONTENT\s*?=\s*?|<meta property\s*?=\s*?\"og:locale\"\s*?content\s*?=\s*?)\"(?P<lang>.*?)[\_\-\"]')
    # Authorized mime types for HTML pages
    self.MIME = re.compile(
        r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')