def _wikipedia_Page_getEditPage(self,
                                get_redirect=False,
                                throttle=True,
                                sysop=False,
                                oldid=None,
                                nofollow_redirects=False):
    """ Gets the source of a wiki page through WikiProxy
        TODO: finish (use permalink things in localhost/~daniel/WikiSense/WikiProxy.php
    """
    isWatched = False  # mja, hoe gaan we dat checken?
    editRestriction = None  # toolserver.getEditRestriction etc.

    if not oldid:
        oldid = self.latestRevision()

    data = {
        'wiki': self.site().hostname(),
        'title': self.sectionFreeTitle(),
        'rev': oldid
    }

    if wikipedia.verbose:
        wikipedia.output(
            u'Getting revision %(rev)i of page %(title)s from %(wiki)s' % data)
    path = 'http://localhost/~daniel/WikiSense/WikiProxy.php'

    f = urllib.urlopen(path, urllib.urlencode(data))
    if (throttle and not ('x-wikiproxy' in f.headers
                          and f.headers['x-wikiproxy'] == 'hit')):
        wikipedia.get_throttle()

    return (f.read(), False, None)
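For orientation, a minimal usage sketch of the proxy-backed fetcher above. How the function gets bound onto wikipedia.Page is only implied by its name, so the sketch simply passes a Page object as `self`; the site, page title, and the assumption that the compat core is importable as `wikipedia` are all placeholders.

import wikipedia  # old pywikibot "compat" core, assumed importable

site = wikipedia.getSite(code=u'en', fam=u'wikipedia')
page = wikipedia.Page(site, u'Python (programming language)')
# Call the fetcher directly, passing the Page as `self`; it returns the same
# (text, False, None) tuple built at the end of the function above.
text, _, _ = _wikipedia_Page_getEditPage(page)
print text[:200]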
Example No. 2
    def exportPage(self, page):
        """Export all revisions of the given page via Special:Export,
        saving each XML batch to the wpdumps/ directory."""
        response = None
        data = None
        wp = wikipedia.getSite(code=u'en', fam=u'wikipedia')
        address = wp.export_address()
        title = page.sectionFreeTitle().encode(wp.encoding())
        predata = {
            'action': 'submit',
            'pages': title,
            'offset': '1',
        }
        # Keep requesting batches until Special:Export returns no more revisions.
        while True:
            wikipedia.get_throttle()
            wikipedia.output(
                '\03{lightpurple}>>\03{default} \03{lightaqua}Exporting revisions.\03{default}'
            )
            # Now make the actual request to the server
            now = time()
            if wp.hostname() in config.authenticate.keys():
                predata["Content-type"] = "application/x-www-form-urlencoded"
                predata["User-agent"] = wikipedia.useragent
                data = wp.urlEncode(predata)
                response = urllib2.urlopen(
                    urllib2.Request(
                        wp.protocol() + '://' + wp.hostname() + address, data))
                data = response.read()
            else:
                response, data = wp.postForm(address, predata)
            data = data.encode(wp.encoding())
            wikipedia.get_throttle.setDelay(time() - now)

            doc = minidom.parseString(data)
            revs = doc.getElementsByTagName('revision')
            revCount = len(revs)
            if revCount > 0:
                lastRev = revs[len(revs) -
                               1].getElementsByTagName('timestamp')[0]
                timestamp = ''
                for nodes in lastRev.childNodes:
                    if nodes.nodeType == Node.TEXT_NODE:
                        timestamp += nodes.data
                wikipedia.output(
                    '\03{lightpurple}>>\03{default} \03{lightaqua}Got %s revisions up to %s.\03{default}'
                    % (revCount, timestamp))
                fileName = 'wpdumps/%s-%s.xml' % (title.replace(
                    '/', '-'), predata['offset'].replace(':', '-'))
                wikipedia.output(
                    '\03{lightpurple}>>\03{default} \03{lightblue}Saving to %s.\03{default}'
                    % fileName)
                f = open(fileName, 'w')
                f.write(data)
                f.close()
                predata['offset'] = timestamp
            else:
                wikipedia.output(
                    '\03{lightpurple}>>\03{default} \03{lightaqua}Returned no revisions, exporting for this page is complete.\03{default}'
                )
                break
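The heart of exportPage is its continuation scheme: keep posting to Special:Export and feed the last revision's <timestamp> back as the next offset until an empty batch comes back. A stripped-down, framework-free sketch of that loop follows; the endpoint URL and page title are placeholders, not taken from the example above.

import urllib
import urllib2
from xml.dom import minidom, Node

def export_all_revisions(title, url='https://en.wikipedia.org/wiki/Special:Export'):
    offset = '1'
    while True:
        body = urllib.urlencode({'action': 'submit', 'pages': title, 'offset': offset})
        xml = urllib2.urlopen(urllib2.Request(url, body)).read()
        revs = minidom.parseString(xml).getElementsByTagName('revision')
        if not revs:
            return                                  # no more revisions
        stamp = revs[-1].getElementsByTagName('timestamp')[0].childNodes
        offset = ''.join(n.data for n in stamp if n.nodeType == Node.TEXT_NODE)
        yield xml, offset                           # one XML batch per iteration

Each yielded batch could then be written to disk the same way the example saves its wpdumps/*.xml files.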
Example No. 5
    def _oldParseCategory(self, purge=False, startFrom=None):
        """Yields all articles and subcategories that are in this category.

        Set purge to True to instruct MediaWiki not to serve a cached version.

        Set startFrom to a string which is the title of the page to start from.

        Yielded results are tuples in the form (tag, page) where tag is one
        of the constants ARTICLE and SUBCATEGORY, and page is the Page or
        Category object.

        Note that results of this method need not be unique.

        This should not be used outside of this module.

        """
        if self.site().versionnumber() < 4:
            Rtitle = re.compile('title\s?=\s?"([^"]*)"')
        elif self.site().versionnumber() < 8:
            # FIXME seems to parse all links
            Rtitle = re.compile('/\S*(?: title\s?=\s?)?"([^"]*)"')
        else:
            Rtitle = re.compile('<li>(?:<span.*?>)?<a href=".*?"\s?title\s?=\s?"' '([^"]*)"\>\+?[^\<\+]')
        if self.site().versionnumber() < 8:
            Rsubcat = None
            Rimage = None
        else:
            Rsubcat = re.compile('CategoryTreeLabelCategory"\s?href=".+?">(.+?)</a>')
            Rimage = re.compile(
                '<div class\s?=\s?"thumb"\sstyle="[^"]*">'
                '(?:<div style="[^"]*">)?<a href=".*?"'
                '(?:\sclass="image")?\stitle\s?=\s?"([^"]*)"'
            )
        # regular expression matching the "(next 200)" link
        RLinkToNextPage = re.compile('&amp;from=(.*?)" title="')

        if startFrom:
            currentPageOffset = urllib.quote(startFrom.encode(self.site().encoding()))
        else:
            currentPageOffset = None
        while True:
            path = self.site().get_address(self.urlname())
            if purge:
                path += "&action=purge"
            if currentPageOffset:
                path += "&from=" + currentPageOffset
                pywikibot.output(
                    "Getting [[%s]] starting at %s..."
                    % (self.title(), pywikibot.url2link(currentPageOffset, self.site(), self.site()))
                )
            else:
                pywikibot.output("Getting [[%s]]..." % self.title())
            pywikibot.get_throttle()
            txt = self.site().getUrl(path)
            # index where subcategory listing begins
            if self.site().versionnumber() >= 9:
                # These IDs were introduced in 1.9
                if '<div id="mw-subcategories">' in txt:
                    ibegin = txt.index('<div id="mw-subcategories">')
                elif '<div id="mw-pages">' in txt:
                    ibegin = txt.index('<div id="mw-pages">')
                elif '<div id="mw-category-media">' in txt:
                    ibegin = txt.index('<div id="mw-category-media">')
                else:
                    # No pages
                    return
            else:
                # does not work for cats without text
                ibegin = txt.index("<!-- start content -->")
                # TODO: this also parses the category's own text and may
                # mistake links in it for pages in the category! Check for
                # versions before 1.9.

            # index where article listing ends
            if '<div class="printfooter">' in txt:
                iend = txt.index('<div class="printfooter">')
            elif '<div class="catlinks">' in txt:
                iend = txt.index('<div class="catlinks">')
            else:
                iend = txt.index("<!-- end content -->")
            txt = txt[ibegin:iend]
            for title in Rtitle.findall(txt):
                if title == self.title():
                    # This is only a link to "previous 200" or "next 200".
                    # Ignore it.
                    pass
                # For MediaWiki versions where subcats look like articles
                elif isCatTitle(title, self.site()):
                    yield SUBCATEGORY, Category(self.site(), title)
                else:
                    yield ARTICLE, pywikibot.Page(self.site(), title)
            if Rsubcat:
                # For MediaWiki versions where subcats look differently
                for titleWithoutNamespace in Rsubcat.findall(txt):
                    title = "Category:%s" % titleWithoutNamespace
                    yield SUBCATEGORY, Category(self.site(), title)
            if Rimage:
                # For MediaWiki versions where images work through galleries
                for title in Rimage.findall(txt):
                    # In some MediaWiki versions, the titles contain the
                    # namespace, but they don't in other (newer) versions. Use
                    # the ImagePage's defaultNamespace feature to get everything
                    # correctly.
                    yield ARTICLE, pywikibot.ImagePage(self.site(), title)
            # try to find a link to the next list page
            matchObj = RLinkToNextPage.search(txt)
            if matchObj:
                currentPageOffset = matchObj.group(1)
            else:
                break
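A hedged usage sketch for the screen-scraping generator above, assuming (as in the old compat framework) that it is a method of catlib.Category; the category name is only an example.

import wikipedia
import catlib

site = wikipedia.getSite(code=u'en', fam=u'wikipedia')
cat = catlib.Category(site, u'Category:Physics')
for tag, member in cat._oldParseCategory():
    # tag is ARTICLE or SUBCATEGORY; member is a Page, ImagePage or Category
    print tag, member.title()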
Example No. 6
    def _parseCategory(self, purge=False, startFrom=None, sortby=None, sortdir=None, endsort=None):
        """
        Yields all articles and subcategories that are in this category by API.

        Set startFrom to a string which is the title of the page to start from.

        Yielded results are tuples in the form (tag, page) where tag is one
        of the constants ARTICLE and SUBCATEGORY, and page is the Page or
        Category object.

        Note that results of this method need not be unique.

        This should not be used outside of this module.

        """
        if not self.site().has_api() or self.site().versionnumber() < 11:
            for tag, page in self._oldParseCategory(purge, startFrom):
                yield tag, page
            return

        currentPageOffset = None
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": self.title(),
            "cmprop": ["title", "ids", "sortkey", "timestamp"],
            #'': '',
        }
        if self.site().versionnumber() > 16:
            params["cmprop"].append("sortkeyprefix")
        if sortby:
            params["cmsort"] = sortby
        if sortdir:
            params["cmdir"] = sortdir
        while True:
            if pywikibot.config.special_page_limit > 500:
                params["cmlimit"] = 500
            else:
                params["cmlimit"] = pywikibot.config.special_page_limit

            if currentPageOffset:
                params.update(currentPageOffset)
                pywikibot.output(
                    "Getting [[%s]] list from %s..." % (self.title(), "%s=%s" % currentPageOffset.popitem())
                )
            else:
                msg = "Getting [[%s]] list" % self.title()
                # category sort keys are uppercase
                if startFrom:
                    startFrom = startFrom.upper()
                    params["cmstartsortkey"] = startFrom
                    msg += " starting at %s" % startFrom
                if endsort:
                    endsort = endsort.upper()
                    params["cmendsortkey"] = endsort
                    msg += " ending at %s" % endsort
                pywikibot.output(msg + u"...")

            pywikibot.get_throttle()
            data = query.GetData(params, self.site())
            if "error" in data:
                raise RuntimeError("%s" % data["error"])
            count = 0

            for memb in data["query"]["categorymembers"]:
                count += 1
                # For MediaWiki versions where subcats look like articles
                if memb["ns"] == 14:
                    if "sortkeyprefix" in memb:
                        sortKeyPrefix = memb["sortkeyprefix"]
                    else:
                        sortKeyPrefix = None
                    yield SUBCATEGORY, Category(
                        self.site(), memb["title"], sortKey=memb["sortkey"], sortKeyPrefix=sortKeyPrefix
                    )
                elif memb["ns"] == 6:
                    yield ARTICLE, pywikibot.ImagePage(self.site(), memb["title"])
                else:
                    page = pywikibot.Page(self.site(), memb["title"], defaultNamespace=memb["ns"])
                    if "sortkeyprefix" in memb:
                        page.sortkeyprefix = memb["sortkeyprefix"]
                    else:
                        page.sortkeyprefix = None
                    yield ARTICLE, page
                if count >= params["cmlimit"]:
                    break
            # try to find a link to the next list page
            if "query-continue" in data and count < params["cmlimit"]:
                currentPageOffset = data["query-continue"]["categorymembers"]
            else:
                break
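Underneath, this is a list=categorymembers query repeated with whatever the server hands back in query-continue. A standard-library-only sketch of that loop is below; the endpoint and category are placeholders, and note that newer MediaWiki releases return a 'continue' block instead of 'query-continue'.

import json
import urllib
import urllib2

def category_members(cmtitle, api='https://en.wikipedia.org/w/api.php'):
    params = {'action': 'query', 'list': 'categorymembers',
              'cmtitle': cmtitle, 'cmlimit': 500, 'format': 'json'}
    while True:
        data = json.load(urllib2.urlopen(api + '?' + urllib.urlencode(params)))
        for memb in data['query']['categorymembers']:
            yield memb['ns'], memb['title']   # ns 14 = subcategory, ns 6 = file
        if 'query-continue' in data:
            params.update(data['query-continue']['categorymembers'])
        else:
            break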
Example No. 7
def refresh(site, sysop=False, witheditsonly=True):
    """Fetch the list of local and global bot accounts for site and cache
    it as a pickle in the botlists data directory."""
    #if not site.has_api() or site.versionnumber() < 10:
    #    _refreshOld(site)

    # make sure we are logged in (with the sysop account when requested)
    if not site.loggedInAs(sysop=sysop):
        site.forceLogin(sysop=sysop)

    params = {
        'action': 'query',
        'list': 'allusers',
        'augroup': 'bot',
    }
    if witheditsonly:
        params['auwitheditsonly'] = ''

    pywikibot.output(u'Retrieving bot user list for %s via API.' % repr(site))
    botlist = []
    while True:
        pywikibot.get_throttle()
        data = pywikibot.query.GetData(params, site, sysop=sysop)
        if 'error' in data:
            raise RuntimeError('ERROR: %s' % data)
        botlist.extend([w['name'] for w in data['query']['allusers']])

        if 'query-continue' in data:
            params.update(data['query-continue']['allusers'])
        else:
            break

    pywikibot.output(u'Retrieving global bot user list for %s.' % repr(site))
    m1 = True
    offset = ''
    if site.live_version()[1] >= 18:
        PATTERN = u'<li><a.*?>(.*?)</.*?> *\((.*?),\s(.*?)\)(?:.*?)</li>'
    elif site.live_version()[1] == 17:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)(?:.*?)</li>'
    else:
        PATTERN = u'<li>(.*?) *\((.*?),\s(.*?)\)</li>'
    while m1:
        pywikibot.get_throttle()
        text = site.getUrl(
            site.globalusers_address(offset=urllib.quote(offset),
                                     group='Global_bot'))

        m1 = re.findall(u'<li>.*?</li>', text)
        for item in m1:
            m2 = re.search(PATTERN, item)
            (bot, flag_local, flag_global) = m2.groups()
            flag_local = (flag_local[:2] == u'<a')
            flag_global = True  # since group='Global_bot'

            if bot not in botlist:
                botlist.append(bot)

        #print len(botlist)
        offset = bot.encode(site.encoding())

    # Save the botlist to disk
    # The file is stored in the botlists subdir. Create if necessary.
    if sysop:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s-sysop.dat' % (site.family.name, site.lang)),
            'w')
    else:
        f = open(
            pywikibot.config.datafilepath(
                'botlists',
                'botlist-%s-%s.dat' % (site.family.name, site.lang)), 'w')
    pickle.dump(botlist, f)
    f.close()
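A hypothetical end-to-end use of refresh: rebuild the cached bot list for a site and read the pickle back from the same path the function writes to. The compat core is assumed to be importable as wikipedia and aliased to pywikibot, as the code above implies.

import pickle
import wikipedia as pywikibot   # compat core, aliased as in the module above

site = pywikibot.getSite(code=u'en', fam=u'wikipedia')
refresh(site)

path = pywikibot.config.datafilepath(
    'botlists', 'botlist-%s-%s.dat' % (site.family.name, site.lang))
botlist = pickle.load(open(path))
print '%d bot accounts cached' % len(botlist)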
Example No. 11
    def _parseCategory(self, purge=False, startFrom=None, sortby=None, sortdir=None):
        """
        Yields all articles and subcategories that are in this category by API.

        Set startFrom to a string which is the title of the page to start from.

        Yielded results are tuples in the form (tag, page) where tag is one
        of the constants ARTICLE and SUBCATEGORY, and page is the Page or
        Category object.

        Note that results of this method need not be unique.

        This should not be used outside of this module.
        """
        if not self.site().has_api() or self.site().versionnumber() < 11:
            for tag, page in self._oldParseCategory(purge, startFrom):
                yield tag, page
            return

        currentPageOffset = None
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': self.title(),
            'cmprop': ['title', 'ids', 'sortkey', 'timestamp'],
            #'': '',
        }
        if sortby:
            params['cmsort'] = sortby
        if sortdir:
            params['cmdir'] = sortdir
        while True:
            if wikipedia.config.special_page_limit > 500:
                params['cmlimit'] = 500
            else:
                params['cmlimit'] = wikipedia.config.special_page_limit

            if currentPageOffset:
                params.update(currentPageOffset)
                wikipedia.output('Getting [[%s]] list from %s...'
                                 % (self.title(), "%s=%s" % currentPageOffset.popitem()))
            elif startFrom:
                startFrom = startFrom.upper() # category sort keys are uppercase
                params['cmstartsortkey'] = startFrom
                wikipedia.output('Getting [[%s]] list starting at %s...'
                                 % (self.title(), startFrom))
            else:
                wikipedia.output('Getting [[%s]]...' % self.title())

            wikipedia.get_throttle()
            data = query.GetData(params, self.site())
            if 'error' in data:
                raise RuntimeError("%s" % data['error'])
            count = 0

            for memb in data['query']['categorymembers']:
                count += 1
                # For MediaWiki versions where subcats look like articles
                if memb['ns'] == 14:
                    yield SUBCATEGORY, Category(self.site(), memb['title'], sortKey=memb['sortkey'])
                elif memb['ns'] == 6:
                    yield ARTICLE, wikipedia.ImagePage(self.site(), memb['title'])
                else:
                    yield ARTICLE, wikipedia.Page(self.site(), memb['title'], defaultNamespace=memb['ns'])
                if count >= params['cmlimit']:
                    break
            # try to find a link to the next list page
            if 'query-continue' in data and count < params['cmlimit']:
                currentPageOffset = data['query-continue']['categorymembers']
            else:
                break
Example No. 12
    def _parseCategory(self, purge=False, startFrom=None):
        """
        Yields all articles and subcategories that are in this category by API.

        Set startFrom to a string which is the title of the page to start from.

        Yielded results are tuples in the form (tag, page) where tag is one
        of the constants ARTICLE and SUBCATEGORY, and page is the Page or
        Category object.

        Note that results of this method need not be unique.

        This should not be used outside of this module.
        """
        if not self.site().has_api() or self.site().versionnumber() < 11:
            for tag, page in self._oldParseCategory(purge, startFrom):
                yield tag, page
            return

        currentPageOffset = None
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': self.title(),
            'cmprop': ['title', 'ids', 'sortkey', 'timestamp'],
            #'': '',
        }
        while True:
            if wikipedia.config.special_page_limit > 500:
                params['cmlimit'] = 500
            else:
                params['cmlimit'] = wikipedia.config.special_page_limit

            if currentPageOffset:
                params['cmcontinue'] = currentPageOffset
                wikipedia.output(
                    'Getting [[%s]] list from %s...' %
                    (self.title(),
                     currentPageOffset[:-1]))  # cmcontinue last key is '|'
            elif startFrom:
                params['cmstartsortkey'] = startFrom
                wikipedia.output('Getting [[%s]] list starting at %s...' %
                                 (self.title(), startFrom))
            else:
                wikipedia.output('Getting [[%s]]...' % self.title())

            wikipedia.get_throttle()
            data = query.GetData(params, self.site())
            if 'error' in data:
                raise RuntimeError("%s" % data['error'])
            count = 0

            for memb in data['query']['categorymembers']:
                count += 1
                # For MediaWiki versions where subcats look like articles
                if memb['ns'] == 14:
                    yield SUBCATEGORY, Category(self.site(),
                                                memb['title'],
                                                sortKey=memb['sortkey'])
                elif memb['ns'] == 6:
                    yield ARTICLE, wikipedia.ImagePage(self.site(),
                                                       memb['title'])
                else:
                    yield ARTICLE, wikipedia.Page(self.site(),
                                                  memb['title'],
                                                  defaultNamespace=memb['ns'])
                if count >= params['cmlimit']:
                    break
            # try to find a link to the next list page
            if 'query-continue' in data and count < params['cmlimit']:
                currentPageOffset = data['query-continue']['categorymembers'][
                    'cmcontinue']
            else:
                break
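Finally, a hedged usage sketch for the cmcontinue-based variant above, again assuming it is a method of catlib.Category in the compat framework; the category name and start key are placeholders.

import wikipedia
import catlib

site = wikipedia.getSite(code=u'en', fam=u'wikipedia')
cat = catlib.Category(site, u'Category:Physics')
# tag is ARTICLE or SUBCATEGORY; page is a Page, ImagePage or Category object
members = [(tag, page.title()) for tag, page in cat._parseCategory(startFrom=u'A')]
print '%d members from sort key A onward' % len(members)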