Example No. 1
def recentChanges(site=None, delay=0, block=70):
    """
    Return a pagegenerator containing all the images edited in a certain
    timespan. The delay is the number of minutes to wait and the block is the
    length of the timespan, in minutes, to return images for. Should probably
    be copied to somewhere else.

    """

    result = []
    dateformat = "%Y-%m-%dT%H:%M:%SZ"
    rcstart = datetime.utcnow() + timedelta(minutes=-delay-block)
    rcend = datetime.utcnow() + timedelta(minutes=-delay)

    params = {
        'action':      'query',
        'list':        'recentchanges',
        'rcstart':     rcstart.strftime(dateformat),
        'rcend':       rcend.strftime(dateformat),
        'rcdir':       'newer',
        'rcnamespace': '6',
        'rcprop':      'title',
        'rcshow':      '!bot',
        'rclimit':     '5000',
        'rctype':      'edit|log',
    }

    data = query.GetData(params, site)
    try:
        for item in data['query']['recentchanges']:
            result.append(item['title'])
    except (IndexError, KeyError):
        raise NoPage(u'API Error, nothing found in the APIs')
    return pagegenerators.PagesFromTitlesGenerator(result, site)
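A minimal usage sketch, assuming the compat-era (pre-core) pywikibot layout this snippet is written against; the imports, the getSite() call and the 5-minute delay / 60-minute block values are illustrative assumptions, not part of the original:

from datetime import datetime, timedelta

import pagegenerators
import query
import wikipedia
from wikipedia import NoPage

if __name__ == '__main__':
    site = wikipedia.getSite()
    # Images (namespace 6) edited between 65 and 5 minutes ago, bot edits excluded.
    for page in recentChanges(site=site, delay=5, block=60):
        wikipedia.output(page.title())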
Example No. 2
 def test_PageTitleFilterPageGenerator(self):
     self._check_member(pagegenerators,
                        "PageTitleFilterPageGenerator",
                        call=True)
     gen0 = pagegenerators.PagesFromTitlesGenerator(PAGE_SET_GENERIC)
     gen1 = pagegenerators.PageTitleFilterPageGenerator(
         gen0, self.ignore_list)
     self.assertTrue(len(PAGE_SET_GENERIC) > len(list(gen1)))
Example No. 3
def uploadedYesterday(site=None):
    '''
    Return a pagegenerator containing all the pictures uploaded yesterday.
    Should probably be copied to somewhere else.
    '''
    result = []
    dateformat = "%Y-%m-%dT00:00:00Z"
    today = datetime.utcnow()
    yesterday = today + timedelta(days=-1)

    for item in site.logpages(number=5000, mode='upload',
                              start=yesterday.strftime(dateformat),
                              end=today.strftime(dateformat),
                              newer=True, dump=True):
        result.append(item['title'])
    return pagegenerators.PagesFromTitlesGenerator(result, site)
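A hedged usage sketch for the helper above; note that although site defaults to None, the function dereferences it, so a site has to be passed in (the choice of Wikimedia Commons here is only an illustration):

import wikipedia

commons = wikipedia.getSite('commons', 'commons')
for page in uploadedYesterday(site=commons):
    wikipedia.output(page.title())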
Example No. 4
class AfDBot:
    # Edit summary message that should be used.
    msg = {
        'en':
        u'New section: /* [[Wikipedia:Articles for deletion|AfD]] nomination */ Notification',
    }

    def __init__(self, AfDlog, always, debug=False):
        """
        Constructor. Parameters:
            * AfDlog        - The AfD log to be treated.
            * always        - If True, the user won't be prompted before changes
                             are made.
            * debug         - If True, don't edit pages. Only show proposed
                             edits.
        """
        self.AfDlog = AfDlog
        self.always = always
        self.debug = debug
        self.site = AfDlog.site()
        self.db = None
        self.replag = None

        locale.setlocale(locale.LC_ALL, 'nl_NL.UTF-8')
        os.environ['TZ'] = 'Europe/Amsterdam'

    def run(self):
        # Set up database access
        try:
            self.db = querier.querier(host="nlwiki.labsdb")
        except Exception, error:
            wikipedia.output(u'Could not connect to database: %s.' % error,
                             toStdout=False)

        # Dictionaries of users with page_title and AfD_title tuple.
        self.contributors = {}

        if self.db:
            # Get replag
            sql = """
                    SELECT time_to_sec(timediff(now()+0,CAST(rev_timestamp AS int))) AS replag
                    FROM nlwiki_p.revision
                    ORDER BY rev_timestamp DESC
                    LIMIT 1;"""
            result = self.db.do(sql)

            if not result:
                wikipedia.output(
                    u'Could not get replag. Assuming it\'s infinite (= 1 month).'
                )
                self.replag = 30 * 24 * 3600  # about one month, in seconds
            else:
                self.replag = int(result[0]['replag'])
                wikipedia.output(u'Replag: %is.' % self.replag)

        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), self.msg))
        try:
            # Load the page
            text = self.AfDlog.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." %
                             self.AfDlog.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." %
                             self.AfDlog.aslink())
            return

        # Find AfD's
        pageR = re.compile(r'^\*[ ]*?\[\[(?P<page>.*?)(?:\|.*?\]\]|\]\])')
        timestampR = re.compile(r'(\d{1,2}) (.{3}) (\d{4}) (\d{2}):(\d{2})')
        userR = re.compile(
            r'\[\[(?:[Uu]ser|[Gg]ebruiker):(?P<user>.*?)(?:\|.*?\]\]|\]\])')
        strictTemplateR = re.compile(
            r'\{\{(?:[Uu]ser|[Gg]ebruiker):(?P<user>.*?)\/[Hh]andtekening\}\}')
        templateR = re.compile(
            r'\{\{(?:[Uu]ser|[Gg]ebruiker):(?P<user>.*?)\/.*?\}\}')
        pages = []
        lines = text.splitlines()
        for line in lines:
            mPage = pageR.search(line)
            mTimestamp = timestampR.search(line)
            if mTimestamp:
                t = time.strftime(
                    '%Y%m%d%H%M%S',
                    time.gmtime(
                        time.mktime(
                            time.strptime(mTimestamp.group(),
                                          '%d %b %Y %H:%M'))))
            else:
                t = None
            if mPage and userR.search(line):
                pages.append(
                    (mPage.group('page'), userR.search(line).group('user'), t))
                continue
            elif mPage and strictTemplateR.search(line):
                pages.append((mPage.group('page'),
                              strictTemplateR.search(line).group('user'), t))
                continue
            elif mPage and templateR.search(line):
                pages.append((mPage.group('page'),
                              templateR.search(line).group('user'), t))
                continue
            elif mPage:
                pages.append((mPage.group('page'), None, t))
                continue
        wikipedia.output(u'Found %i AfD\'s.' % len(pages))

        # Treat AfD's
        for p in pages:
            page = wikipedia.Page(self.site, p[0])
            nominator = p[1]
            timestamp = p[2]
            page_contributors = self.getcontributors(page, timestamp)

            for contributor in page_contributors:
                if not self.contributors.has_key(contributor):
                    self.contributors[contributor] = [(page.title(), nominator)]
                else:
                    self.contributors[contributor].append(
                        (page.title(), nominator))

        # Treat users
        wikipedia.output(u'\n\nFound %i unique users.' %
                         len(self.contributors))
        pages = []  # User talk pages
        for user in self.contributors.keys():
            pages.append(u'%s:%s' % (self.site.namespace(3), user))

        gen = pagegenerators.PagesFromTitlesGenerator(pages, site=self.site)
        gen = pagegenerators.PreloadingGenerator(gen)

        for page in gen:
            self.treatUser(page)
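A hypothetical driver for the class above; the AfD log title is made up, and treatUser() and getcontributors(), which run() calls, are defined later in the original bot but are not part of this excerpt:

if __name__ == '__main__':
    site = wikipedia.getSite()
    # Made-up title of an AfD log page on the Dutch Wikipedia.
    afd_log = wikipedia.Page(site, u'Wikipedia:Te beoordelen pagina\'s/Voorbeeld')
    bot = AfDBot(afd_log, always=False, debug=True)
    bot.run()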
Example No. 5
    def test_sequence_and_buffering(self):
        """Test of sequence with buffering:
            gen0: PagesFromTitlesGenerator
             gen1: (PageTitleFilterPageGenerator)
              gen2: PreloadingGenerator
               [output]
        When enabling the API by switching on debug mode:
            pywikibot.logging.getLogger().setLevel(pywikibot.DEBUG)
        buffering seems NOT TO WORK ANYMORE... ?!
        """

        gen0 = pagegenerators.PagesFromTitlesGenerator(PAGE_SET_GENERIC)
        gen1 = pagegenerators.PageTitleFilterPageGenerator(
            gen0, self.ignore_list)
        num = len(list(gen1))

        gen0 = pagegenerators.PagesFromTitlesGenerator(PAGE_SET_GENERIC)
        gen1 = pagegenerators.PageTitleFilterPageGenerator(
            gen0, self.ignore_list)
        gen2 = pagegenerators.PreloadingGenerator(
            gen1)  # ThreadedGenerator would be nice!

        # TODO: solve this API buffering (speed) issue !
        #        # to enable the use of the API here (seems to be slower... ?!?)
        #        pywikibot.logging.getLogger().setLevel(pywikibot.DEBUG)

        for page in gen2:
            buffd, unbuffd = {}, {}

            start = time.time()
            u = page.get()
            buffd['get'] = time.time() - start

            self.assertAlmostEqual(buffd['get'], 0., places=3)

            start = time.time()
            u = page.getVersionHistory(revCount=1)
            buffd['getVersionHistory'] = time.time() - start

            self.assertAlmostEqual(buffd['getVersionHistory'], 0., places=4)

            start = time.time()
            u = page.getSections(minLevel=1)
            unbuffd['getSections'] = time.time() - start

            start = time.time()
            u = page.getSections(minLevel=1)
            buffd['getSections'] = time.time() - start

            self.assertAlmostEqual(buffd['getSections'], 0., places=4)

            start = time.time()
            u = page.get(force=True)  # triggers reload of 'getSections' also
            unbuffd['get'] = time.time() - start

            start = time.time()
            u = page.getVersionHistory(revCount=1, forceReload=True)
            unbuffd['getVersionHistory'] = time.time() - start

            self.assertGreaterEqual(unbuffd['get'] / buffd['get'], 1E3)
            self.assertGreaterEqual(
                unbuffd['getVersionHistory'] / buffd['getVersionHistory'], 1E4)
            self.assertGreaterEqual(
                unbuffd['getSections'] / buffd['getSections'], 1E5)

            num -= 1

        self.assertEqual(num, 0)
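The generator chain this test exercises can also be written on its own; a minimal sketch in which page_titles stands in for the PAGE_SET_GENERIC and self.ignore_list fixtures defined elsewhere in the test class:

page_titles = [u'Wikipedia:Sandbox', u'Python (programming language)']  # illustrative titles
gen = pagegenerators.PreloadingGenerator(
    pagegenerators.PagesFromTitlesGenerator(page_titles))
for page in gen:
    try:
        text = page.get()  # served from the buffer filled by PreloadingGenerator
    except pywikibot.Error:  # e.g. NoPage or IsRedirectPage
        continue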
Example No. 6
 def test_PagesFromTitlesGenerator(self):
     self._check_member(pagegenerators,
                        "PagesFromTitlesGenerator",
                        call=True)
     gen0 = pagegenerators.PagesFromTitlesGenerator(PAGE_SET_GENERIC)
     self.assertTrue(len(PAGE_SET_GENERIC) == len(list(gen0)))
Example No. 7
def main():
    """ Update popular articles lists """

    #excluded namespaces and stuff
    exclusions = [
        'http://', 'Special\:', 'sort[_ ]down\.gif', 'sort[_ ]up\.gif',
        'sort[_ ]none\.gif', '\&limit\='
    ]
    for lang in langs:
        for nm in wikipedia.Site(lang, "wikipedia").namespaces():
            exclusions += [
                re.sub(" ", "_", "%s\:" % nm),
                re.sub("_", " ", "%s\:" % nm),  #meter urllib.quote?
            ]
    exclusions_r = re.compile(r'(?im)(%s)' % ("|".join(set(exclusions))))

    wikipedia.output("Se van a analizar los idiomas: %s" % ', '.join(langs))
    totalvisits = analizarPageViewsLogs()
    #print totalvisits
    sortFiles()  #gnu sort
    compactar()
    sortByPageViews()  # sort each language from most to fewest visits

    if test:
        print "Fin de la prueba, chequea los archivos"
        sys.exit()

    # read the top entries and update the ranking
    for lang in langs:
        if tarea000.isExcluded('tarea037', 'wikipedia', lang):
            continue
        print '-' * 50, '\n', lang.upper(), '\n', '-' * 50
        f = codecs.open("/home/emijrp/temporal/tarea037-%s-sorted-times.txt" %
                        lang,
                        mode="r",
                        encoding="utf-8")
        pageselection = []
        pagesiter = []

        for line in f:
            line = line[:-1]
            times, pagelang, page = line.split(" ")
            if len(pagesiter) < limite * 5:  # safety margin: some pages may not exist, although that would be odd
                if page == '' or re.search(exclusions_r, page):
                    continue
                else:
                    pageselection.append([page, times])
                    pagesiter.append(page)
            else:
                break
        f.close()
        print "Elegidas", len(pageselection), "candidatas"
        if len(pagesiter) < limite:
            print "Hay menos de %d, que ha pasado? Siguiente wikipedia" % len(
                pagesiter)
            continue

        exitpage = u""
        if exitpages.has_key(lang):
            exitpage = exitpages[lang]
        else:
            exitpage = exitpages["default"]
        salida = u""

        projsite = wikipedia.Site(lang, 'wikipedia')
        watch = u'<div style="float: right;"><small>&#91;[[Special:RecentChangesLinked/{{FULLPAGENAME}}|watch popular articles]]&#93;</small></div>'
        intro = u"This page was generated at '''{{subst:#time:Y-m-d H:i}} (UTC)'''.\n\nTotal hits to [{{subst:SERVER}} {{subst:SERVERNAME}}] (including all pages): {{formatnum:%d}}.\n\n[[File:Padlock.svg|20px|Full protected]] = Full protected, [[File:Padlock-silver.svg|20px|Semi-protected]] = Semi-protected.\n\nSource: [http://dammit.lt/wikistats dammit.lt/wikistats]. More page views statistics: [http://stats.wikimedia.org/EN/TablesPageViewsMonthly.htm stats.wikimedia.org] and [http://stats.grok.se stats.grok.se].\n\n" % (
            totalvisits[lang])
        table = u"{| class=\"wikitable sortable\" style=\"text-align: center;\" \n! # !! Article !! Hits "
        if lang == 'es':
            salida = u"<noinclude>{{%s/begin|{{subst:CURRENTHOUR}}}}</noinclude>\n{| class=\"wikitable sortable\" style=\"text-align: center;\" width=350px \n|+ [[Plantilla:Artículos populares|Artículos populares]] en la última hora \n! # !! Artículo !! Visitas " % exitpage
        else:
            if hourly:
                #state which hour is being analyzed
                gzhour = gzs[0][20:22]
                hour = (datetime.datetime(
                    year=2000, month=1, day=1, hour=int(gzhour)) -
                        datetime.timedelta(hours=1)
                        ).hour  #compute the previous hour; the one before 0 is 23
                hour_ = str(hour)
                if hour < 10:
                    hour_ = '0' + hour_
                map = u'[[File:Daylight_Map,_nonscientific_(%s00_UTC).jpg|thumb|Daylight map, %s:00 (UTC)]]' % (
                    hour_, hour_)
                salida += watch + "\n" + map + "\n"
                salida += u"Last hour '''popular articles''' (Period: '''%s:00–%s:59 (UTC)'''). %s%s" % (
                    hour_, hour_, intro, table)
            else:
                #state which 24-hour period the analysis covers
                gzhour1 = gzs[0][20:22]
                hour1 = (datetime.datetime(
                    year=2000, month=1, day=1, hour=int(gzhour1)) -
                         datetime.timedelta(hours=1)).hour
                hour1_ = str(hour1)
                if hour1 < 10:
                    hour1_ = '0' + hour1_
                gzhour2 = gzs[-1][20:22]
                hour2 = (datetime.datetime(
                    year=2000, month=1, day=1, hour=int(gzhour2)) -
                         datetime.timedelta(hours=1)).hour
                hour2_ = str(hour2)
                if hour2 < 10:
                    hour2_ = '0' + hour2_
                salida += watch + "\n"
                salida += u"Last 24 hours '''popular articles''' (Period: '''%s:00–%s:59 (UTC)'''). %s%s" % (
                    hour1_, hour2_, intro, table)

        #for p in pagesiter: # to see which page was failing due to the encoding
        #    print p
        #    pp=wikipedia.Page(projsite, p)

        try:
            gen = pagegenerators.PagesFromTitlesGenerator(pagesiter, projsite)
        except:
            print "Error en la codificacion seguramente", lang
            continue
        pre = pagegenerators.PreloadingGenerator(gen,
                                                 pageNumber=limite * 2,
                                                 lookahead=100)
        c = d = ind = 0
        sum = 0
        for page in pre:
            detalles = u''
            if not page.exists():
                #if the page does not exist, it is because it is a software (MediaWiki) redirect,
                #or because it was deleted or never existed... some crazy DDoS...
                #some examples...
                #Recovery (Eminem album)
                #Glee (TV series)
                #PubMed Identifier
                try:
                    wikipedia.output(u"Does not exist? %s" % page.title())
                except:
                    print "Does not exist"
                page = getSoftwareRedirect(lang, page)
                r = 0
                while page.exists() and page.isRedirectPage():
                    r += 1
                    page = page.getRedirectTarget()
                    if r > 5:  #what?
                        break
                if not page.exists():
                    #sayonara baby, DDoS, crazy?
                    continue
            if page.exists() and page.namespace() == 0:
                c += 1
                sum += int(pageselection[ind][1])
                if c > limite:
                    break

                #this must be the first thing appended to detalles, so that the padlock appears next to the title
                locks = page.getRestrictions()
                #format of locks
                #{u'edit': None, 'move': None}
                #{u'edit': [u'autoconfirmed', u'2010-09-10T20:08:39Z']}
                #{u'edit': [u'autoconfirmed', u'infinity'], u'move': [u'autoconfirmed', u'infinity']}
                if locks.has_key("edit") and locks["edit"]:
                    if locks["edit"][0] == "autoconfirmed":
                        detalles += u'[[File:Padlock-silver.svg|15px|Semi-protected]] '
                    elif locks["edit"][0] == "sysop":
                        detalles += u'[[File:Padlock.svg|15px|Full-protected]] '

                wtitle = page.title()
                page2 = page  #to take the redirect target if it is a redirect; also used further down for the interwikis
                if page.isRedirectPage():
                    page2 = page.getRedirectTarget()
                    detalles += u' (#REDIRECT [[%s]]) ' % (page2.title())
                elif page.isDisambig():
                    #detalles+=u'(Desambiguación) '
                    pass  #skipped, to avoid adding a label that is not in the local language
                else:
                    pass
                    #tmpget=page.get()
                    #if re.search(ur'(?i)\{\{ *Artículo bueno', tmpget):
                    #    detalles+='[[Image:Artículo bueno.svg|14px|Artículo bueno]]'
                    #if re.search(ur'(?i)\{\{ *(Artículo destacado|Zvezdica)', tmpget):
                    #    detalles+='[[Image:Cscr-featured.svg|14px|Featured article]]'
                    #if re.search(ur'(?i)\{\{ *(Semiprotegida2?|Semiprotegido|Pp-semi-template)', tmpget):
                    #    detalles+='[[Image:Padlock-silver-medium.svg|20px|Semiprotegida]]'
                    #if re.search(ur'(?i)\{\{ *(Protegida|Protegido|Pp-template)', tmpget):
                    #    detalles+='[[Image:Padlock.svg|20px|Protegida]]'
                #wikipedia.output('%s - %d - %s' % (wtitle, visits, detalles))
                #continue

                #strip links to #sections and keep only the first part
                wtitle = wtitle.split("#")[0]

                if page.namespace() in [6, 14]:
                    wtitle = u':%s' % wtitle

                if lang == 'es':
                    if c - 1 in [3, 5, 10, 15, 20]:
                        salida += u"\n{{#ifexpr:{{{top|15}}} > %d|" % (c - 1)
                        d += 1
                    salida += u"\n{{!}}-\n{{!}} %d {{!}}{{!}} [[%s]]%s{{#if:{{{novistas|}}}||{{!}}{{!}} {{formatnum:%s}}}} " % (
                        c, wtitle, detalles, pageselection[ind][1])
                else:
                    #english interwiki <sup>
                    iwlink = ""
                    if lang != "en":
                        #sometimes it fails when loading empty interwikis of main pages like [[cs:]]
                        # a SectionError exception may be raised, among others
                        try:
                            iws = page2.interwiki()
                            for iw in iws:
                                if iwlink == '' and iw.site().lang == "en":
                                    iwlink = " <sup>([[:en:%s|en]])</sup>" % (
                                        iw.title())
                        except:
                            pass
                    salida += u"\n|-\n| %d || [[%s]]%s%s || {{formatnum:%s}} " % (
                        c, wtitle, detalles, iwlink, pageselection[ind][1])

                #except:
                #    wikipedia.output(u'Error al generar item en lista de %s:' % lang)
            ind += 1  #always incremented, so the visits column does not shift; do not move this to the top of the loop, it starts at 0

        iws = u''
        for iw in alllangs:
            if iw != lang:
                if exitpages.has_key(iw):
                    iws += u'[[%s:%s]]\n' % (iw, exitpages[iw])
                else:
                    iws += u'[[%s:%s]]\n' % (iw, exitpages["default"])
        #salida+="\n{{/end}}\n%s" % (iws)
        if lang == 'es':
            salida += u"\n%s\n{{%s/end|%d|%d|top={{{top|15}}}|fecha={{subst:CURRENTTIME}} ([[UTC]]) del {{subst:CURRENTDAY2}}/{{subst:CURRENTMONTH}}/{{subst:CURRENTYEAR}}}}\n|}\n<noinclude>{{documentación}}\n%s</noinclude>" % (
                "}} " * d, exitpage, sum, totalvisits[lang], iws)
        else:
            salida += u"\n|-\n| &nbsp; || '''Top %d hit sum''' || '''{{formatnum:%d}}''' \n|}\n\n%s" % (
                limite, sum, iws)
        #wikipedia.output(re.sub(ur"\n", ur" ", salida))
        if len(salida) > 3000:
            wiii = wikipedia.Page(projsite, exitpage)
            #wiii.put(salida, u'BOT - Updating list')
        else:
            print "Error pagina menor de 3KB, fallo algo"
        os.system("rm /home/emijrp/temporal/tarea037-%s.txt" % lang)
        os.system("rm /home/emijrp/temporal/tarea037-%s-compacted.txt" % lang)
        os.system("rm /home/emijrp/temporal/tarea037-%s-sorted-page.txt" %
                  lang)
        #os.system("rm /home/emijrp/temporal/tarea037-%s-sorted-times.txt" % lang)
        if not random.randint(0, 9) or daily:
            #refresh it roughly every 10 runs for the hourly ranking, and always for the daily one
            filepagetitles = "/home/emijrp/temporal/tarea037-%s-pagetitles.txt" % lang
            #it may not have been created because it was not needed
            if os.path.exists(filepagetitles):
                os.remove(filepagetitles)
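For illustration, the kind of title filtering that the exclusions regex built at the top of main() performs; the pattern and the sample titles below are simplified stand-ins for the real per-language list:

import re

exclusions_r = re.compile(r'(?im)(http://|Special\:|File\:)')
for title in [u'Madrid', u'Special:Search', u'File:Foo.jpg', u'http://example.org/wiki']:
    if title == '' or re.search(exclusions_r, title):
        continue  # dropped: empty, an excluded namespace, or a URL
    print title   # only u'Madrid' survives the filter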
Example No. 8
site = wikipedia.Site("es", "wikipedia")

data = site.getUrl(
    "/w/index.php?title=Especial:WhatLinksHere/Plantilla:Ficha_de_wikiproyecto&limit=5000&from=0&namespace=102"
)
data = data.split('<!-- bodytext -->')[1].split('<!-- /bodytext -->')[0]
m = re.compile(ur'title="Wikiproyecto:(.*?)">Wikiproyecto').finditer(data)

salida = u"{{/begin}}"

projectslists = []
for i in m:
    projectslists.append(u"Wikiproyecto:%s/participantes" % i.group(1))

projects = []
gen = pagegenerators.PagesFromTitlesGenerator(projectslists, site)
pre = pagegenerators.PreloadingGenerator(gen, pageNumber=100, lookahead=10)
for p in pre:
    if p.exists() and not p.isRedirectPage():
        contpart = 0
        parttext = p.get()
        n = re.compile(ur'(?i)(\[\[(User|Usuario):|\{\{ *u *\|)').finditer(
            parttext)
        for i in n:
            contpart += 1
        project = p.title().split(':')[1].split('/')[0]
        wikipedia.output(ur'Wikiproyecto:%s [%d participantes]' %
                         (project, contpart))
        projects.append([contpart, project])

projects.sort()
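A hedged note, not part of the original snippet: projects holds [participant_count, project_name] pairs, so the plain projects.sort() above orders them from the fewest to the most participants; reversing it gives a most-active-first listing, for example:

projects.reverse()  # after the sort above: most participants first
for contpart, project in projects:
    wikipedia.output(u'Wikiproyecto:%s - %d participants' % (project, contpart))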
Example No. 9
def main():
    wikipedia.handleArgs()
    wikipedia.setAction(
        'Erwin85Bot: [[Sjabloon:nocat|nocat]] toegevoegd ([[:Categorie:Wikipedia:Nog te categoriseren]]).'
    )
    noCatR = re.compile(r'\{\{([Xx]n|[Nn])ocat(\|[^\}]*?|)\}\}')
    excludeR = re.compile(
        r'\{\{(x{0,1}wiu|x{0,1}weg|nuweg|artikelweg|auteur|ne|reclame|wb|wiu2)(|\|[^\}]*?)\}\}',
        re.IGNORECASE)

    #List of page_titles which are treated
    titlelist = []

    wikipedia.output(u'Getting a list of uncategorized articles.')
    sql = """
            SELECT page_title
            FROM nlwiki_p.page
            LEFT JOIN nlwiki_p.categorylinks AS c1
            ON c1.cl_from = page_id
            WHERE page_is_redirect = 0
            AND page_namespace = 0
            AND page_len > 0
            AND (   cl_to IS NULL
                    OR
                    NOT EXISTS (SELECT *
                               FROM nlwiki_p.categorylinks AS c2
                               WHERE c2.cl_from = page_id
                               AND ( c2.cl_to NOT REGEXP '(Wikipedia:|Portaal:|Gebruiker:)'
                                        OR c2.cl_to LIKE 'Wikipedia:Nog_te_categoriseren%'
                                        OR c2.cl_to LIKE 'Wikipedia:Verwijderbaar/%'
                                   ))
                 )
            AND NOT EXISTS (SELECT *
                            FROM nlwiki_p.templatelinks
                            WHERE tl_from = page_id
                            AND tl_title = 'Dp'
                            AND tl_namespace = 10)
            GROUP BY page_id;
            """
    # 'db' is assumed to be a database connection object (e.g. a querier instance) created elsewhere in this script
    results = db.do(sql)
    if not results:
        wikipedia.output('No uncategorized mainspace articles')
        return

    titles = [unicode(result['page_title'], 'utf8') for result in results]

    gen = pagegenerators.PagesFromTitlesGenerator(titles)
    gen = pagegenerators.PreloadingGenerator(gen)

    for page in gen:
        wikipedia.output(u'\n>>> %s <<<' % page.title())
        try:
            # Load the page's text from the wiki.
            original_text = page.get()

        #Redirect, so ignore
        except wikipedia.IsRedirectPage:
            wikipedia.output(u'Page is a redirect.')
            continue

        #No page, so ignore
        except wikipedia.NoPage:
            wikipedia.output(u'Page does not exist.')
            continue

        new_text = original_text
        if page.categories():
            wikipedia.output(u'Page is already in a category.')
            continue

        if noCatR.search(original_text) or excludeR.search(original_text):
            wikipedia.output(
                u'Page is already tagged with nocat or another template.')
        else:
            new_text = erwin85bot.addTemplate(
                original_text, 'nocat',
                '||{{subst:LOCALYEAR}}|{{subst:LOCALMONTH}}|{{subst:LOCALDAY2}}'
            )

        if not new_text == original_text:
            try:
                page.put(new_text)
            except:
                continue
Example No. 10
        speed = 250
        talkgen = pagegenerators.CategorizedPageGenerator(category,
                                                          recurse=False,
                                                          start=None)
        talkpre = pagegenerators.PreloadingGenerator(talkgen, pageNumber=speed)

        pagetitles = []
        for talkpage in talkpre:
            try:
                wtitle = talkpage.title().split('Talk:')[1]
                if wtitle not in pagetitles:
                    pagetitles.append(wtitle)
            except:
                pass  #no talk page, probably template talk: or other, skip

        gen = pagegenerators.PagesFromTitlesGenerator(pagetitles, site=site)
        pre = pagegenerators.PreloadingGenerator(gen, pageNumber=speed)
        alllinks = {}
        for page in pre:
            if not page.exists() or page.isRedirectPage():
                continue
            wtext = page.get()
            links = getLinks(wtext)
            #wikipedia.output('%s - %d' % (page.title(), len(links)))
            links = set(links)  #only 1 link per page, no dupes
            #sum
            for link in links:
                if not link:
                    continue
                if alllinks.has_key(link):
                    alllinks[link] += 1
                else:
                    alllinks[link] = 1  # assumed continuation: first time this link is seen
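getLinks() is a helper that this excerpt does not include; a minimal stand-in that extracts wikilink targets from the wikitext could look like the following (an assumption about its behaviour, not the original implementation):

import re

def getLinks(wtext):
    # Return the target of every [[...]] wikilink, without pipes or section anchors.
    return [m.group(1).strip() for m in re.finditer(r'\[\[([^\]\|#]+)', wtext)]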