Ejemplo n.º 1
0
def get_batch_of_files(current_site, batches_of, api_step, start_from):
    """Fetch a batch of File: pages (namespace 6) from *current_site*.

    @param current_site: pywikibot site to query
    @param batches_of: maximum number of pages to return (total=)
    @param api_step: API request chunk size (step=)
    @param start_from: title to start the alphabetical listing from
    @return: list of Page objects (may be shorter than batches_of)
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for interval timing.
    start_getting_allfiles = time.perf_counter()

    # print('Requesting {0} files'.format(batches_of))  # debug

    batch_of_files_generator = pagegenerators.AllpagesPageGenerator(
        site=current_site,
        start=start_from,
        namespace=6,  # the File: namespace
        includeredirects=False,
        total=batches_of,
        step=api_step,
        content=False)

    # NOTE(review): the generator is lazy, so this timestamp is taken
    # before any API traffic actually happens -- confirm what the debug
    # timing was meant to measure.
    done_getting_allfiles = time.perf_counter()

    # Materialize the lazy generator into a concrete list.
    batch_of_files = list(batch_of_files_generator)

    # print('Got {0} pages in {1}s'.format(len(batch_of_files), done_getting_allfiles - start_getting_allfiles))  # debug

    return batch_of_files
Ejemplo n.º 2
0
def main(*args):
    """Delete every File: page whose title carries the '-ffxiv-' marker.

    Walks namespace 6 alphabetically from 'Icon-ffxiv-' and stops at the
    first title that no longer contains the marker.
    """
    pages = pagegenerators.AllpagesPageGenerator(start="Icon-ffxiv-",
                                                namespace=6)
    for current in pages:
        if "-ffxiv-" in current.title():
            kill(current)
        else:
            break
Ejemplo n.º 3
0
def main():
    """Run inter() over every main-namespace article on fr.vikidia."""
    source = pywikibot.getSite('fr', 'vikidia')
    pages_list = pagegenerators.AllpagesPageGenerator(namespace=0,
                                                      includeredirects=False,
                                                      site=source,
                                                      start=u"")
    for page in pages_list:
        # Python 2 print statement -> function, for Python 3 compatibility
        # (other snippets in this file already use print()).
        print(page.title())
        inter(page)
Ejemplo n.º 4
0
 def custom_generator(self):
     """Yield pages from namespace 120 starting at option 'start'.

     Iteration stops after (and including) the page whose bare title
     equals option 'end'; at most 'total' pages are fetched.
     """
     stop_title = self.getOption('end')
     source = pagegenerators.AllpagesPageGenerator(
         start=self.getOption('start'),
         namespace=120,
         site=self.repo,
         total=self.getOption('total'))
     for current in source:
         yield current
         if current.title(with_ns=False) == stop_title:
             break
Ejemplo n.º 5
0
 def custom_generator(self):
     """Yield pages from namespace 120 starting at self.opt['start'].

     Iteration stops after (and including) the page whose bare title
     equals self.opt['end']; at most self.opt['total'] pages are fetched.
     """
     stop_title = self.opt['end']
     source = pagegenerators.AllpagesPageGenerator(
         start=self.opt['start'],
         namespace=120,
         site=self.repo,
         total=self.opt['total'])
     for current in source:
         yield current
         if current.title(with_ns=False) == stop_title:
             break
Ejemplo n.º 6
0
def main():
    """Parse command-line arguments, build a page generator and run the bot.

    Recognized arguments:
        pages / categories -- which CommonsLinkBot action to perform
        -start:T  -- all pages from title T (same namespace, no redirects)
        -cat:C    -- members of Category:C
        -ref:P    -- pages referring to P
        -link:P   -- pages linked from P
        -page:P   -- the single page P
    Shows the help text when no action or no generator was selected.
    """
    gen = None
    action = None
    for arg in pywikibot.handleArgs():
        if arg == 'pages':
            action = 'pages'
        elif arg == 'categories':
            action = 'categories'
        elif arg.startswith('-start:'):
            start = pywikibot.Page(pywikibot.Site(), arg[7:])
            gen = pagegenerators.AllpagesPageGenerator(
                start.title(withNamespace=False),
                namespace=start.namespace(),
                includeredirects=False)
        elif arg.startswith('-cat:'):
            cat = pywikibot.Category(pywikibot.Site(), 'Category:%s' % arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-ref:'):
            ref = pywikibot.Page(pywikibot.Site(), arg[5:])
            gen = pagegenerators.ReferringPageGenerator(ref)
        elif arg.startswith('-link:'):
            link = pywikibot.Page(pywikibot.Site(), arg[6:])
            gen = pagegenerators.LinkedPageGenerator(link)
        elif arg.startswith('-page:'):
            single_page = pywikibot.Page(pywikibot.Site(), arg[6:])
            gen = iter([single_page])
        # Unknown arguments are silently ignored (original behavior).

    # BUGFIX: without a generator the original crashed inside
    # PreloadingGenerator(None); fall back to the help text instead.
    if gen is None or action not in ('pages', 'categories'):
        pywikibot.showHelp(u'commons_link')
        return

    preloading_gen = pagegenerators.PreloadingGenerator(gen)
    bot = CommonsLinkBot(preloading_gen, acceptall=False)
    if action == 'pages':
        bot.pages()
    else:
        bot.categories()
Ejemplo n.º 7
0
        choice = 'y'
        if not acceptAll:
            choice = pywikibot.inputChoice(
                u'Do you want to accept these changes?',
                ['Yes', 'No', 'All', 'Quit'], ['y', 'N', 'a', 'q'], 'N')
            if choice == 'q':
                exit()
            if choice == 'a':
                acceptAll = True
                choice = 'y'
        if choice == 'y':
            self.page.put(newText, editSummary)
            pass

    def run(self):
        """Entry point: apply the zero-section fix to the wrapped page."""
        self.addZeroSection()


# main code
if __name__ == '__main__':
    common.login(username)
    # Every talk page (namespace 1), redirects excluded.
    talk_pages = pagegenerators.AllpagesPageGenerator(namespace=1,
                                                     includeredirects=False,
                                                     site=common.getWikiSite())
    # Debug alternative: run against a single hard-coded user-talk page.
    #talk_pages = [pywikibot.Page(site = dxdCommonLibrary.getWikiSite() ,title = u'Обговорення користувача:RLuts')]
    # Preload page texts in batches, then format each lead section.
    for page in pagegenerators.PreloadingGenerator(talk_pages):
        LeadSectionFormatter(page).run()
Ejemplo n.º 8
0
def main():
    """Scan all pl.wikinews pages for quote templates and dump them to files.

    Extracts the bodies of {{Cytat}}, {{CytatLewy}} and {{CytatPrawy}},
    splits {{Cytat}} into content/author/source fields, and appends the
    formatted wikitext to output/*.txt; a per-page status log goes to
    log/wikinews.txt.
    """
    site = pywikibot.getSite('pl', 'wikinews')
    lista_stron = pagegenerators.AllpagesPageGenerator(site=site)

    # Raw strings so the regex escapes are not interpreted by Python first.
    re_cytat = re.compile(r'{{[cC]ytat\|(.*?)}}', re.DOTALL)
    re_tresc = re.compile(
        r'(.*?)($|\|2=|\|3=|\|4=|\|5=|\|[0-9]*px\|[0-9]*px\|)', re.DOTALL)
    re_autor = re.compile(r'(4=|\|[0-9]*px\|[0-9]*px\|)(.*?)($|\|)', re.DOTALL)
    re_zrodlo = re.compile(r'5=(.*)', re.DOTALL)

    re_cytatlewy = re.compile(r'{{[cC]ytatLewy\|(.*?)}}', re.DOTALL)
    re_cytatprawy = re.compile(r'{{[cC]ytatPrawy\|(.*?)}}', re.DOTALL)

    def _append(path, text):
        # BUGFIX: the original used 'file.close' (missing parentheses), so
        # file handles were never closed; it also shadowed the builtin
        # 'file'. A context manager closes reliably.
        with open(path, 'a', encoding='utf-8') as out:
            out.write(text)

    for a in lista_stron:
        log = ''
        try:
            strona = a.get()
        except pywikibot.IsRedirectPage:
            log += '\n*[[%s]] - przekierowanie' % a.title()
        except pywikibot.Error:
            print('[[%s]] - błąd' % a.title())
            log += '\n*[[%s]] - błąd' % a.title()
        else:
            # {{Cytat|...}} quotes: content / author / source fields.
            # Reuse the cached text instead of re-fetching with a.get().
            for b in re_cytat.findall(strona):
                final = ''
                s_tresc = re_tresc.search(b)
                s_autor = re_autor.search(b)
                s_zrodlo = re_zrodlo.search(b)
                print(b)
                print('\n\n')
                if s_tresc:
                    final += '\n\'\'\'treść\'\'\': %s' % s_tresc.group(1)
                if s_autor:
                    final += '\n:\'\'\'autor\'\'\': %s' % s_autor.group(2)
                    print('\n:\'\'\'autor\'\'\': %s' % (s_autor.group(2)))
                if s_zrodlo:
                    final += '\n:\'\'\'źródło\'\'\': %s' % s_zrodlo.group(1)

                final += '\n:\'\'\'link\'\'\': [[%s]]<br/><br/>' % a.title()
                _append("output/wikinews.txt", final)

            # {{CytatLewy|...}} -- left-floating quote, content only.
            for c in re_cytatlewy.findall(strona):
                final_l = '\n\'\'\'treść\'\'\': %s' % c
                final_l += '\n:\'\'\'link\'\'\': [[%s]]<br/><br/>' % a.title()
                _append("output/wikinews_lewy.txt", final_l)

            # {{CytatPrawy|...}} -- right-floating quote, content only.
            for d in re_cytatprawy.findall(strona):
                final_p = '\n\'\'\'treść\'\'\': %s' % d
                final_p += '\n:\'\'\'link\'\'\': [[%s]]<br/><br/>' % a.title()
                _append("output/wikinews_prawy.txt", final_p)

        _append("log/wikinews.txt", log)
Ejemplo n.º 9
0
from lib.googlegeocode import GoogleGeocode

# Read deployment settings (domain, vendor credentials) from the shared ini.
settings = ConfigParser.ConfigParser()
settings.read('../../configs/settings.ini')

with open('config.json') as config_file:
    config = json.load(config_file)

# Raw strings keep the regex escapes out of Python's string parser.
motorway_regex = r'[A-Z]?-?\d+\s*(\((\w|\s)+\))'
border_regex = r'.*border (crossing|checkpoint)'

geonames = GeoNames(settings.get('vendor', 'geonames_username'), './.cache')
google_geocode = GoogleGeocode('./.cache')

site = pywikibot.Site()
gen = pagegenerators.AllpagesPageGenerator(site=site)
disamb_cat = pywikibot.Category(site, 'Disambiguation')
disamb_pages = [article.title() for article in disamb_cat.articles()]

count = 0
for page in gen:
    if not page.isRedirectPage() and page.title() not in disamb_pages:
        # print statements -> functions for Python 3 compatibility.
        print('#%d. %s' % (count + 1, page.title().encode('utf-8')))
        print('http://' + settings.get(
            'general', 'domain') + '/en/' + page.title(asUrl=True))

        # NOTE(review): 'count' is never incremented in the visible code,
        # so every page prints as '#1.'; the resume block below suggests
        # 'count += 1' belongs in this branch -- confirm against the full
        # script before changing.
        # Uncomment to resume from speciffic point
        #if count < 3055:
        #    count += 1
        #    continue
Ejemplo n.º 10
0
def newPages(all=False):
    """Maintain the {{Maintenance}} banner on new (or all) articles.

    For each page, computes the list of maintenance jobs -- 'orphelin'
    (orphan), 'catégoriser' (uncategorized), 'portail' (no portal) and
    'impasse' (dead end) -- then rewrites or removes the banner when the
    job list changed since the last run.

    @param all: when True scan every namespace-0 page instead of the 50
        newest ones.  (Shadows the builtin; kept for caller compatibility.)
    @return: wikitext log, one line per updated page.
    """
    global nbrModif, nbrTotal

    log = u''

    # BUGFIX (original workaround): null-edit the bot's own user page
    # first -- presumably to force a fresh session; confirm before removing.
    bugfixPage = pywikibot.Page(site, u"Utilisateur:LinedBot")
    bugfixPage.save('')
    # END OF FIX

    homonCat = pywikibot.Category(site, u"Homonymie")          # disambiguation

    ebaucheCat = pywikibot.Category(site, u"Ébauche")          # stub categories
    ebaucheCat = set(ebaucheCat.subcategories(recurse=3))

    hiddenCat = pywikibot.Category(site, u"Catégorie cachée")  # hidden cats
    hiddenCat = set(hiddenCat.subcategories())

    portalCat = pywikibot.Category(site, u"Liste d'articles")  # portal lists
    portalCat = set(portalCat.subcategories())

    ignoreCat = pywikibot.Category(site, u"Page ignorée par les robots")

    concoursCat = pywikibot.Category(site, u"Article VikiConcours")

    deadendPagesList = list(pagegenerators.DeadendPagesPageGenerator(site=site))
    lonelyPagesList = list(pagegenerators.LonelyPagesPageGenerator(site=site))

    if all:
        pagesList = pagegenerators.AllpagesPageGenerator(namespace=0, includeredirects=False, site=site)
    else:
        pagesList = pagegenerators.NewpagesPageGenerator(total=50, site=site)

    for page in pagesList:

        try:
            pageTemp = page.get()

        except pywikibot.NoPage:
            pywikibot.output(u"Page %s does not exist; skipping."
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Page %s is a redirect; skipping."
                             % page.title(asLink=True))
        except pywikibot.LockedPage:
            pywikibot.output(u"Page %s is locked; skipping."
                             % page.title(asLink=True))
        else:

            # Only handle the page if it is neither a disambiguation page
            # nor a VikiConcours contest entry.
            pageCat = page.categories()
            if (homonCat not in pageCat) and (concoursCat not in pageCat):

                # Only compute banner jobs when the page is not bot-ignored.
                jobList = []
                if ignoreCat not in pageCat:

                    # No incoming links -> 'orphelin' (orphan) job.
                    if page in lonelyPagesList:
                        jobList.append(u'orphelin')

                    # No direct category besides hidden/stub ones
                    # -> 'catégoriser' (categorize) job.
                    realCat = list(set(pageCat) - set(hiddenCat) - set(ebaucheCat))

                    nbCat = len(list(realCat))
                    if nbCat == 0:
                        jobList.append(u'catégoriser')

                    # Not in any portal category -> 'portail' job.
                    nbPort = len(set(pageCat) & set(portalCat))
                    if nbPort == 0:
                        jobList.append(u'portail')

                    # No outgoing links -> 'impasse' (dead end) job.
                    if page in deadendPagesList:
                        jobList.append(u'impasse')

                    """
                    # Over 2000 bytes with no external link -> 'sourcer' job.
                    if len(pageTemp) > 2000 and len(list(page.extlinks())) == 0:
                        jobList.append(u'sourcer')
                    """

                else:
                    # print statement -> function (Python 3 compatibility).
                    print(u'Skipping [[' + page.title() + ']], page in ignore list.')

                pageTemp, oldJobList = removeBanner(pageTemp)
                jobList = updateJobList(oldJobList, jobList)
                job = u''

                # Symmetric difference: any job added or removed?  (A-B)+(B-A)
                diff = list(set(oldJobList).symmetric_difference(set(jobList)))

                if diff != []:
                    nbrTotal += 1
                    if len(jobList) > 0:
                        job = ','.join(jobList)
                        banner = u'{{Maintenance|job=' + job + '|date=~~~~~}}\n\n'
                        pageTemp = banner + pageTemp
                        summary = u'[[VD:Robot|Robot]] : Mise à jour du bandeau de maintenance.'
                    else:
                        summary = u'[[VD:Robot|Robot]] : Retrait du bandeau de maintenance.'

                    c = callback.Callback()
                    page.text = pageTemp
                    page.save(summary, callback=c)

                    if c.error is None:  # 'is None' instead of '== None'
                        nbrModif += 1

                    log += u'*' + '{{Utilisateur:LinedBot/ExtLinker|' + page.title() + u'}} : Mise à jour du bandeau {{m|maintenance}} avec les paramètres suivants : ' + job + '\n'

    return log
Ejemplo n.º 11
0
# Maps a search regex to the target page title used as the link destination.
# Each key is a pattern with capture groups (1: the matched word, 3: the
# trailing delimiter) consumed by main() below; add as many entries as needed.
paraules = { # regex pattern -> link-target page title
        u' ([Pp]ag(e|es))( |\.|\,)': u'Page',
        }

def main(page, paraula):
    """Link occurrences of the pattern *paraula* on *page* to its target.

    Skips the target page itself; saves only when the pattern matches.
    """
    target = paraules[paraula]
    if page.title() == target:
        print(u"Skipping %s because is the link page" % str(page))
        return
    print(u"Page: %s" % page)
    text = page.text
    if re.search(paraula, text) is None:
        print("No links added")
        return
    print("======= EDITING PAGE %s! =======" % page)
    # \1 is the matched word, \3 the delimiter that followed it.
    replacement = ' [[' + target + '|\\1]]\\3'
    page.text = re.sub(paraula, replacement, text)
    page.save(u'Bot: Adding links for %s' % target)
    
if __name__ == '__main__':
    # Walk every main-namespace page, redirects included, from the start.
    all_pages = pg.AllpagesPageGenerator(site=pwb.Site(), start="!",
                                         namespace=0, includeredirects=True)
    # Preload texts 100 pages at a time and try every configured pattern.
    for page in pg.PreloadingGenerator(all_pages, pageNumber=100):
        for paraula in paraules:
            main(page, paraula)
    print("\nFinished!")