Example #1
0
 def test_intersect_newpages_twice(self):
     """Intersecting a newpages generator with an identical one works."""
     site = self.get_site()
     gens = [pagegenerators.NewpagesPageGenerator(site=site, total=10)
             for _ in range(2)]
     self.assertEqualItertools(gens)
Example #2
0
def _get_test_unconnected_page(site):
    """Return the first of ten new pages in namespace 1 whose properties
    lack a 'wikibase_item' entry, or None if every page has one."""
    candidates = pagegenerators.NewpagesPageGenerator(site=site, total=10,
                                                      namespaces=[1, ])
    return next(
        (candidate for candidate in candidates
         if not candidate.properties().get('wikibase_item')),
        None)
Example #3
0
 def test_intersect_newpages_and_recentchanges(self):
     """Check that the newpages and recentchanges generators intersect."""
     site = self.get_site()
     new_pages = pagegenerators.NewpagesPageGenerator(site=site, total=50)
     recent = pagegenerators.RecentChangesPageGenerator(site=site, total=200)
     self.assertEqualItertools([new_pages, recent])
 def test_intersect_newpages_csd(self):
     """Test intersection between newpages and a speedy-deletion category."""
     site = self.get_site()
     self.assertEqualItertools([
         pagegenerators.NewpagesPageGenerator(site=site, total=10),
         pagegenerators.CategorizedPageGenerator(
             pywikibot.Category(site,
                                'Category:Candidates_for_speedy_deletion'))
     ])
Example #5
0
# Pages per file
limit = 500

output = "core/articles/newpages.txt"

# Remove old file so each run starts from a clean slate.
print("removing old file...")
try:
    os.remove(output)

except FileNotFoundError:
    pass

site = pywikibot.Site()
# BUG FIX: the generator was previously built with site=None, silently
# ignoring the Site object constructed above; pass it explicitly.
gen = pagegenerators.NewpagesPageGenerator(site=site,
                                           namespaces=[0],
                                           total=limit)

print("now writing...")

# Context manager guarantees the file is closed even if iteration fails.
with open(output, "a") as output_file:
    for page in gen:
        output_file.write(page.title() + "\n")
        print(page.title())

print("saved to", output)
print("done")
Example #6
0
def main():
    """Command-line driver for a Persian-Wikipedia category bot.

    Parses bot arguments, builds a page generator for the selected mode
    (explicit pages, recent category changes, new categories, a category
    file, or a standard GeneratorFactory source) and hands the pages to
    ``run``.  Depends on module-level names not visible in this block:
    ``fa_site``, ``en_site``, ``encatlist``, ``englishdictionry`` and
    ``run`` -- presumably defined elsewhere in this file.
    """
    summary_commandline, gen, template = None, None, None
    namespaces, PageTitles, exceptions = [], [], []
    encat, newcatfile = '', ''
    autoText, autoTitle = False, False
    recentcat, newcat = False, False
    genFactory = pagegenerators.GeneratorFactory()
    for arg in pywikibot.handleArgs():
        if arg == '-autotitle':
            autoTitle = True
        elif arg == '-autotext':
            autoText = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                # Bare "-page" with no value: ask the operator for a title.
                PageTitles.append(
                    pywikibot.input(u'Which page do you want to chage?'))
            else:
                PageTitles.append(arg[6:])
            break
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-template:'):
            template = arg[10:]
        elif arg.startswith('-facat:'):
            # Strip English and Persian category prefixes, then translate
            # the Persian category name to its English counterpart.
            encat = arg[7:].replace(u'Category:',
                                    u'').replace(u'category:',
                                                 u'').replace(u'رده:', u'')
            encat = englishdictionry(u'رده:' + encat, fa_site,
                                     en_site).replace(u'Category:',
                                                      u'').replace(
                                                          u'category:', u'')
            break
        elif arg.startswith('-encat:'):
            # English category given directly; just strip prefixes.
            encat = arg[7:].replace(u'Category:',
                                    u'').replace(u'category:',
                                                 u'').replace(u'رده:', u'')
            break
        elif arg.startswith('-newcatfile:'):
            newcatfile = arg[12:]
            break
        elif arg.startswith('-recentcat'):
            # Category pages from recent changes; an optional numeric
            # suffix limits how many changes are fetched.
            arg = arg.replace(':', '')
            if len(arg) == 10:
                genfa = pagegenerators.RecentchangesPageGenerator()
            else:
                genfa = pagegenerators.RecentchangesPageGenerator(
                    number=int(arg[10:]))
            genfa = pagegenerators.DuplicateFilterPageGenerator(genfa)
            genfa = pagegenerators.NamespaceFilterPageGenerator(genfa, [14])
            preloadingGen = pagegenerators.PreloadingGenerator(genfa, 60)
            recentcat = True
            break
        elif arg.startswith('-newcat'):
            # Newly created category pages (namespace 14); an optional
            # numeric suffix overrides the default step of 100.
            arg = arg.replace(':', '')
            if len(arg) == 7:
                genfa = pagegenerators.NewpagesPageGenerator(step=100,
                                                             namespaces=14)
            else:
                genfa = pagegenerators.NewpagesPageGenerator(step=int(arg[7:]),
                                                             namespaces=14)
            preloadingGen = pagegenerators.PreloadingGenerator(genfa, 60)
            newcat = True
            break
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-summary:'):
            pywikibot.setAction(arg[9:])
            summary_commandline = True
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = genFactory.getCombinedGenerator(gen)
    if encat != '':
        # Expand the English category into related lists and run each
        # translated Persian category.
        encatfalist, encatlists = encatlist(encat)
        if encatlists:
            for encat in encatlists:
                encat = englishdictionry(encat, en_site, fa_site)
                if encat:
                    run([encat])
        if encatfalist is not False:
            run(encatfalist)
    if PageTitles:
        pages = [
            pywikibot.Page(fa_site, PageTitle) for PageTitle in PageTitles
        ]
        gen = iter(pages)
    if recentcat:
        # Each recently changed category is expanded to its member pages.
        for workpage in preloadingGen:
            workpage = workpage.title()
            cat = pywikibot.Category(fa_site, workpage)
            gent = pagegenerators.CategorizedPageGenerator(cat)
            run(gent)
        pywikibot.stopme()
        sys.exit()
    if newcat:
        for workpage in preloadingGen:
            workpage = workpage.title()
            workpage = englishdictionry(workpage, fa_site, en_site)
            if workpage is not False:
                encatfalist, encatlists = encatlist(workpage)
                if encatlists:
                    for encat in encatlists:
                        encat = englishdictionry(encat, en_site, fa_site)
                        if encat:
                            run([encat])
                if encatfalist is not False:
                    run(encatfalist)
        pywikibot.stopme()
        sys.exit()
    if newcatfile:
        # Read wiki links ([[...]]) from the given file and process each
        # link target as a category name.
        text2 = codecs.open(newcatfile, 'r', 'utf8')
        text = text2.read()
        linken = re.findall(ur'\[\[.*?\]\]', text, re.S)
        if linken:
            for workpage in linken:
                workpage = workpage.split(u'|')[0].replace(u'[[', u'').replace(
                    u']]', u'').strip()
                workpage = englishdictionry(workpage, fa_site, en_site)
                if workpage is not False:
                    encatfalist, encatlists = encatlist(workpage)
                    if encatlists:
                        run(encatlists)
                    if encatfalist is not False:
                        run(encatfalist)
        pywikibot.stopme()
        sys.exit()
    if not gen:
        pywikibot.stopme()
        sys.exit()
    if namespaces != []:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber=60)
    run(preloadingGen)
def main():
    """Scan new English-Wikipedia articles and sync them with Wikidata.

    For each new mainspace page that looks like a biography, either
    completes the existing Wikidata item (P31 "instance of" / P21
    "sex or gender") or searches for a matching item by label and birth
    year, creating a new item when nothing suitable is found.  Depends on
    helpers defined elsewhere in this file: ``pageIsBiography``,
    ``calculateGender``, ``addHumanClaim``, ``addGenderClaim``,
    ``authorIsNewbie``, ``pageIsRubbish``, ``pageCategories``,
    ``pageReferences`` and ``getURL``.
    """
    lang = 'en'
    wikisite = pywikibot.Site(lang, 'wikipedia')
    wdsite = pywikibot.Site('wikidata', 'wikidata')
    repo = wdsite.data_repository()
    # Number of new pages to examine; may be overridden on the command line.
    total = 100
    if len(sys.argv) >= 2:
        total = int(sys.argv[1])
    gen = pagegenerators.NewpagesPageGenerator(site=wikisite,
                                               namespaces=[0],
                                               total=total)
    pre = pagegenerators.PreloadingGenerator(gen, groupsize=50)
    for page in pre:
        if page.isRedirectPage():
            continue
        if not pageIsBiography(page=page):
            continue
        print('\n==', page.title().encode('utf-8'), '==')
        gender = calculateGender(page=page)
        item = ''
        try:
            item = pywikibot.ItemPage.fromPage(page)
        except:
            # NOTE(review): bare except silently hides all errors while
            # looking up an existing item -- consider narrowing.
            pass
        if item:
            print('Page has item')
            try:
                item.get()
            except:
                print('Error while retrieving item, skiping...')
                continue
            p31 = ''
            p21 = ''
            claims = item.claims
            if claims:
                if 'P31' in item.claims:
                    p31 = item.claims['P31'][0].getTarget()
                if 'P21' in item.claims:
                    p21 = item.claims['P21'][0].getTarget()
            print(page.title().encode('utf-8'), item, gender, p31, p21)
            # Fill in the missing claims on the existing item.
            if not p31:
                addHumanClaim(repo=repo, item=item)
            if not p21:
                addGenderClaim(repo=repo, item=item, gender=gender)
        else:
            print('Page without item')
            #search for a valid item, otherwise create
            if authorIsNewbie(page=page):
                # Skip low-quality newbie creations: rubbish content, no
                # categories, no references, or nothing linking to them.
                if pageIsRubbish(page=page) or \
                   (not pageCategories(page=page)) or \
                   (not pageReferences(page=page)) or \
                   (not len(list(page.getReferences(namespaces=[0])))):
                    continue

            print(page.title().encode('utf-8'), 'need item', gender)
            wtitle = page.title()
            # Drop any parenthesised disambiguator from the title.
            wtitle_ = wtitle.split('(')[0].strip()
            searchitemurl = 'https://www.wikidata.org/wiki/Special:ItemDisambiguation?language=&label=%s' % (
                urllib.parse.quote(wtitle_))
            raw = getURL(searchitemurl)

            if 'Sorry, no item with that label was found' in raw:
                print('No useful item found. Creating a new one...')
                #create item
                newitemlabels = {lang: wtitle_}
                newitem = pywikibot.ItemPage(repo)
                newitem.editLabels(
                    labels=newitemlabels,
                    summary="BOT - Creating item for [[:%s:%s|%s]] (%s): %s %s"
                    % (lang, wtitle, wtitle, lang, 'human', gender))
                newitem.get()
                addHumanClaim(repo=repo, item=newitem)
                addGenderClaim(repo=repo, item=newitem, gender=gender)
                newitem.setSitelink(
                    page,
                    summary='BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)' %
                    (lang, page.title(), page.title(), lang))
            else:
                print(searchitemurl.encode('utf-8'))
                #check birthdate and if it matches add interwiki
                m = re.findall(
                    r'<li class="wikibase-disambiguation"><a title="(Q\d+)"',
                    raw)
                # More than three candidates: label too ambiguous, skip.
                if len(m) > 3:
                    continue
                for itemfoundq in m:
                    itemfound = pywikibot.ItemPage(repo, itemfoundq)
                    itemfound.get()
                    if ('%swiki' % (lang)) in itemfound.sitelinks:
                        continue
                    if 'P569' in itemfound.claims:
                        birthyear = itemfound.claims['P569'][0].getTarget(
                        ).year
                        # Accept the candidate only when its birth year (P569)
                        # matches an explicit birth category in the page text.
                        if birthyear and re.search(
                                r'(?i)\[\[ *Category *\: *%s births *\]\]' %
                            (birthyear), page.text):
                            print(
                                '%s birthyear found in item. Category:%s births found in page'
                                % (birthyear, birthyear))
                            print('Adding sitelink %s:%s' %
                                  (lang, page.title().encode('utf-8')))
                            itemfound.setSitelink(
                                page,
                                summary=
                                'BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)' %
                                (lang, page.title(), page.title(), lang))
                            if not 'P31' in itemfound.claims:
                                addHumanClaim(repo=repo, item=itemfound)
                            if not 'P21' in itemfound.claims:
                                addGenderClaim(repo=repo,
                                               item=itemfound,
                                               gender=gender)
                            break
def main():
    """Multi-language variant: sync new biographies with Wikidata.

    Iterates new mainspace pages on several Wikipedias; for each page that
    looks like a biography, either updates the linked item via
    ``addBiographyClaims`` or searches Wikidata by label, matching
    candidates on birth year (P569) before linking, and creates a fresh
    item when no candidate fits.  Depends on helpers defined elsewhere in
    this file: ``pageIsBiography``, ``calculateGender``,
    ``calculateBirthDate``, ``authorIsNewbie``, ``pageIsRubbish``,
    ``pageCategories``, ``pageReferences``, ``addBiographyClaims`` and
    ``getURL``.
    """
    wdsite = pywikibot.Site('wikidata', 'wikidata')
    repo = wdsite.data_repository()
    langs = ['en', 'fr', 'de']
    for lang in langs:
        wikisite = pywikibot.Site(lang, 'wikipedia')
        # Pages per language; may be overridden on the command line.
        total = 100
        if len(sys.argv) >= 2:
            total = int(sys.argv[1])
        gen = pagegenerators.NewpagesPageGenerator(site=wikisite,
                                                   namespaces=[0],
                                                   total=total)
        #cat = pywikibot.Category(wikisite, 'Category:Articles without Wikidata item')
        #gen = pagegenerators.CategorizedPageGenerator(cat, recurse=False)
        pre = pagegenerators.PreloadingGenerator(gen, groupsize=50)
        for page in pre:
            if page.isRedirectPage():
                continue
            if not pageIsBiography(page=page, lang=lang):
                continue
            print('\n==', page.title().encode('utf-8'), '==')
            gender = calculateGender(page=page, lang=lang)
            item = ''
            try:
                item = pywikibot.ItemPage.fromPage(page)
            except:
                # NOTE(review): bare except silently hides all errors while
                # looking up an existing item -- consider narrowing.
                pass
            if item:
                print('Page has item')
                print('https://www.wikidata.org/wiki/%s' % (item.title()))
                addBiographyClaims(repo=repo,
                                   wikisite=wikisite,
                                   item=item,
                                   page=page,
                                   lang=lang)
            else:
                print('Page without item')
                #search for a valid item, otherwise create
                if authorIsNewbie(page=page, lang=lang):
                    print("Newbie author, checking quality...")
                    # Skip low-quality newbie creations: rubbish content, no
                    # categories, no references, or nothing linking to them.
                    if pageIsRubbish(page=page, lang=lang) or \
                       (not pageCategories(page=page, lang=lang)) or \
                       (not pageReferences(page=page, lang=lang)) or \
                       (not len(list(page.getReferences(namespaces=[0])))):
                        print("Page didnt pass minimum quality, skiping")
                        continue

                print(page.title().encode('utf-8'), 'need item', gender)
                wtitle = page.title()
                # Drop any parenthesised disambiguator from the title.
                wtitle_ = wtitle.split('(')[0].strip()
                searchitemurl = 'https://www.wikidata.org/w/api.php?action=wbsearchentities&search=%s&language=%s&format=xml' % (
                    urllib.parse.quote(wtitle_), lang)
                raw = getURL(searchitemurl)
                print(searchitemurl.encode('utf-8'))

                #check birthdate and if it matches, then add data
                numcandidates = ''  #do not set to zero
                if not '<search />' in raw:
                    m = re.findall(r'id="(Q\d+)"', raw)
                    numcandidates = len(m)
                    print("Found %s candidates" % (numcandidates))
                    if numcandidates > 5:  #too many candidates, skiping
                        print("Too many, skiping")
                        continue
                    for itemfoundq in m:
                        itemfound = pywikibot.ItemPage(repo, itemfoundq)
                        itemfound.get()
                        if ('%swiki' % (lang)) in itemfound.sitelinks:
                            print("Candidate %s has sitelink, skiping" %
                                  (itemfoundq))
                            numcandidates -= 1
                            continue
                        pagebirthyear = calculateBirthDate(page=page,
                                                           lang=lang)
                        # Keep only the year component, or '' when absent.
                        pagebirthyear = pagebirthyear and int(
                            pagebirthyear.split('-')[0]) or ''
                        if not pagebirthyear:
                            print("Page doesnt have birthdate, skiping")
                            break  #break, dont continue. Without birthdate we cant decide correctly
                        if 'P569' in itemfound.claims and itemfound.claims[
                                'P569'][0].getTarget().precision in [
                                    9, 10, 11
                                ]:
                            #https://www.wikidata.org/wiki/Help:Dates#Precision
                            itemfoundbirthyear = int(
                                itemfound.claims['P569'][0].getTarget().year)
                            print(
                                "candidate birthdate = %s, page birthdate = %s"
                                % (itemfoundbirthyear, pagebirthyear))
                            # Both years must be exactly 4 digits to compare.
                            mindatelen = 4
                            if len(str(
                                    itemfoundbirthyear)) != mindatelen or len(
                                        str(pagebirthyear)) != mindatelen:
                                print("%s birthdate length != %s" %
                                      (itemfoundq, mindatelen))
                                continue
                            #reduce candidates if birthyear are different
                            minyeardiff = 3
                            if itemfoundbirthyear >= pagebirthyear + minyeardiff or itemfoundbirthyear <= pagebirthyear - minyeardiff:
                                print(
                                    "Candidate %s birthdate out of range, skiping"
                                    % (itemfoundq))
                                numcandidates -= 1
                                continue
                            #but only assume it is the same person if birthyears match
                            if itemfoundbirthyear == pagebirthyear:
                                print(
                                    '%s birthyear found in candidate %s. Category:%s births found in page. OK!'
                                    % (itemfoundbirthyear, itemfoundq,
                                       itemfoundbirthyear))
                                print('Adding sitelink %s:%s' %
                                      (lang, page.title().encode('utf-8')))
                                try:
                                    itemfound.setSitelink(
                                        page,
                                        summary=
                                        'BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)'
                                        % (lang, page.title(), page.title(),
                                           lang))
                                except:
                                    print("Error adding sitelink. Skiping.")
                                    break
                                addBiographyClaims(repo=repo,
                                                   wikisite=wikisite,
                                                   item=itemfound,
                                                   page=page,
                                                   lang=lang)
                                break

                #no item found, or no candidates are useful
                if '<search />' in raw or (numcandidates == 0):
                    print('No useful item found. Creating a new one...')
                    #create item
                    newitemlabels = {lang: wtitle_}
                    newitem = pywikibot.ItemPage(repo)
                    newitem.editLabels(
                        labels=newitemlabels,
                        summary=
                        "BOT - Creating item for [[:%s:%s|%s]] (%s): %s %s" %
                        (lang, wtitle, wtitle, lang, 'human', gender))
                    newitem.get()
                    try:
                        newitem.setSitelink(
                            page,
                            summary=
                            'BOT - Adding 1 sitelink: [[:%s:%s|%s]] (%s)' %
                            (lang, page.title(), page.title(), lang))
                    except:
                        print("Error adding sitelink. Skiping.")
                        break
                    addBiographyClaims(repo=repo,
                                       wikisite=wikisite,
                                       item=newitem,
                                       page=page,
                                       lang=lang)
Example #9
0
def newPages(all=False):
    """Tag new (or all) articles with a {{Maintenance}} banner (Python 2).

    Scans either every mainspace article (``all=True``) or the 50 newest
    pages, determines which maintenance jobs apply (orphelin/orphan,
    catégoriser/categorize, portail/portal, impasse/dead end) and updates
    the maintenance banner when the job list changed.  Returns a
    wiki-formatted log of the edits.  Uses module-level names not visible
    in this block: ``site``, ``nbrModif``, ``nbrTotal``, ``removeBanner``,
    ``updateJobList`` and ``callback``.  Note: the parameter name ``all``
    shadows the builtin of the same name.
    """
    global nbrModif, nbrTotal

    log = u''

    #BUGFIX
    # NOTE(review): empty save (null edit) on the bot's user page --
    # presumably works around a session/cache issue; confirm still needed.
    bugfixPage = pywikibot.Page(site,u"Utilisateur:LinedBot")
    bugfixPage.save('')
    #END OF FIX


    # Categories used to classify pages: disambiguation, stubs (3 levels
    # deep), hidden categories, portal lists, bot-ignore list, contest.
    homonCat =  pywikibot.Category(site,u"Homonymie")

    ebaucheCat = pywikibot.Category(site,u"Ébauche")
    ebaucheCat = set(ebaucheCat.subcategories(recurse=3))

    hiddenCat = pywikibot.Category(site,u"Catégorie cachée")
    hiddenCat = set(hiddenCat.subcategories())

    portalCat = pywikibot.Category(site,u"Liste d'articles")
    portalCat = set(portalCat.subcategories())

    ignoreCat = pywikibot.Category(site,u"Page ignorée par les robots")

    concoursCat = pywikibot.Category(site,u"Article VikiConcours")

    # Materialise the dead-end and orphan page lists once up front.
    deadendPagesList = list(pagegenerators.DeadendPagesPageGenerator(site=site))
    lonelyPagesList = list(pagegenerators.LonelyPagesPageGenerator(site=site))


    if all:
        pagesList = pagegenerators.AllpagesPageGenerator(namespace=0,includeredirects=False,site=site)
    else:
        pagesList = pagegenerators.NewpagesPageGenerator(total=50,site=site)

    for page in pagesList:

        try:
            pageTemp = page.get()

        except pywikibot.NoPage:
            pywikibot.output(u"Page %s does not exist; skipping."
                             % page.title(asLink=True))
        except pywikibot.IsRedirectPage:
            pywikibot.output(u"Page %s is a redirect; skipping."
                             % page.title(asLink=True))
        except pywikibot.LockedPage:
            pywikibot.output(u"Page %s is locked; skipping."
                             % page.title(asLink=True))
        else:


            # Only handle the page if it is neither a disambiguation page
            # nor a VikiConcours (contest) article.
            pageCat = page.categories()
            if (not homonCat in pageCat) and (not concoursCat in pageCat):

                # Only compute banner jobs when the page is not in the
                # bot-ignore category.
                jobList = []
                if not ignoreCat in pageCat:

                    # Orphan job: the page appears in the lonely-pages list.
                    if page in lonelyPagesList:
                        jobList.append(u'orphelin')

                    # Categorize job: no real category remains after removing
                    # hidden and stub categories.
                    realCat = list(set(pageCat) - set(hiddenCat) - set(ebaucheCat))

                    nbCat = len(list(realCat))
                    if nbCat == 0:
                        jobList.append(u'catégoriser')

                    # Portal job: the page belongs to no portal category.
                    nbPort = len(set(pageCat) & set(portalCat))
                    if nbPort == 0:
                        jobList.append(u'portail')


                    # Dead-end job: the page appears in the dead-end list.
                    if page in deadendPagesList:
                        jobList.append(u'impasse')


                    """
                    # si la page fait plus de 2000 octets et ne contient aucun lien externe
                    if len(pageTemp) > 2000 and len(list(page.extlinks())) == 0:
                        jobList.append(u'sourcer')
                    """

                else:
                    print u'Skipping [[' + page.title() + ']], page in ignore list.'


                # Merge the computed jobs with those already on the banner.
                pageTemp, oldJobList = removeBanner(pageTemp)
                jobList = updateJobList(oldJobList, jobList)
                job = u''


                # Symmetric difference between the two lists -- detect any
                # element present in only one of them: (A-B)+(B-A).
                diff = list(set(oldJobList).symmetric_difference(set(jobList)))

                if diff != []:
                    nbrTotal += 1
                    if len(jobList) > 0:
                        job = ','.join(jobList)
                        banner = u'{{Maintenance|job=' + job + '|date=~~~~~}}\n\n'
                        pageTemp = banner + pageTemp
                        summary = u'[[VD:Robot|Robot]] : Mise à jour du bandeau de maintenance.'
                    else:
                        summary = u'[[VD:Robot|Robot]] : Retrait du bandeau de maintenance.'

                    c = callback.Callback()
                    page.text = pageTemp
                    page.save(summary,callback=c)

                    if c.error == None:
                        nbrModif += 1

                    log +=u'*' + '{{Utilisateur:LinedBot/ExtLinker|' + page.title() + u'}} : Mise à jour du bandeau {{m|maintenance}} avec les paramètres suivants : ' + job + '\n'

    return log
Example #10
0
def get_test_unconnected_page(site):
    """Return the first page produced by a single-page newpages generator."""
    return next(pagegenerators.NewpagesPageGenerator(site=site, total=1))