Beispiel #1
0
def get_wiki_from_iso_redirect():
    import pywikibot
    site = pywikibot.Site('en', 'wikipedia')
    for item in LanguageIsoWiki.objects.all():
        content = item.content
        # print content
        m = re.search('^#redirect ?\[\[(?P<redirect>[^]]+)\]\]', content, re.IGNORECASE)
        # if not m:
        #     print content
        if m:
            redirect = m.group('redirect')
            print redirect

            if item.redirect:
                print "×", "already exists"
                continue

            page = pywikibot.Page(site, redirect)
            try:
                content_redirect = page.get(get_redirect=True)
                content_redirect = remove_utf8mb4(content_redirect)
                print "→", "ok"
            except pywikibot.exceptions.NoPage:
                content_redirect = ""
                print "#", "doesn't exits!", "#" * 200

            item.redirect = redirect
            item.content_redirect = content_redirect
            item.save()
Beispiel #2
0
    def get_or_load(self, title):
        try:
            return self.get(title=title)
        except ObjectDoesNotExist:
            import pywikibot

            site = pywikibot.getSite(self.lang, "wikipedia")
            page = pywikibot.Page(site, title)
            try:
                content = page.get(get_redirect=True)
                content = remove_utf8mb4(content)
                print "creating NEW wiki:", title
                wiki = self.create(title=title, exists=True)
                if page.isRedirectPage():
                    print " process redirect:", content
                    m = re.search(
                        u"^#(перенаправление|redirect)[:\s]*\[\[(?P<redirect>[^]]+)\]\]", content.strip(), re.IGNORECASE
                    )
                    if not m:
                        raise Exception("Can't parse redirect")
                    redirect = m.group("redirect")
                    wiki.redirect_to = self.get_or_load(redirect)
                    wiki.save()
                self.content_model.objects.create(wiki=wiki, content=content)
            except pywikibot.exceptions.NoPage:
                print "creating EMPTY wiki:", title
                wiki = self.create(title=title, exists=False)
                self.content_model.objects.create(wiki=wiki)
            return wiki
Beispiel #3
0
 def get_data(self, item):
     """Collect the text, DB-safe text, edit time and redirect flag of a page.

     Returns a 4-tuple ``(content, content_to_db, edited, redirect)``; when
     any of the page calls raises ``NoPage`` the tuple is four ``None``
     values instead.
     """
     try:
         raw_text = item.get(get_redirect=True)
         result = (
             raw_text,
             remove_utf8mb4(raw_text),
             aware(convert_wiki_date(item.editTime())),
             item.isRedirectPage(),
         )
     except NoPage:
         result = (None, None, None, None)
     return result
Beispiel #4
0
def get_eng_languages_from_wiki():
    import pywikibot
    site = pywikibot.Site('en', 'wikipedia')
    category = pywikibot.Category(site, u"Category:Languages_with_ISO_639-2_code")
    for item in category.articles():
        title = item.title()
        print title
        content = item.get()
        content = remove_utf8mb4(content)
        LanguageRuWiki.objects.create(ru_cat=title, wiki_lang='en',
                                      content=content)
Beispiel #5
0
def get_wiki_from_iso():
    import pywikibot
    site = pywikibot.Site('en', 'wikipedia')
    for item in LanguageIso.objects.all():
        lang = item.lang
        print lang
        page = pywikibot.Page(site, u"ISO_639:%s" % lang)
        try:
            content = page.get(get_redirect=True)
            content = remove_utf8mb4(content)
            LanguageIsoWiki.objects.create(lang=lang, content=content)
            print "→", "created"
        except pywikibot.exceptions.NoPage:
            LanguageIsoWiki.objects.create(lang=lang, content="")
            print "#", "doesn't exits!", "#" * 200
Beispiel #6
0
def get_rus_languages_from_wiki():
    import pywikibot
    site = pywikibot.Site('ru', 'wikipedia')
    category = pywikibot.Category(site, u"Категория:Языки_и_диалекты_по_алфавиту")
    skip = True
    for item in category.articles():
        title = item.title()
        if title == u"Готский язык":
            skip = False
        if skip:
            print u'×', title
            continue
        print title
        content = item.get()
        content = remove_utf8mb4(content)
        LanguageRuWiki.objects.create(ru_cat=title, content=content)
Beispiel #7
0
def get_eng_languages_from_wikt_wiki():
    import pywikibot
    site = pywikibot.Site('en', 'wikipedia')
    for item in LanguageData.objects.all():
        title = item.en_cat
        print title
        data, created = LanguageRuWiki.objects.get_or_create(ru_cat=title,
                                                             wiki_lang='en')
        if created:
            page = pywikibot.Page(site, title)
            try:
                content = page.get(get_redirect=True)
            except pywikibot.exceptions.NoPage:
                print "# doesn't exits!"
                continue
            content = remove_utf8mb4(content)
            data.content = content
            data.save()
            print '→ created!'
        else:
            print '× already exist'
Beispiel #8
0
    # NOTE(review): this is the interior of a loop whose header is not visible
    # here; `item`, `orig_title`, `prefix` and `first_edited` come from the
    # surrounding code -- confirm against the full source.
    # Only titles that start with `prefix` are processed.
    if not orig_title.startswith(prefix):
        continue
    title = orig_title[len(prefix) :]
    print title
    # The bare "сущ" title is skipped explicitly.
    if title == u"сущ":
        continue

    # Accept only titles of the form "сущ/<f|m|n><digit>/<digits>".
    if not re.match(u"^сущ/[fmn]\d/\d+$", title):
        continue

    # print 'ok'
    # continue
    # Previously stored copy of this page (presumably a Django lookup that
    # raises when no row matches -- confirm).
    old = WordInflectionMassEdit.objects.get(title=title)
    old_content = old.content

    # Current wiki text, passed through remove_utf8mb4, and its edit time.
    new_content = remove_utf8mb4(item.get(get_redirect=True))
    edited = aware(convert_wiki_date(item.editTime()))
    print edited

    update_content = new_content
    if new_content != old_content:
        # Presumably these templates mark a page that is still being edited;
        # processing of the whole loop stops here (break) -- TODO confirm.
        if u"{{пишу}}" in new_content or u"{{message box" in new_content:
            print u"×" * 20, u"{{пишу}} detected"
            first_edited = None
            break
        changing_started = False
        old_items, old_reports = parse_mass_edit(old_content)
        new_items, new_reports = parse_mass_edit(new_content)
        # Both versions must contain exactly the same set of item keys;
        # otherwise this page is skipped.
        if len(old_items) != len(new_items) or set(old_items.keys()) != set(new_items.keys()):
            print u"×" * 20, "WRONG DIFFERENCE IN TITLES (COUNTS OR VALUES)"
            continue
Beispiel #9
0
def load_missed_pages():
    print 'load_missed_pages()'
    g = AllpagesPageGenerator(start=u"!")
    i = 0
    j = 0
    redirects = []
    edits = []
    contents = []
    # pages = []
    print dt(), 'starting cycle?'
    for item in g:
        j += 1
        if not j % 1000:
            print dt(), 'j =', j
        if j < 178000:
            continue
        #print item
        title = item.title()
        if ':' in title:
            print dt(), ':' * 20, title
            # continue
        # print title
        try:
            # print dt(), title
            if title != remove_utf8mb4(title):
                print dt(), title, '#' * 30, 'UTF8-MB4'
                continue
            Page.objects.get(title=title)
        # except MultipleObjectsReturned:
        #     print '#' * 10, 'MULTIPLE'
        except ObjectDoesNotExist:
            print dt(), title, '(ok)'
            try:
                content = item.get(get_redirect=True)
            except NoPage:
                print dt(), '#' * 30, 'NO_PAGE'
                continue
            content = remove_utf8mb4(content)
            redirect = item.isRedirectPage()
            # edited = item.editTime()
            edited = convert_wiki_date(item.editTime())
            # edited = make_naive(wiki_page.editTime(), pytz.UTC)
            edited = make_aware(edited, pytz.UTC)
            # edited = make_aware(item.editTime(), pytz.UTC)
            # edited = make_aware(item.editTime(), None)
            # print edited
            page = Page.objects.create(title=title)
            PageContent.objects.create(page=page, content=content)
            PageRedirect.objects.create(page=page, redirect=redirect)
            PageEdited.objects.create(page=page, edited=edited)

            # contents.append(PageContent(page=page, content=content))
            # if len(contents) > 100:
            #     PageContent.objects.bulk_create(contents)
            #     contents = []
            #     print '#' * 30, 'contents added'
            #
            # redirects.append(PageRedirect(page=page, redirect=redirect))
            # if len(redirects) > 1000:
            #     PageRedirect.objects.bulk_create(redirects)
            #     redirects = []
            #     print '#' * 30, 'redirects added'
            #
            # edits.append(PageEdited(page=page, edited=edited))
            # if len(edits) > 1000:
            #     PageEdited.objects.bulk_create(edits)
            #     edits = []
            #     print '#' * 30, 'edits added'

            # break
            i += 1
            if not i % 100:
                print dt(), '*' * 20, i
                # PageContent.objects.bulk_create(pages)
                # pages = []
                sleep(1)
Beispiel #10
0
def update_missed_data():
    print 'update_missed_data()'
    i = 0
    redirects = []
    edits = []
    for page in Page.objects.iterate():
        i += 1
        if i < 35000:
            continue
        if not i % 1000:
            print dt(), 'i =', i
        need_content = need_redirect = need_edited = False
        try:
            PageContent.objects.get(page=page)
        except ObjectDoesNotExist:
            need_content = True
        try:
            PageEdited.objects.get(page=page)
        except ObjectDoesNotExist:
            need_edited = True
        try:
            PageRedirect.objects.get(page=page)
        except ObjectDoesNotExist:
            need_redirect = True
        if need_content or need_edited or need_redirect:
            print dt(), page.title
            site = pywikibot.Site('ru')
            # wiki_page = pywikibot.Page(site, page.title.decode('utf-8'))
            wiki_page = pywikibot.Page(site, page.title)
            try:
                if need_content:
                    content = wiki_page.get(get_redirect=True)
                    content = remove_utf8mb4(content)
                    PageContent.objects.create(page=page, content=content)
                    print '- contents added'
                if need_redirect:
                    redirect = wiki_page.isRedirectPage()
                    PageRedirect.objects.create(page=page, redirect=redirect)
                    print '- redirect added'
                    # redirects.append(PageRedirect(page=page,
                    #                                redirect=redirect))
                    # if len(redirects) > 1000:
                    #     PageRedirect.objects.bulk_create(redirects)
                    #     redirects = []
                    #     print '#' * 30, 'redirects added'
                if need_edited:
                    edited = convert_wiki_date(wiki_page.editTime())
                    # edited = make_naive(wiki_page.editTime(), pytz.UTC)
                    # edited = make_aware(wiki_page.editTime(), None)
                    edited = make_aware(edited, pytz.UTC)
                    PageEdited.objects.create(page=page, edited=edited)
                    print '- edited added'
                    # edits.append(PageEdited(page=page, edited=edited))
                    # if len(edits) > 1000:
                    #     PageEdited.objects.bulk_create(edits)
                    #     edits = []
                    #     print '#' * 30, 'edits added'
            except NoPage:
                print dt(), '#' * 20, page.title, 'DELETED?'  # todo: remove?
                content = page.content if not need_content else ''
                redirect = page.page_redirect.redirect if not need_redirect else None
                edited = page.page_edited.edited if not need_edited else None
                PageDeleted.objects.create(page=page.pk, title=page.title,
                                            content=content, redirect=redirect,
                                            edited=edited)
                # if not need_content:
                #     page.page_content.delete()
                page.delete()

    PageRedirect.objects.bulk_create(redirects)
    PageEdited.objects.bulk_create(edits)
Beispiel #11
0
def process_recent():
    print 'process_recent()'
    i = 0
    edited = PageEdited.objects.order_by('-edited')[0]
    print 'updating untill:', edited.edited
    end = datetime(edited.edited.year,
                   edited.edited.month,
                   edited.edited.day,
                   edited.edited.hour,
                   # edited.edited.minute,
                   )
    g = RecentChangesPageGenerator(end=end)
    for item in g:
        title = item.title()
        if ':' in title:
            continue
        i += 1
        # print title
        # print repr(title)
        # print repr(title.encode('utf-8'))
        # print repr(title.encode('utf-16be'))
        # print repr(title.encode('utf-32be'))
        # # print repr(title.encode('utf-16').decode('utf-16'))
        # page = Page.objects.create(title=title)
        # page = Page.objects.create(title=title.encode('utf-16be'))
        # page = Page.objects.create(title=title.encode('utf-16'))
        # page = Page.objects.create(title=title.encode('utf-8'))
        # # page = Page.objects.create(title=title.encode('utf-8'))
        # # page, created = Page.objects.get_or_create(title=title)
        # continue

        try:
            page, created = Page.objects.get_or_create(title=title)
        except Exception:
            print dt(), title, '-', '@' * 30, 'bad title'
            continue

        try:
            edited = convert_wiki_date(item.editTime())
            edited = make_aware(edited, pytz.UTC)
        except NoPage:
            print dt(), title, '-', 'DELETED', '#' * 20
            # todo: А если страница еще не была добавлена в БД? Т.е. создание/удаление в пределах одной проверки
            content, created = PageContent.objects.get_or_create(page=page)
            edited, created = PageEdited.objects.get_or_create(page=page)
            redirect, created = PageRedirect.objects.get_or_create(page=page)
            PageDeleted.objects.create(page=page.pk, title=page.title,
                                        content=content.content,
                                        edited=edited.edited,
                                        redirect=redirect.redirect)
            page.delete()
            continue
        content = item.get(get_redirect=True)
        content = remove_utf8mb4(content)
        redirect = item.isRedirectPage()

        print dt(), edited, '-', i, '-', title, '| pk =', page.pk
        if created:
            print '- page CREATED: %s' % page.pk

        # try:
        #     page = Page.objects.get(title__exact=title)
        # except ObjectDoesNotExist:
        #     page = Page.objects.create(title=title)

        data, created = PageContent.objects.get_or_create(page=page)
        if data.content != content:
            data.content = content
            data.save()
            print '- contents updated'

        data, created = PageRedirect.objects.get_or_create(page=page)
        if data.redirect != redirect:
            data.redirect = redirect
            data.save()
            print '- redirect updated'

        data, created = PageEdited.objects.get_or_create(page=page)
        if data.edited != edited:
            data.edited = edited
            data.save()
            print '- edited updated'