Exemple #1
0
def load_templates():
    for category_name in category_names:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            prefix = u'Шаблон:'
            title = article.title()
            if not title.startswith(prefix):
                print title, '-', 'BAD!', '#' * 40
                continue
            title = title[len(prefix):]
            content = article.get()
            print title
            edited = convert_wiki_date(article.editTime())
            edited = make_aware(edited, pytz.UTC)  # tmp
            try:
                template = TemplateLabel.objects.get(title=title)
                template.edited = edited
                template.category = category_name
                template.content = content
                template.save()
            except ObjectDoesNotExist:
                TemplateLabel.objects.create(
                    title=title,
                    edited=edited,
                    category=category_name,
                    content=content,
                )
Exemple #2
0
def load_redirects():
    prefix = u'Шаблон:'
    for category_name in category_names:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            redirect = article.title()[len(prefix):]
            for page in article.backlinks(filterRedirects=True):
                # print page.title(), '->', article.title()
                title = page.title()
                if not title.startswith(prefix):
                    print title, '-', 'BAD!', '#' * 40
                    continue
                title = title[len(prefix):]
                content = page.get(get_redirect=True)
                print title
                edited = convert_wiki_date(page.editTime())
                edited = make_aware(edited, pytz.UTC)  # tmp
                try:
                    template = TemplateLabel.objects.get(title=title)
                    template.edited = edited
                    template.category = category_name
                    template.content = content
                    template.redirect = redirect
                    template.save()
                except ObjectDoesNotExist:
                    TemplateLabel.objects.create(
                        title=title,
                        edited=edited,
                        category=category_name,
                        content=content,
                        redirect=redirect,
                    )
Exemple #3
0
 def get_data(self, item):
     """Fetch text, DB-safe text, edit time and redirect flag of a wiki page.

     Returns a 4-tuple ``(content, content_to_db, edited, redirect)``; all
     four values are ``None`` when any of the page lookups raises ``NoPage``.
     """
     try:
         raw_text = item.get(get_redirect=True)
         db_text = remove_utf8mb4(raw_text)
         last_edit = aware(convert_wiki_date(item.editTime()))
         is_redirect = item.isRedirectPage()
     except NoPage:
         return None, None, None, None
     return raw_text, db_text, last_edit, is_redirect
Exemple #4
0
def load_templates_contents():
    # for template in TemplateInflection.objects.all():
    for template in TemplateInflection.objects.filter(content__isnull=True):
        title = template.title
        print title
        # continue
        article = pywikibot.Page(site, u"Шаблон:%s" % title)
        content = article.get()
        edited = convert_wiki_date(article.editTime())
        edited = make_aware(edited, pytz.UTC)
        template.content = content
        template.edited = edited
        template.save()
Exemple #5
0
def load_templates():
    category_names = [
        u"Категория:Шаблоны словоизменений/Глаголы/Возвратные глаголы",
        u"Категория:Шаблоны словоизменений/Глаголы/Невозвратные глаголы",
        u"Категория:Шаблоны словоизменений/Глаголы/Несовершенный вид",
        u"Категория:Шаблоны словоизменений/Глаголы/Совершенный вид",
        u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые",
        u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые/Мужской род",
        u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые/Женский род",
        u"Категория:Шаблоны словоизменений/Существительные/Одушевлённые/Средний род",
        u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые",
        u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые/Мужской род",
        u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые/Женский род",
        u"Категория:Шаблоны словоизменений/Существительные/Неодушевлённые/Средний род",
        u"Категория:Шаблоны словоизменений/Прилагательные",
        u"Категория:Шаблоны словоизменений/Причастия",
        u"Категория:Шаблоны словоизменений/Числительные",
        u"Категория:Шаблоны словоизменений/Фамилии",
        u"Категория:Шаблоны словоизменений/Местоимения",
    ]

    for category_name in category_names:
        morph = category_name.split('/')[1]
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            prefix = u'Шаблон:'
            # Шаблон:прил ru
            title = article.title()
            if not title.startswith(prefix):
                print title, '-', 'BAD!'
                continue
            title = title[len(prefix):]
            content = article.get()
            print title
            edited = convert_wiki_date(article.editTime())
            edited = make_aware(edited, pytz.UTC)  # tmp
            try:
                TemplateInflection.objects.get(title=title,
                                               content=content,
                                               edited=edited,
                                               morph=morph)
                print '#' * 20, '-> ALREADY EXISTS'
            except ObjectDoesNotExist:
                TemplateInflection.objects.create(title=title,
                                                  content=content,
                                                  edited=edited,
                                                  category=category_name,
                                                  morph=morph)
Exemple #6
0
 def process_potentially_new_item(self, item, i):
     try:
         title = item.title()
     except InvalidTitle:
         print 'Wrong title', '#' * 120
         return
     if ':' in title:
         return
     # try:
     #     page = Page.objects.get(title=title)
     # except (Page.DoesNotExist, _mysql_exceptions.Warning):
     #     return
     content, content_to_db, edited, redirect = self.get_data(item)
     if not edited:
         return
     if not self.output_interval or not i % self.output_interval:
         print dt(),
         if ':' in title:
             print ':',
         else:
             print '#',
         print edited, '-', i, '-',
         if ':' in title:
             print ':',
         else:
             print '#',
         print transliterate(title), ' // pk=%s' % '?'  # page.pk
     # if item.previous_revision_id == -1 and u'{{-ru-' in content:
     if u'{{-ru-' in content:
         created_at = convert_wiki_date(item.oldest_revision.timestamp)
         print transliterate(title), created_at
         # if created_at + timedelta(hours=5) > datetime.now():
         if created_at + timedelta(days=7) > datetime.now():
             if title not in self.titles and item.oldest_revision.user != 'CinBot':
                 print
                 print '-' * 100
                 print '|', transliterate(item.oldest_revision.user), edited
                 print '|', transliterate(title)
                 print '-' * 100
                 print
                 self.items.append((title, item.oldest_revision.user, created_at))
                 self.titles.append(title)
     if ':' in title:
         return
     return edited
def process_template(article, lang):
    prefix = u'Шаблон:'
    title = article.title()
    if not title.startswith(prefix):
        print title, '-', 'BAD!', 'BAD!', '#' * 100
        return
    title = title[len(prefix):]
    print title
    article = pywikibot.Page(site, u"Шаблон:%s" % title)
    content = article.get()
    edited = convert_wiki_date(article.editTime())
    edited = make_aware(edited, pytz.UTC)

    words = title.split(' ')
    morph = words[0]
    if morph not in [u'adv', u'conj', u'interj', u'гл', u'глагол',
                     u'мест', u'прил', u'сущ', u'числ', u'падежи',
                     u'prep', u'affix', u'intro', u'phrase', u'suffix',
                     u'predic', u'склонение', u'part',
                     u'артикль', u'article', u'арт', u'деепр', u'onomatop',
                     u'interj1', u'прич', u'герундий',
                     u'склон', u'степени',
                     u'междом',
                     u'спряжения', u'спряжение', u'словоизм', u'сущ2',
                     u'принад', u'palat', u'abbrev', u'measure',
                     u'morph', u'prefix', u'ein',
                     u'союз', u'словоформы', u'глаг', u'послел',
                     u'послелог', u'падежи-мест', u'нар', u'морфема',
                     u'межд', ]:
        print u'm →', title
        return
    prefix = u"%s %s" % (morph, lang)
    if not title.startswith(prefix):
        print u'e →', title, '(%s)' % lang
        return
    info = title[len(prefix):].strip()

    kind, gender, num = parse_template_title(title)

    return TemplateInflection(
        title=title, content=content, edited=edited, lang=lang,
        morph=morph, info=info, kind=kind, gender=gender, num=num,
    )
Exemple #8
0
def load_templates():
    import pywikibot
    site = pywikibot.Site('ru')
    category_names = [
        u"Категория:Викисловарь:Шаблоны:Языки",
        u"Категория:Шаблоны:Языковые_заголовки",
    ]
    for category_name in category_names:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            title = article.title()
            m = re.match(u'^Шаблон:-([^-].*)-$', title)
            if m:
                lang = m.group(1)
                print lang
                content = article.get(get_redirect=True)
                edited = aware(convert_wiki_date(article.editTime()))
                LanguageTemplate.objects.get_or_create(lang_code=lang,
                                                       edited=edited,
                                                       content=content)
Exemple #9
0
def load_templates():
    import pywikibot
    site = pywikibot.Site('ru')
    category_names = [
        u"Категория:Шаблоны:Названия_языков",
        # u"Категория:Шаблоны:Языковые_заголовки",
    ]
    for category_name in category_names:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            title = article.title()
            m = re.match(u'^Шаблон:(?P<case>[Ll])ang-(?P<lang>.+)$', title)
            # if not m:
            #     print title
            if m:
                lang = m.group('lang')
                case = m.group('case')
                lower = case == 'l'
                print lang
                content = article.get(get_redirect=True)
                edited = aware(convert_wiki_date(article.editTime()))
                TemplateLangNames.objects.get_or_create(
                    lang=lang, lower=lower, edited=edited, content=content)
Exemple #10
0
        continue
    title = orig_title[len(prefix) :]
    print title
    if title == u"сущ":
        continue

    if not re.match(u"^сущ/[fmn]\d/\d+$", title):
        continue

    # print 'ok'
    # continue
    old = WordInflectionMassEdit.objects.get(title=title)
    old_content = old.content

    new_content = remove_utf8mb4(item.get(get_redirect=True))
    edited = aware(convert_wiki_date(item.editTime()))
    print edited

    update_content = new_content
    if new_content != old_content:
        if u"{{пишу}}" in new_content or u"{{message box" in new_content:
            print u"×" * 20, u"{{пишу}} detected"
            first_edited = None
            break
        changing_started = False
        old_items, old_reports = parse_mass_edit(old_content)
        new_items, new_reports = parse_mass_edit(new_content)
        if len(old_items) != len(new_items) or set(old_items.keys()) != set(new_items.keys()):
            print u"×" * 20, "WRONG DIFFERENCE IN TITLES (COUNTS OR VALUES)"
            continue
        for word in sorted(old_items.keys(), key=lambda x: x[::-1]):
Exemple #11
0
def load_missed_pages():
    print 'load_missed_pages()'
    g = AllpagesPageGenerator(start=u"!")
    i = 0
    j = 0
    redirects = []
    edits = []
    contents = []
    # pages = []
    print dt(), 'starting cycle?'
    for item in g:
        j += 1
        if not j % 1000:
            print dt(), 'j =', j
        if j < 178000:
            continue
        #print item
        title = item.title()
        if ':' in title:
            print dt(), ':' * 20, title
            # continue
        # print title
        try:
            # print dt(), title
            if title != remove_utf8mb4(title):
                print dt(), title, '#' * 30, 'UTF8-MB4'
                continue
            Page.objects.get(title=title)
        # except MultipleObjectsReturned:
        #     print '#' * 10, 'MULTIPLE'
        except ObjectDoesNotExist:
            print dt(), title, '(ok)'
            try:
                content = item.get(get_redirect=True)
            except NoPage:
                print dt(), '#' * 30, 'NO_PAGE'
                continue
            content = remove_utf8mb4(content)
            redirect = item.isRedirectPage()
            # edited = item.editTime()
            edited = convert_wiki_date(item.editTime())
            # edited = make_naive(wiki_page.editTime(), pytz.UTC)
            edited = make_aware(edited, pytz.UTC)
            # edited = make_aware(item.editTime(), pytz.UTC)
            # edited = make_aware(item.editTime(), None)
            # print edited
            page = Page.objects.create(title=title)
            PageContent.objects.create(page=page, content=content)
            PageRedirect.objects.create(page=page, redirect=redirect)
            PageEdited.objects.create(page=page, edited=edited)

            # contents.append(PageContent(page=page, content=content))
            # if len(contents) > 100:
            #     PageContent.objects.bulk_create(contents)
            #     contents = []
            #     print '#' * 30, 'contents added'
            #
            # redirects.append(PageRedirect(page=page, redirect=redirect))
            # if len(redirects) > 1000:
            #     PageRedirect.objects.bulk_create(redirects)
            #     redirects = []
            #     print '#' * 30, 'redirects added'
            #
            # edits.append(PageEdited(page=page, edited=edited))
            # if len(edits) > 1000:
            #     PageEdited.objects.bulk_create(edits)
            #     edits = []
            #     print '#' * 30, 'edits added'

            # break
            i += 1
            if not i % 100:
                print dt(), '*' * 20, i
                # PageContent.objects.bulk_create(pages)
                # pages = []
                sleep(1)
Exemple #12
0
def update_missed_data():
    print 'update_missed_data()'
    i = 0
    redirects = []
    edits = []
    for page in Page.objects.iterate():
        i += 1
        if i < 35000:
            continue
        if not i % 1000:
            print dt(), 'i =', i
        need_content = need_redirect = need_edited = False
        try:
            PageContent.objects.get(page=page)
        except ObjectDoesNotExist:
            need_content = True
        try:
            PageEdited.objects.get(page=page)
        except ObjectDoesNotExist:
            need_edited = True
        try:
            PageRedirect.objects.get(page=page)
        except ObjectDoesNotExist:
            need_redirect = True
        if need_content or need_edited or need_redirect:
            print dt(), page.title
            site = pywikibot.Site('ru')
            # wiki_page = pywikibot.Page(site, page.title.decode('utf-8'))
            wiki_page = pywikibot.Page(site, page.title)
            try:
                if need_content:
                    content = wiki_page.get(get_redirect=True)
                    content = remove_utf8mb4(content)
                    PageContent.objects.create(page=page, content=content)
                    print '- contents added'
                if need_redirect:
                    redirect = wiki_page.isRedirectPage()
                    PageRedirect.objects.create(page=page, redirect=redirect)
                    print '- redirect added'
                    # redirects.append(PageRedirect(page=page,
                    #                                redirect=redirect))
                    # if len(redirects) > 1000:
                    #     PageRedirect.objects.bulk_create(redirects)
                    #     redirects = []
                    #     print '#' * 30, 'redirects added'
                if need_edited:
                    edited = convert_wiki_date(wiki_page.editTime())
                    # edited = make_naive(wiki_page.editTime(), pytz.UTC)
                    # edited = make_aware(wiki_page.editTime(), None)
                    edited = make_aware(edited, pytz.UTC)
                    PageEdited.objects.create(page=page, edited=edited)
                    print '- edited added'
                    # edits.append(PageEdited(page=page, edited=edited))
                    # if len(edits) > 1000:
                    #     PageEdited.objects.bulk_create(edits)
                    #     edits = []
                    #     print '#' * 30, 'edits added'
            except NoPage:
                print dt(), '#' * 20, page.title, 'DELETED?'  # todo: remove?
                content = page.content if not need_content else ''
                redirect = page.page_redirect.redirect if not need_redirect else None
                edited = page.page_edited.edited if not need_edited else None
                PageDeleted.objects.create(page=page.pk, title=page.title,
                                            content=content, redirect=redirect,
                                            edited=edited)
                # if not need_content:
                #     page.page_content.delete()
                page.delete()

    PageRedirect.objects.bulk_create(redirects)
    PageEdited.objects.bulk_create(edits)
def process_recent():
    print 'process_recent()'
    i = 0
    edited = PageEdited.objects.order_by('-edited')[0]
    print 'updating untill:', edited.edited
    end = datetime(edited.edited.year,
                   edited.edited.month,
                   edited.edited.day,
                   edited.edited.hour,
                   # edited.edited.minute,
                   )
    g = RecentChangesPageGenerator(end=end)
    for item in g:
        title = item.title()
        if ':' in title:
            continue
        i += 1
        # print title
        # print repr(title)
        # print repr(title.encode('utf-8'))
        # print repr(title.encode('utf-16be'))
        # print repr(title.encode('utf-32be'))
        # # print repr(title.encode('utf-16').decode('utf-16'))
        # page = Page.objects.create(title=title)
        # page = Page.objects.create(title=title.encode('utf-16be'))
        # page = Page.objects.create(title=title.encode('utf-16'))
        # page = Page.objects.create(title=title.encode('utf-8'))
        # # page = Page.objects.create(title=title.encode('utf-8'))
        # # page, created = Page.objects.get_or_create(title=title)
        # continue

        try:
            page, created = Page.objects.get_or_create(title=title)
        except Exception:
            print dt(), title, '-', '@' * 30, 'bad title'
            continue

        try:
            edited = convert_wiki_date(item.editTime())
            edited = make_aware(edited, pytz.UTC)
        except NoPage:
            print dt(), title, '-', 'DELETED', '#' * 20
            # todo: А если страница еще не была добавлена в БД? Т.е. создание/удаление в пределах одной проверки
            content, created = PageContent.objects.get_or_create(page=page)
            edited, created = PageEdited.objects.get_or_create(page=page)
            redirect, created = PageRedirect.objects.get_or_create(page=page)
            PageDeleted.objects.create(page=page.pk, title=page.title,
                                        content=content.content,
                                        edited=edited.edited,
                                        redirect=redirect.redirect)
            page.delete()
            continue
        content = item.get(get_redirect=True)
        content = remove_utf8mb4(content)
        redirect = item.isRedirectPage()

        print dt(), edited, '-', i, '-', title, '| pk =', page.pk
        if created:
            print '- page CREATED: %s' % page.pk

        # try:
        #     page = Page.objects.get(title__exact=title)
        # except ObjectDoesNotExist:
        #     page = Page.objects.create(title=title)

        data, created = PageContent.objects.get_or_create(page=page)
        if data.content != content:
            data.content = content
            data.save()
            print '- contents updated'

        data, created = PageRedirect.objects.get_or_create(page=page)
        if data.redirect != redirect:
            data.redirect = redirect
            data.save()
            print '- redirect updated'

        data, created = PageEdited.objects.get_or_create(page=page)
        if data.edited != edited:
            data.edited = edited
            data.save()
            print '- edited updated'
Exemple #14
0
    def process_item(self, item, i):
        """Synchronise one wiki page into the local store.

        Args:
            item: wiki page object; ``title()``, ``revisions()`` are called
                on it here and it is passed on to ``self.get_data``.
            i: running counter of the item, used for progress output and
                passed to ``self.get_page``.

        Returns:
            * the result of ``self.process_item_readonly`` when
              ``self.readonly`` is set;
            * the page's edit time for titles without ``':'`` once the data
              has been handed to ``self.update_data``;
            * ``None`` in every other case (invalid title, skipped
              namespace, no local page, deleted page, or a title
              containing ``':'``).
        """
        # todo: create external mechanism of pausing work (actual for big processors)
        try:
            title = item.title()
        except InvalidTitle:
            print 'Wrong title', '#' * 120
            return
        # Of all titles containing ':', only those starting with u"Шаблон:"
        # are processed; the rest are dropped here.
        if ':' in title:  # todo: we need this only for RecentProcessor
            if title.startswith(u"Шаблон:"):
            # if title.startswith(u"Категория:") or title.startswith(u"Шаблон:"):
                # print '-' * 40
                pass
            else:
                return
        if self.readonly:
            return self.process_item_readonly(item, title, i)
        # if Page.objects.filter(title=title):
        #     print dt(), title, '- exists'
        #     return
        page = self.get_page(title, i)
        if not page:
            return
        content, content_to_db, edited, redirect = self.get_data(item)
        # A falsy edit time is treated as "page was deleted": the local page
        # is removed through delete_and_log() and processing stops.
        if not edited:
            # print dt(), title.encode('cp1251'), '-', 'DELETED', '#' * 20
            print dt(), '& PAGE WAS DELETED          - %d - & %s  // pk=%d' \
                        % (i, transliterate(title), page.pk)
            # print dt(), transliterate(title), '-', 'DELETED', '#' * 10
            # print dt(), '-', 'DELETED', '#' * 20
            page.delete_and_log()
            return
        # print dt(), edited, '-', i, '-', title.encode('cp1251'), '| pk =', page.pk
        # Progress line: printed for every item when output_interval is
        # falsy, otherwise only when i is a multiple of it. ':' marks titles
        # containing a colon, '#' all others.
        if not self.output_interval or not i % self.output_interval:
            print dt(),
            if ':' in title:
                print ':',
            else:
                print '#',
            print edited, '-', i, '-',
            if ':' in title:
                print ':',
            else:
                print '#',
            print transliterate(title), ' // pk=%s' % page.pk
        # print dt(), edited, '-', i, '-', '| pk =', page.pk
        log = transliterate(title)

        # First revision of the page (with text): source of the creation
        # time, the creating user and the language guess below.
        oldest = next(item.revisions(reverseOrder=True, total=1, content=True))
        created_at = aware(convert_wiki_date(oldest.timestamp))
        created_author = oldest.user
        # created_lang stays '?' when no pattern below matches and becomes
        # '??' when the first revision has no text at all.
        created_lang = '?'
        if oldest.text is None:
            created_lang = '??'
        else:
            # print oldest.text
            # print repr(oldest.text)
            # print
            # The three searches below are evaluated up front; the elif chain
            # then picks the first matching rule, so the order of the
            # branches decides which language wins.
            m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}',
                          oldest.text, flags=re.MULTILINE | re.UNICODE)
            m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text)
            m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text, flags=re.UNICODE | re.IGNORECASE)
            if m:
                created_lang = m.group(1)
            elif re.search(u'^= *Праславянский', oldest.text, flags=re.MULTILINE | re.UNICODE):
                created_lang = u'Праславянский'
            elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = u'eo'
            elif m2:
                created_lang = m2.group(1)
            elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'de'
            elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'fr'
            elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'it'
            elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'nl'
            elif re.search(u'\{\{(английский|en)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'nl'
            elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'de'
            elif re.search(u'\{\{(it)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'it'
            elif m_new:
                created_lang = m_new.group(1)
            elif re.search(u'#(redirect|перенаправление)', oldest.text, flags=re.MULTILINE | re.UNICODE | re.IGNORECASE):
                # The first revision was a redirect: recorded as u'-'.
                created_lang = u'-'
            else:
                # Nothing matched: keep '?' and dump the text to a file named
                # after the page pk for later inspection.
                save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk, oldest.text.encode('utf-8'))

        # print
        # print transliterate(title), created_at
        # print transliterate(oldest.user), transliterate(created_lang)
        # print

        self.update_data(page, content, content_to_db, edited, redirect, log,
                         created_at, created_author, created_lang)
        # Titles with ':' (the template pages let through above) are stored
        # but yield no return value.
        if ':' in title:
            return
        return edited