Ejemplo n.º 1
0
 def get_data(self, item):
     """Collect the text and metadata of a wiki page.

     :param item: page object exposing ``get``, ``editTime`` and
         ``isRedirectPage`` (NOTE(review): presumably a pywikibot Page).
     :return: tuple ``(content, content_to_db, edited, redirect)`` where
         ``content`` is the result of ``item.get(get_redirect=True)``,
         ``content_to_db`` is that text passed through ``remove_utf8mb4``,
         ``edited`` is the edit time passed through ``convert_wiki_date``
         and ``aware``, and ``redirect`` is ``item.isRedirectPage()``.
         When ``NoPage`` is raised, all four values are ``None``.
     """
     try:
         raw_text = item.get(get_redirect=True)
         db_text = remove_utf8mb4(raw_text)
         last_edit = aware(convert_wiki_date(item.editTime()))
         is_redirect = item.isRedirectPage()
     except NoPage:
         # Missing page: callers receive an all-None tuple.
         return (None,) * 4
     return raw_text, db_text, last_edit, is_redirect
Ejemplo n.º 2
0
def load_templates():
    """Load language-header templates from the 'ru' site into the database.

    Iterates over the articles of each category listed below, keeps only
    those whose title matches the template-name pattern, prints the captured
    language code and calls ``LanguageTemplate.objects.get_or_create`` with
    the code, the converted edit time and the page content.
    """
    import pywikibot

    site = pywikibot.Site('ru')
    # Template titles of the form "<prefix>:-<lang>-"; group 1 is the code.
    title_pattern = re.compile(u'^Шаблон:-([^-].*)-$')
    source_categories = (
        u"Категория:Викисловарь:Шаблоны:Языки",
        u"Категория:Шаблоны:Языковые_заголовки",
    )
    for category_name in source_categories:
        for article in pywikibot.Category(site, category_name).articles():
            found = title_pattern.match(article.title())
            if not found:
                continue
            lang = found.group(1)
            print(lang)
            content = article.get(get_redirect=True)
            edited = aware(convert_wiki_date(article.editTime()))
            # All three values take part in the lookup, so a changed
            # content or edit time leads to a "create" rather than a "get".
            LanguageTemplate.objects.get_or_create(
                lang_code=lang, edited=edited, content=content)
Ejemplo n.º 3
0
def load_templates():
    """Load language-name templates from the 'ru' site into the database.

    Iterates over the articles of the category listed below, keeps only
    titles matching the "Lang-<code>" / "lang-<code>" template pattern,
    prints the captured code and calls
    ``TemplateLangNames.objects.get_or_create`` with the code, a flag telling
    whether the lowercase variant was matched, the converted edit time and
    the page content.
    """
    import pywikibot

    site = pywikibot.Site('ru')
    # "case" captures the leading L/l, "lang" captures everything after it.
    title_pattern = re.compile(
        u'^Шаблон:(?P<case>[Ll])ang-(?P<lang>.+)$')
    source_categories = (
        u"Категория:Шаблоны:Названия_языков",
        # u"Категория:Шаблоны:Языковые_заголовки",  (currently disabled)
    )
    for category_name in source_categories:
        for article in pywikibot.Category(site, category_name).articles():
            found = title_pattern.match(article.title())
            if not found:
                continue
            lang = found.group('lang')
            lower = found.group('case') == 'l'
            print(lang)
            content = article.get(get_redirect=True)
            edited = aware(convert_wiki_date(article.editTime()))
            TemplateLangNames.objects.get_or_create(
                lang=lang, lower=lower, edited=edited, content=content)
Ejemplo n.º 4
0
        continue
    title = orig_title[len(prefix) :]
    print title
    if title == u"сущ":
        continue

    if not re.match(u"^сущ/[fmn]\d/\d+$", title):
        continue

    # print 'ok'
    # continue
    old = WordInflectionMassEdit.objects.get(title=title)
    old_content = old.content

    new_content = remove_utf8mb4(item.get(get_redirect=True))
    edited = aware(convert_wiki_date(item.editTime()))
    print edited

    update_content = new_content
    if new_content != old_content:
        if u"{{пишу}}" in new_content or u"{{message box" in new_content:
            print u"×" * 20, u"{{пишу}} detected"
            first_edited = None
            break
        changing_started = False
        old_items, old_reports = parse_mass_edit(old_content)
        new_items, new_reports = parse_mass_edit(new_content)
        if len(old_items) != len(new_items) or set(old_items.keys()) != set(new_items.keys()):
            print u"×" * 20, "WRONG DIFFERENCE IN TITLES (COUNTS OR VALUES)"
            continue
        for word in sorted(old_items.keys(), key=lambda x: x[::-1]):
Ejemplo n.º 5
0
    def process_item(self, item, i):
        """Process one wiki page: refresh its stored data and creation info.

        Steps, in order:
        1. Read the title; on ``InvalidTitle`` print a notice and stop.
        2. Skip titles containing ':' unless they start with the template
           prefix.
        3. In readonly mode, delegate to ``process_item_readonly``.
        4. Obtain the page record via ``get_page``; stop if there is none.
        5. Fetch content via ``get_data``; when no edit time comes back,
           log it and call ``page.delete_and_log()``.
        6. Derive creation time, author and language from the revision
           returned by ``item.revisions(reverseOrder=True, total=1, ...)``.
        7. Pass everything to ``update_data``.

        :param item: page object (NOTE(review): presumably a pywikibot Page).
        :param i: integer counter used in log output, for the
            ``output_interval`` check, and passed on to ``get_page`` /
            ``process_item_readonly``.
        :return: the result of ``process_item_readonly`` in readonly mode;
            ``edited`` when the page was updated and its title has no ':';
            ``None`` in every other case.
        """
        # todo: create external mechanism of pausing work (actual for big processors)
        try:
            title = item.title()
        except InvalidTitle:
            print 'Wrong title', '#' * 120
            return
        # Titles containing ':' are only processed when they carry the
        # template prefix; every other such title is skipped.
        if ':' in title:  # todo: we need this only for RecentProcessor
            if title.startswith(u"Шаблон:"):
            # if title.startswith(u"Категория:") or title.startswith(u"Шаблон:"):
                # print '-' * 40
                pass
            else:
                return
        if self.readonly:
            return self.process_item_readonly(item, title, i)
        # if Page.objects.filter(title=title):
        #     print dt(), title, '- exists'
        #     return
        page = self.get_page(title, i)
        if not page:
            return
        content, content_to_db, edited, redirect = self.get_data(item)
        # A falsy edit time is treated as "the page was deleted".
        if not edited:
            # print dt(), title.encode('cp1251'), '-', 'DELETED', '#' * 20
            print dt(), '& PAGE WAS DELETED          - %d - & %s  // pk=%d' \
                        % (i, transliterate(title), page.pk)
            # print dt(), transliterate(title), '-', 'DELETED', '#' * 10
            # print dt(), '-', 'DELETED', '#' * 20
            page.delete_and_log()
            return
        # print dt(), edited, '-', i, '-', title.encode('cp1251'), '| pk =', page.pk
        # Progress line: printed for every item when output_interval is
        # falsy, otherwise only when i is a multiple of output_interval.
        # The marker is ':' for titles containing ':' and '#' otherwise.
        if not self.output_interval or not i % self.output_interval:
            print dt(),
            if ':' in title:
                print ':',
            else:
                print '#',
            print edited, '-', i, '-',
            if ':' in title:
                print ':',
            else:
                print '#',
            print transliterate(title), ' // pk=%s' % page.pk
        # print dt(), edited, '-', i, '-', '| pk =', page.pk
        log = transliterate(title)

        # Take a single revision in reverse order, with its text loaded.
        # NOTE(review): the variable name implies this is the page's first
        # revision - confirm against the pywikibot revisions() semantics.
        oldest = next(item.revisions(reverseOrder=True, total=1, content=True))
        created_at = aware(convert_wiki_date(oldest.timestamp))
        created_author = oldest.user
        # '?' = language not recognised, '??' = revision has no text.
        created_lang = '?'
        if oldest.text is None:
            created_lang = '??'
        else:
            # print oldest.text
            # print repr(oldest.text)
            # print
            # Three searches are evaluated up front; the elif chain below
            # decides which result (if any) is used.
            m = re.search(u'\{\{-([-\w]+|Праславянский)-(?:\|[^}]+)?\}\}',
                          oldest.text, flags=re.MULTILINE | re.UNICODE)
            m2 = re.search(u'\{\{заголовок\|(en|tr|jbo|la|it|es|fo|da|de|pt|hr|pl|fi|lv|nl|sv|no|io|gd|az|ms|id|nv|nds|nah|hu|nrm|vls|fur|ga|hu|lb|hsb|li|tpi|gv|fr|cy|fy|sc|fo|zu|sw|mg|oc|ca|qu|ln|eo|so|cs|uz|et|vo|ku|su|sk|mi|kw|bar|br|an|sq|bs|af)\|add=\w*\}\}', oldest.text)
            m_new = re.search(u'\{\{NEW\|lang=([-a-z]+)\|cat=', oldest.text, flags=re.UNICODE | re.IGNORECASE)
            # Patterns are tried in this exact order; the first one that
            # matches determines created_lang.
            if m:
                created_lang = m.group(1)
            elif re.search(u'^= *Праславянский', oldest.text, flags=re.MULTILINE | re.UNICODE):
                created_lang = u'Праславянский'
            elif re.search(u'=<div style="background-color0?:#\{\{h1c\}\}">Эсперанто</div>=', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = u'eo'
            elif m2:
                created_lang = m2.group(1)
            elif re.search(u'== *\[\[?(:w)?:en:[^|]+\|английский\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'== *\[\[?(:w)?:de:[^|]+\|немецкий\]\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'de'
            elif re.search(u'== *\[?\[?(английский|english)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'== *\[?\[?(французский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'fr'
            elif re.search(u'== *\[?\[?(италь?янский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'it'
            elif re.search(u'== *\[?\[?(Нидерландский)\]?\]? *== *\n', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'nl'
            elif re.search(u'\{\{(английский|en)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'en'
            elif re.search(u'\{\{(Нидерландский|nl)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'nl'
            elif re.search(u'\{\{(немецкий|de)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'de'
            elif re.search(u'\{\{(it)\}\}', oldest.text, flags=re.UNICODE | re.IGNORECASE):
                created_lang = 'it'
            elif m_new:
                created_lang = m_new.group(1)
            elif re.search(u'#(redirect|перенаправление)', oldest.text, flags=re.MULTILINE | re.UNICODE | re.IGNORECASE):
                # Redirect pages get '-' as their language marker.
                created_lang = u'-'
            else:
                # Nothing matched: created_lang stays '?' and the revision
                # text is written to a per-page file under errors/created_lang.
                save_file(settings.FILES_PATH + "/errors/created_lang/%s.txt" % page.pk, oldest.text.encode('utf-8'))

        # print
        # print transliterate(title), created_at
        # print transliterate(oldest.user), transliterate(created_lang)
        # print

        self.update_data(page, content, content_to_db, edited, redirect, log,
                         created_at, created_author, created_lang)
        # Titles containing ':' yield None; all others yield the edit time.
        if ':' in title:
            return
        return edited