def fix_for_soshial():
    # Убрать НП из ПричСтрад
    total_count = 0
    categories = [
        u"Категория:Шаблоны словоизменений/Глаголы/Несовершенный вид",
        u"Категория:Шаблоны словоизменений/Глаголы/Совершенный вид",
    ]
    for category_name in categories:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            title = article.title()
            # print title
            content = article.get()

            p = re.compile(u'(?P<source>(?P<prefix>\|ПричСтрад *= *)\{\{#if:\{\{\{НП\|\}\}\}\|(?P<if_yes>[^|]*)\|(?P<if_no>.*)\}\})', re.UNICODE)
            if not total_count % 50:
                print dt(), total_count
            total_count += 1
            m = p.search(content)
            # if title in [u'Участник:Soshial/sandbox2']:
            #     continue
            if m:
                print title
                # print '=' * 100
                print m.group('source').strip()
                # print '=' * 100
                # print content
                content = p.sub(u'\g<prefix>\g<if_no>', content)
                # print '=' * 100
                # print content
                # article.put(content, u'Добавление параметра "Инфинитив" к шаблонам (на основании будущей формы)', minorEdit=False)
                break
        print total_count
Ejemplo n.º 2
0
def load_contents():
    pages = []
    i = 0
    j = 0
    for page in Page.objects.iterate():
        # print dt(), page.title
        try:
            # j += 1
            # if j > 84000:
                PageContent.objects.get(page_id=page.id)
                # print 'exists'
        except ObjectDoesNotExist:
            print dt(), page.title
            # print repr(page.title)
            title = page.title
            # try:
            #     title = page.title.encode('utf-8')
            # except UnicodeDecodeError:
            #     print '#####'
            #     continue
            # if i > 700:
            url = "http://dump.a-lib.net/wikt/%s" % urllib.quote_plus(title)
            url = url.replace('+', '%20')
            content = urllib.urlopen(url).read()
            # print repr(content)
            content = remove_utf8mb4(content.decode('utf-8'))
            PageContent.objects.create(pk=page.pk, page=page, content=content)
            # pages.append(PageContent(page=page, content=content))
            i += 1
            if not i % 100:
                print dt(), i
                # PageContent.objects.bulk_create(pages)
                pages = []
                sleep(1)
Ejemplo n.º 3
0
 def action(self, page, **kwargs):
     content = kwargs['content']
     parts = re.findall('(^|\n)(=[^=\n]+=)\n', content)
     for part in parts:
         found = part[1]
         if found in [u'= Буква (латиница) =', u'= Буква (кириллица) =']:
             continue
         # print "* [[%s]]: <code><nowiki>%s</nowiki></code>" % (page.title, found)
         m = re.match(u'^= *\{\{-(?P<lang>[-a-z]+|Праславянский)-(?P<remove>\|([^}]+|\{\{PAGENAME\}\}|))?\}\} *=$',
                      found, re.IGNORECASE)
         if not m:
             print found
         if m:
             remove = m.group('remove')
             # if remove:
             #     print page.title, remove
             if remove == '|nocat':
                 continue
             lang = m.group('lang')
             # print "* %s: %s" % (page.title, lang)
             if lang != 'ru':
                 continue
             old_header = m.group(0)
             new_header = "= {{-%s-}} =" % lang
             if old_header == new_header:
                 continue
             self.changed += 1
             print dt(), 'changed:', self.changed
             wiki_content = get_wiki_page_content(page.title)
             new_wiki_content = wiki_content.replace(old_header, new_header)
             save_wiki_page(page.title, new_wiki_content,
                            "викификация заголовка первого уровня",
                            wait=5)
def perfect_verbs():
    total_count = 0
    category = pywikibot.Category(site, u"Категория:Шаблоны_словоизменений/Глаголы/Совершенный вид")
    for article in category.articles():
        title = article.title()
        # print title
        content = article.get()

        #p = re.compile(u'(?P<source>\|hide-text=\{\{\{hide-text\|\}\}\})', re.UNICODE)
        p = re.compile(u'(?P<source>\|hide-text=\{\{\{hide-text\|\}\}\})', re.UNICODE)
        if not total_count % 50:
            print dt(), total_count
        total_count += 1
        m = p.search(content)
        #if title in [u'Участник:Soshial/sandbox2']:
        #    continue
        if not m:
            print title
        #if m:
        #    print title
            #print m.group('inf').strip()
            #content = p.sub(u'\g<source>|Инфинитив = \g<inf>', content)
            #article.put(content, u'Добавление параметра "Инфинитив" к шаблонам (на основании будущей формы)', minorEdit=False)
            # break
    print total_count
Ejemplo n.º 5
0
def find_numeric_params():
    print "process_slovoforms()"
    i = 0
    for page in Page.objects.iterate():
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
        p = re.compile(
            u"""(\{\{
                (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)  # заголовок
                (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        for part in parts:
            # print part[0]
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                morph = m.group("morph")
                call_params, call_numeric = process_call_params(m.group("params"))
                # for key, value in call_params.items():
                #     print key, value
                if call_numeric:
                    print
                    print page.title
                    for numeric in call_numeric:
                        print "-", numeric
Ejemplo n.º 6
0
def replacement():
    total_count = 0
    categories = [
        u"Категория:Шаблоны_словоизменений/Глаголы/Совершенный вид",
        u"Категория:Шаблоны_словоизменений/Глаголы/Несовершенный_вид",
    ]
    site = pywikibot.Site('ru')
    for category_name in categories:
        category = pywikibot.Category(site, category_name)
        for article in category.articles():
            title = article.title()
            print
            print title, u'→ get'
            if title in [u'Участник:Soshial/sandbox2']:
                continue
            content = article.get()
            p = re.compile(u'(?P<source>\|(Я|Ты) \(прош.\) *= *(?P<value>[^|]+))', re.UNICODE)
            # p = re.compile(u'(?P<source>\|Будущее)', re.UNICODE)
            if not total_count % 50:
                print dt(), total_count
            total_count += 1
            parts = p.findall(content)
            new_content = content
            for part in parts:
                # print title, u'→ try'
                print part[0].strip()
                new_part = part[0].replace(', ', '<br />')
                new_content = new_content.replace(part[0], new_part)
                # content = p.sub(u'\g<source>|Инфинитив = \g<inf>', content)
                # article.put(content, u'Добавление параметра "Инфинитив" к шаблонам (на основании будущей формы)', minorEdit=False)
                # break
            if content != new_content:
                print u'→ changed!', '!' * 80
                article.put(new_content, u'Замена ", " на "<br />" в параметрах "Я (прош.)" и "Ты (прош.)"', minorEdit=False)
        print total_count
Ejemplo n.º 7
0
def extract_change_templates():
    print "extract_change_templates()"
    i = 0
    data = []
    for page in Page.objects.iterate(prefetch=["page_content"]):
        # for page in Page.objects.iterate():
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
        p = re.compile(
            u"""(\{\{
                (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)  # заголовок
                (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        # print parts
        # continue
        titles = []
        for part in parts:
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                titles.append(title)
        data.append(Page_ChangeTemplates(page=page, change_templates=" | ".join(titles)))
        if len(data) > 1000:
            Page_ChangeTemplates.objects.bulk_create(data)
            print dt(), "> data added:", len(data)
            data = []
    Page_ChangeTemplates.objects.bulk_create(data)
Ejemplo n.º 8
0
def get_data_from_that_site():
    print "get_data_from_that_site()"
    i = 0
    data = []
    for page in Page.objects.iterate(prefetch=["page_changetemplates", "page_starlingzaliznyak"]):
        i += 1
        if not i % 100:
            print dt(), "processed pages:", i
        if u"­" in page.title:
            continue
        change_templates = page.page_changetemplates.change_templates
        if u"сущ" in change_templates or u"гл" in change_templates:
            try:
                page.page_starlingzaliznyak
            except ObjectDoesNotExist:
                # print page.title
                base, info, morph = get_page(page.title)
                data.append(
                    Page_StarlingZaliznyak(
                        page=page,
                        word=page.title,
                        base=" | ".join(base),
                        info=" | ".join(info),
                        morph=" | ".join(morph),
                    )
                )
                if len(data) > 1000:
                    Page_StarlingZaliznyak.objects.bulk_create(data)
                    print dt(), "> data added:", len(data)
                    data = []
    Page_StarlingZaliznyak.objects.bulk_create(data)
Ejemplo n.º 9
0
def find_empty_words():
    print "process_slovoforms()"
    i = 0
    count = 0
    for page in Page.objects.iterate(prefetch=["page_content"]):
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
        p = re.compile(
            u"""(\{\{
                (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)  # заголовок
                (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        for part in parts:
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                empty_templates = [
                    u"сущ ru m ina",
                    u"сущ ru f ina",
                    u"сущ ru n ina",
                    u"сущ ru m a",
                    u"сущ ru f a",
                    u"сущ ru n a",
                ]
                if title in empty_templates:
                    count += 1
                    print count, page.title
Ejemplo n.º 10
0
def get_dictionary_words(dictionary_id):
    print dt(), 'Loading words (%s) - started' % dictionary_id
    words = list()
    for item in Word_Value.objects.filter(dictionary_id=dictionary_id):
        words.append(item.value)
    print dt(), 'Loading words - finished'
    return words
Ejemplo n.º 11
0
 def action(self, page, **kwargs):
     # print '=' * 80
     # print dt(), page.title
     content = kwargs['content']
     m = re.search(u'=== Смотреть также ===\n([^={]*)', content, flags=re.MULTILINE | re.DOTALL)
     if m:
         block_content = m.group(1)
         # print block_content.strip()
         for remove in removings:
             if remove in block_content:
                 print '=' * 80
                 print dt(), page.title
                 print block_content.strip()
                 old_content = get_wiki_page_content(page.title)
                 new_content = re.sub(
                     u'=== Смотреть также ===\n\s*%s\n' % remove.replace('*', r'\*').replace('[', r'\[').replace(']', r'\]'),
                     u'', old_content)
                 if old_content != new_content:
                     desc = u'Удаление "Смотреть также" со списком имён'
                     save_wiki_page(page.title, new_content, desc, wait=5)
                     print 'saved'
                 else:
                     print 'not changed'
     else:
         print u'×××'
Ejemplo n.º 12
0
def download(url):
    for tries in range(3):
        try:
            return urllib.urlopen(url).read()
        except IOError:
            print dt(), '#', 'Download failed, tries:', tries
            time.sleep(5)
Ejemplo n.º 13
0
 def bulk(self, items, model=None, chunk_size=1000):
     if not model:
         model = self.model
     processed = 0
     for chunk in chunks(items, chunk_size):
         processed += len(model.objects.bulk_create(chunk))
         print dt(), '-> Processed:', processed
Ejemplo n.º 14
0
def save_db_to_file():
    i = 0
    print dt(), i
    words = dict()
    ## for item in Word_Description.objects.all()[:10000]:  # .prefetch_related('details')
    ## for item in Word_Description.objects.prefetch_related('word', 'word__details').all()[:10000]:
    # for item in Word_Description.objects.iterate(100000):
    output = list()
    for item in Word_Value.objects.iterate(100000):
    #for item in Word_Description.objects.iterate(100000, ['word', 'word__details']):
    ## for item in Word_Description.objects.iterate(100):
        # word = "%s " % item.value
    #    word = "%s %s" % (item.value, item.word.details.in_academic_lopatin)
    #    # print word
    #     if item.dictionary_id in [12, 13, 16]:
    #         continue
        i += 1
        # print repr(item.value)
        output.append("%s|%s" % (item.value, item.dictionary_id))
        if not i % 10000:
            print dt(), i
            # break
        # if i > 300000:
        #     break

    save_file('db_data.txt', '\n'.join(output), 'utf-8')
Ejemplo n.º 15
0
 def action(self, page, **kwargs):
     # print '=' * 80
     # print dt(), page.title
     content = kwargs['content']
     m = re.search(u'=== Иноязычные аналоги ===\n(.*)\n=', content,
                   flags=re.MULTILINE | re.DOTALL)
     if m:
         block_content = m.group(1)
         if u'=== Перевод ===' in content:
             # print u'перевод есть :)'
             # print '=' * 80
             # print dt(), page.title
             m2 = re.search(u'=== Перевод ===(.*)\n=== Иноязычные аналоги ===\n(.*?)\n=',
                            content, flags=re.MULTILINE | re.DOTALL)
             if m2:
                 block1 = m2.group(1)
                 block2 = m2.group(2)
                 if "\n=" in block1 or "\n=" in block2:
                     print u'×' * 200
                     return
                 mb1 = big_empty.search(block1)
                 mb2 = big_empty.search(block2)
                 if mb2:  #if mb1 and mb2:
                     # print dt(), page.title
                     # old_content = content
                     # old_content = get_wiki_page_content(page.title)
                     # new_content = old_content.replace(
                     #     u'=== Иноязычные аналоги ===\n%s\n' % block2,
                     #     u'')
                     #
                     # # new_block2 = re.sub(u'\{\{перев-блок\|*\n', u'{{перев-блок|Иноязычные аналоги|\n', block2)
                     # # new_block1 = big_empty.sub(new_block2, block1)
                     # new_block1 = re.sub(u'\{\{перев-блок\|*\n', u'{{перев-блок|Иноязычные аналоги|\n', block1)
                     # new_content = new_content.replace(
                     #     u'=== Перевод ===%s' % block1,
                     #     u'=== Перевод ===\n%s\n' % new_block1.strip(),
                     # )
                     # if old_content != new_content:
                     #     # desc = u'Удаление пустого блока "Перевод" и добавление "Иноязычные аналоги"'
                     #     desc = u'Удаление пустого блока "Иноязычные аналоги"'
                     #     save_wiki_page(page.title, new_content, desc, wait=5)
                     #     # self.stop()
                     pass
                 print
                 print '=' * 100
                 print dt(), page.title
                 print '-' * 100
                 print block1
                 print '-' * 100
                 print block2
                 print '-' * 100
             else:
                 # print dt(), page.title, u'×××', u'неподходящее взаимное расположение'
                 pass
         else:
             # print dt(), page.title, u'×××', u'перевода нет :('
             pass
     else:
         # print dt(), page.title, u'×××', u'нет заголовка после аналогов?'
         pass
Ejemplo n.º 16
0
 def process_items(self):
     i = 0
     site = pywikibot.Site('ru')
     print dt(), 'processing some pages in database'
     for page in Page.objects.filter(title__in=titles):
         i += 1
         item = pywikibot.Page(site, page.title)
         self.process_item(item, i)
Ejemplo n.º 17
0
def get_dictionaries_words(dictionaries_data):
    dictionary_ids = dictionaries_data.keys()
    print dt(), 'Loading words (%s) - started' % dictionary_ids
    words = list()
    for item in Word_Value.objects.filter(dictionary_id__in=dictionary_ids):
        dictionary_name = dictionaries_data[item.dictionary_id]
        words.append("%s|%s" % (item.value, dictionary_name))
    print dt(), 'Loading words - finished'
    return words
Ejemplo n.º 18
0
 def process_items(self):
     print dt(), 'processing all unknown created language pages in database'
     site = pywikibot.Site('ru')
     i = 0
     for page_created in PageCreated.objects.filter(lang='?'):
         i += 1
         title = page_created.page.title
         item = pywikibot.Page(site, title)
         self.process_item(item, i)
Ejemplo n.º 19
0
def just_go_through():
    print "just_go_through()"
    i = 0
    count = 0
    for page in Page.objects.iterate(prefetch=["page_content"]):
        # for page in Page.objects.iterate():
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        content = page.content
Ejemplo n.º 20
0
 def run(self):
     if self.dont_start():
         print dt(), "don't need to start"
         return
     self.before()
     for item in self.iterator():
         kwargs = self.get_kwargs(item)
         self.action(item, **kwargs)
         if self.stopped:
             break
     self.after()
Ejemplo n.º 21
0
 def tpl_action(self, page, tpl, title, morph, lang, params):
     word = page.title
     template_title = title.strip()  # template title
     print dt(), word, u'—', template_title,
     self.groups.setdefault(template_title, list())
     self.groups[template_title].append(page.title)
     kind, gender, num = parse_template_title(template_title)
     WordInflection.objects.bulk_add(
         WordInflection(word=word, template=template_title,
                        content=tpl, gender=gender, kind=kind, num=num)
     )
Ejemplo n.º 22
0
 def get_recent_generator(self):
     print
     print '=' * 120
     print dt(), 'processing recent pages'
     print '=' * 120
     print
     end = self.get_end_date()
     if end:
         end = datetime(end.year, end.month, end.day, end.hour, end.minute)
     print 'updating until:', end
     return RecentChangesPageGenerator(start=self.start_date, end=end)
Ejemplo n.º 23
0
def run_after_checkers():
    s = datetime.now()
    checkers = [
        CfLatinCandidates,
        CfCaseCandidates,
        CfEYoCandidates,
        CfAllCandidates,
        CfSpecialPages,
    ]
    CheckerRunner(checkers).run()
    print dt(), datetime.now() - s