Ejemplo n.º 1
0
    def tpl_action(self, page, tpl, title, morph, lang, params):
        """Generate and store ``WordForm`` rows for one inflection-template call.

        Looks up the ``TemplateInflection`` record named ``title``, expands its
        ``forms`` and ``params`` definitions with the arguments of the call,
        and bulk-creates one ``WordForm`` per (form, parameter set) whose
        unstressed spelling differs from the page title.

        Args:
            page: page object; only ``page.title`` is read here.
            tpl: not used by this method.
            title: template name; surrounding whitespace is stripped before
                the lookup.
            morph: morphology tag of the template; it is passed to the
                helpers and becomes part of the generated ``Форма-<morph>``
                template name.
            lang: not used by this method.
            params: raw parameters of the template call, parsed by
                ``process_call_params``.

        Returns:
            None. Returns early, without creating anything, when the template
            record does not exist or has no ``forms`` definition.
        """
        title = title.strip()
        # Only the first element of the returned pair is needed below.
        call_params, _call_numeric = process_call_params(params)
        try:
            template = TemplateInflection.objects.get(title=title)
        except ObjectDoesNotExist:
            # TODO: process template redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
            return
        if not template.forms:
            print("%s - maybe wrong template" % title)
            return
        print(title)

        tpl_forms = get_dict_from_text(template.forms)
        tpl_params = get_dict_from_text(template.params)
        # Parameters are expanded first because the form expansion reads them.
        for key, value in tpl_params.items():
            tpl_params[key] = universal_process_template(value, call_params)
        for key, value in tpl_forms.items():
            value = universal_process_template(value, call_params)
            value = process_template(value, tpl_params, key, morph)
            tpl_forms[key] = divide_words(value)

        # TODO: pages whose unstressed ``nom-sg`` form differs from the page
        # title are neither reported nor fixed yet.
        # TODO: find the forms that still contain Latin letters.
        form_results = dict()
        for key, values in tpl_forms.items():
            # Skip keys that produced nothing or a single empty string.
            if not values or len(values) == 1 and not values[0]:
                continue
            form_params = get_form_params(morph, key, tpl_params)
            if not form_params:
                continue
            for value in values:
                # The base form itself is not stored as a word form.
                if remove_stress(value) == page.title:
                    continue
                form_results.setdefault(value, []).append(form_params.copy())

        # TODO: if they match completely, remove them as well.
        join_form_results(morph, form_results)

        db_forms = []
        for value, items in form_results.items():
            for form_params in items:
                if not form_params:
                    continue
                parts = [u"{{Форма-%s\n|язык=ru\n|база=%s\n" % (morph, page.title)]
                for param_name, param_value in form_params.items():
                    parts.append("|%s=%s\n" % (param_name, param_value))
                parts.append(u"|слоги={{по-слогам|%s}}\n" % value)
                parts.append("}}")
                db_forms.append(WordForm(
                    title=remove_stress(value),
                    base=page.title,
                    value=value,
                    template=u"".join(parts),
                ))
        # One INSERT batch for all forms produced by this template call.
        WordForm.objects.bulk_create(db_forms)
Ejemplo n.º 2
0
def _iter_word_forms(page, title, morph, params):
    """Yield an unsaved ``WordForm`` for each form produced by one template call.

    Args:
        page: page object; only ``page.title`` is read here.
        title: stripped name of the inflection template.
        morph: morphology tag captured from the template name.
        params: raw parameter text of the template call.

    Yields:
        ``WordForm`` instances (not saved) whose unstressed spelling differs
        from the page title. Nothing is yielded when no
        ``TemplateInflectionData`` record exists for ``title``.
    """
    call_params = process_call_params(params)
    try:
        template = TemplateInflectionData.objects.get(title=title)
    except ObjectDoesNotExist:
        # TODO: process template redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
        return
    tpl_forms = get_dict_from_text(template.forms)
    tpl_params = get_dict_from_text(template.params)
    # Parameters are expanded first because the form expansion reads them.
    for key, value in tpl_params.items():
        tpl_params[key] = universal_process_template(value, call_params)
    for key, value in tpl_forms.items():
        value = universal_process_template(value, call_params)
        value = process_template(value, tpl_params, key, morph)
        tpl_forms[key] = divide_words(value)

    # TODO: pages whose unstressed ``nom-sg`` form differs from the page title
    # are neither reported nor fixed yet.
    # TODO: find the forms that still contain Latin letters.
    form_results = dict()
    for key, values in tpl_forms.items():
        # Skip keys that produced nothing or a single empty string.
        if not values or len(values) == 1 and not values[0]:
            continue
        form_params = get_form_params(morph, key, tpl_params)
        if not form_params:
            continue
        for value in values:
            # The base form itself is not stored as a word form.
            if remove_stress(value) == page.title:
                continue
            form_results.setdefault(value, []).append(form_params.copy())

    # TODO: if they match completely, remove them as well.
    join_form_results(morph, form_results)

    for value, items in form_results.items():
        for form_params in items:
            if not form_params:
                continue
            form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title)
            for param_name, param_value in form_params.items():
                form_template += "|%s=%s\n" % (param_name, param_value)
            form_template += "}}"
            yield WordForm(
                title=remove_stress(value), base=page.title, value=value, template=form_template
            )


def process_slovoforms_new():
    """Scan every page for inflection-template calls and store their word forms.

    Iterates ``Page.objects.iterate()``, finds each template call matched by
    the pattern below in ``page.content``, and saves the resulting
    ``WordForm`` objects with ``bulk_create``. A batch is flushed as soon as
    it holds more than 1000 objects, and the remainder is flushed at the end.
    Progress is printed every 1000 pages and after every flushed batch.
    """
    print("process_slovoforms()")
    # Compiled once: the pattern does not depend on the page.
    p = re.compile(
        u"""(\{\{
                (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)  # заголовок
                (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
        flags=re.UNICODE | re.DOTALL | re.VERBOSE,
    )
    # Pages that call one of these templates are printed; they are still
    # processed like any other call.
    empty_templates = (
        u"сущ ru m ina",
        u"сущ ru f ina",
        u"сущ ru n ina",
        u"сущ ru m a",
        u"сущ ru f a",
        u"сущ ru n a",
    )
    db_forms = []
    db_counter = 0
    for i, page in enumerate(Page.objects.iterate(), 1):
        if not i % 1000:
            print("%s processed pages: %s" % (dt(), i))
        for m in p.finditer(page.content):
            title = m.group("title").strip()
            if title in empty_templates:
                print(page.title)
            for db_form in _iter_word_forms(page, title, m.group("morph"), m.group("params")):
                db_forms.append(db_form)
                db_counter += 1
                if len(db_forms) > 1000:
                    WordForm.objects.bulk_create(db_forms)
                    print("%s > forms added: %s" % (dt(), db_counter))
                    db_forms = []
    # Flush whatever is left in the last, partially filled batch.
    WordForm.objects.bulk_create(db_forms)