Beispiel #1
0
def process_slovoforms_old():
    """Derive word-form templates from the inflection template of every page.

    For each page yielded by ``Page.objects.iterate()`` the function:

    1. loads the page text from ``PageContent`` (pages without content are
       reported and skipped);
    2. finds the first ``{{сущ ru ...}}`` / ``{{гл ru ...}}`` call in the text
       and parses its ``|``-separated parameters;
    3. looks up the matching ``TemplateInflectionData`` row and expands its
       ``params`` and ``forms`` with the call parameters;
    4. maps every form key to grammatical parameters (``dest_params``) and
       groups them by form value, skipping values equal to the page title;
    5. merges entries of the same form value that differ only in the
       "exception" key (``падеж`` for nouns, ``лицо`` for verbs);
    6. builds a ``{{Форма-<morph>}}`` wikitext string for each remaining entry.

    NOTE(review): in the visible code the string built in step 6 is never
    stored, printed or returned; the function may be truncated here -- confirm.
    Progress and diagnostics are written to stdout with ``print``.
    """
    print "process_slovoforms()"
    i = 0
    # NOTE(review): `keys` collects every call-parameter name seen, but it is
    # never read in the visible code.
    keys = set()
    for page in Page.objects.iterate():
        i += 1
        # if i > 100:
        #    break
        # Progress line every 10000 pages.
        if not i % 10000:
            print dt(), i
        try:
            content = PageContent.objects.get(page=page).content
        except ObjectDoesNotExist:
            print u"× does not exist"
            continue
        # morph = u'сущ'
        # m = re.search(u'(\{\{(?P<title>гл ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})', content, flags=re.UNICODE + re.DOTALL)
        # todo: there may be several of these blocks (inflection templates)!!
        # Only the first noun/verb inflection template call on the page is used.
        m = re.search(
            u"(\{\{(?P<title>(?P<morph>сущ|гл) ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})",
            content,
            flags=re.UNICODE + re.DOTALL,
        )
        # print page.pk, m
        if m:
            title = m.group("title").strip()
            morph = m.group("morph")
            source_params = m.group("params")
            # Drop nested {{по-слогам ...}} / {{по слогам ...}} calls before
            # splitting the parameters on "|".
            source_params = re.sub(u"\{\{по[- ]слогам[^}]+\}\}", "", source_params)
            source_params = source_params.split("|")
            source_params = map(lambda x: x.strip(), source_params)
            # Remove empty parameters.
            source_params = filter(len, source_params)
            # print
            # print '=' * 20
            # print page.title
            # print title
            source_params = get_dict_from_lines(source_params)
            keys |= set(source_params.keys())
            # for key, value in source_params.items():
            #    print key, '=', value
            # print '-' * 20

            try:
                template = TemplateInflectionData.objects.get(title=title)
            except ObjectDoesNotExist:
                # print ('#' * 120 + '\n') * 20
                continue  # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
            forms = get_dict_from_text(template.forms)
            params = get_dict_from_text(template.params)
            # print template.forms
            # print '-' * 20
            # for key, value in forms.items():
            #     print key, '=', value
            # print '-' * 20

            # Expand every template parameter with the call parameters.
            for param, value in params.items():
                params[param] = process_template(value, source_params)
            # for key, value in params.items():
            #      print key, '=', value
            # print '-' * 20

            # Expand every form and split it into a list of word forms.
            for form, value in forms.items():
                value = process_template(value, source_params)
                forms[form] = divide_words(value)
                # todo: is what follows "буду" always the infinitive?
                # todo: do not add them to the results!!
                # NOTE(review): the prefix stripping below only feeds a
                # comparison whose body is `pass`; it does not change `forms`.
                future = [
                    u"буду/будешь… ",
                    u"буду/будешь... ",
                    u"буду, будешь, будем… ",
                    u"буду, будешь, будет… ",
                    u"буду, будешь, будет ",
                    u"будет ",
                    u"будет… ",
                    u"буду ",
                ]
                value = value.strip()
                for prefix in future:
                    if value[: len(prefix)] == prefix:
                        value = value[len(prefix) :]
                        if page.title != value.replace(u"́", ""):
                            pass
                            # print
                            # print page.title
                            # print value.replace(u'́', '')
                            # print repr(page.title)
                            # print repr(value.replace(u'́', ''))

            # For verbs, `sv` is the first character of the template's "вид"
            # parameter; verbs whose template has no such parameter are skipped.
            sv = ""
            if morph == u"гл":
                try:
                    sv = params[u"вид"][0]
                except KeyError:
                    continue  # todo: how it is possible?

            # Maps a form value to the list of parameter dicts describing it.
            results = dict()

            for key, values in forms.items():
                # if re.search('[a-z]', ' '.join(values)):  #todo! find them!
                #     print values
                # Skip forms that are missing or consist of one empty string.
                if not values or len(values) == 1 and not values[0]:
                    continue
                # print key, '=', ' | '.join(values).replace(u'́', '')

                dest_params = dict()
                if morph == u"гл":
                    # todo: if "Будущее" -> continue
                    # todo: person 1/2/3
                    # todo: gender by order of appearance (in the past tense)
                    if key in [u"Прич", u"ПричНаст", u"ПричПрош", u"ПричСтрад", u"ПричСтрадПрош"]:
                        # todo: take tense into account too?
                        dest_params[u"прич"] = 1
                    if key in [u"Деепр", u"ДеепрНаст", u"ДеепрПрош", u"ДеепрНастПрош"]:
                        # todo: take tense into account too?
                        dest_params[u"деепр"] = 1
                    if key in [u"Будущее"]:
                        pass
                    if u"(прош.)" in key:
                        dest_params[u"время"] = u"пр"
                    if u"(повел.)" in key:
                        dest_params[u"накл"] = u"п"
                    # Number and person are derived from substrings of the key.
                    if u"Мы" in key or u"Вы" in key or u"Они" in key:
                        dest_params[u"число"] = u"мн"
                    if u"Я" in key or u"Ты" in key or u"она" in key:
                        dest_params[u"число"] = u"ед"
                    if u"Я" in key or u"Мы" in key:
                        dest_params[u"лицо"] = u"1"
                    if u"Ты" in key or u"Вы" in key:
                        dest_params[u"лицо"] = u"2"
                    # Substring test: also true for keys containing "Они".
                    if u"Он" in key:
                        dest_params[u"лицо"] = u"3"
                    if params.get(u"возвратный", ""):
                        dest_params[u"залог"] = u"возвр"
                    # For the plain personal forms the tense depends on `sv`.
                    if key in [u"Я", u"Мы", u"Ты", u"Вы", u"Он/она/оно", u"Они"]:
                        if sv == u"н":
                            dest_params[u"время"] = u"наст"
                        elif sv == u"с":
                            dest_params[u"время"] = u"буд"
                        else:
                            pass  # todo: question - what if it is "2"?
                    dest_params[u"вид"] = sv  # todo: question - what if it is "2"?
                elif morph == u"сущ":
                    # Singular unless the key's number suffix says otherwise.
                    dest_params[u"число"] = u"ед"
                    special_cases = {u"П": u"притяжательного", u"Пр": u"превратительного", u"Сч": u"(счетная форма?)"}
                    if key in special_cases:
                        dest_params[u"падеж"] = special_cases[key]
                    else:
                        cases = {
                            "nom": u"именительного",
                            "gen": u"родительного",
                            "dat": u"дательного",
                            "acc": u"винительного",
                            "ins": u"творительного",
                            "prp": u"предложного",
                            "loc": u"местного",
                            "voc": u"звательного",
                            "prt": u"разделительного",
                        }
                        plurals = ["pl", "pl2"]
                        # NOTE(review): assumes every remaining key has the
                        # shape "<case>-<number>" with a case listed above;
                        # any other key raises ValueError / KeyError here.
                        case, plural = key.split("-")
                        if plural in plurals:
                            dest_params[u"число"] = u"мн"
                        dest_params[u"падеж"] = cases[case]
                    if key == "nom-sg":
                        value = values[0]
                        if page.title != remove_stress(value):
                            pass  # errors!
                            # print
                            # print page.title
                            # print remove_stress(value)
                            # print repr(page.title)
                            # print repr(remove_stress(value))

                # Register the parameters under every form value, except
                # values that equal the page title once stress is removed.
                for value in values:
                    if remove_stress(value) == page.title:
                        continue
                    results.setdefault(value, list())
                    results[value].append(dest_params.copy())

            # Per morphology: the keys that may differ between two entries of
            # the same form value that are still merged into one.
            exceptions = {u"сущ": [u"падеж"], u"гл": [u"лицо"]}

            # Debug helper; not called in the visible code.
            # NOTE(review): it iterates `results` and rebinds `items`, so the
            # argument passed in is ignored.
            def print_items(items):
                print
                for value, items in results.items():
                    print "— value:", remove_stress(value)
                    for i in range(len(items)):
                        item = items[i]
                        if item:
                            print "—  ", i, item.get(u"лицо", "-")
                        else:
                            print "—  ", i, item
                print

            # Merge pass: for each entry i look for another non-empty entry j
            # of the same form value that agrees on every non-exception key;
            # fold i's exception value into j and blank out entry i.
            for value, items in results.items():
                # print 'value =', remove_stress(value)
                for i in range(len(items)):
                    #    print i, 'start'
                    #    print_items(items)
                    dest_params = items[i]
                    # print ' dest_params', i
                    # print '', get_text_from_dict(dest_params).replace('\n', ' ')
                    for j in range(len(items)):
                        if i == j:
                            continue
                        another_params = items[j]
                        # Skip entries already merged away (None) or empty.
                        if not another_params:
                            continue
                        # print '  another_params', i
                        # print ' ', get_text_from_dict(another_params).replace('\n', ' ')
                        found = True
                        for key in set(dest_params.keys() + another_params.keys()):
                            if key not in exceptions[morph]:
                                if dest_params.get(key) != another_params.get(key):
                                    found = False
                                    break
                        if found:
                            if morph == u"гл":
                                # NOTE(review): raises KeyError if entry j has
                                # no "лицо" while entry i does -- confirm
                                # whether that combination can occur.
                                if u"лицо" in dest_params:
                                    another_params[u"лицо"] += dest_params[u"лицо"]
                            elif morph == u"сущ":
                                another_params[u"падеж"] += ", " + dest_params[u"падеж"]
                            items[i] = None
                            break
                    # print i, 'finish'
                    # print_items(items)

            # Build the wikitext template for every entry that survived the
            # merge pass.
            for value, items in results.items():
                for dest_params in items:
                    if not dest_params:
                        continue
                    form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title)
                    if morph == u"гл":
                        if u"лицо" in dest_params:
                            v = dest_params[u"лицо"]
                            # A merged value of three characters is normalised
                            # to "123" when it contains all three persons;
                            # any length other than 1 or 3 is reported.
                            if len(v) == 3:
                                if "1" in v and "2" in v and "3" in v:
                                    dest_params[u"лицо"] = "123"
                                else:
                                    print "### BAD VALUE FOR u'лицо':", v
                            elif len(v) != 1:
                                print "### BAD VALUE FOR u'лицо':", v
                                print page.title
                                print remove_stress(value)
                                print form_template
                    elif morph == u"сущ":
                        pass  # todo: sort the cases!
                    for param_name, param_value in dest_params.items():
                        form_template += "|%s=%s\n" % (param_name, param_value)
                    form_template += "}}"
Beispiel #2
0
    def tpl_action(self, page, tpl, title, morph, lang, params):
        title = title.strip()
        # empty_templates = [
        #     u"сущ ru m ina", u"сущ ru f ina", u"сущ ru n ina",
        #     u"сущ ru m a", u"сущ ru f a", u"сущ ru n a",
        # ]
        # if title in empty_templates:
        #     print page.title
        #if morph != u'сущ':
        #    continue
        # print '-' * 80

        # output
        # self.counter += 1
        # if not self.counter % 50:
        #     save_wiki_page(u"Участник:Vitalik/Словоформы/v2/А1/%s" % self.n,
        #                    self.content, u"Получение списка словоформ")
        #     self.n += 1
        #     self.content = ''
        # # print "\n\n== [[%s]] ==" % page.title
        # self.content += "\n\n== [[%s]] ==\n" % page.title

        # print title
        # return
        #continue
        call_params, call_numeric = process_call_params(params)
        try:
            template = TemplateInflection.objects.get(title=title)
        except ObjectDoesNotExist:
            return  # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
        if not template.forms:
            print title, '- maybe wrong template'
            return
        print title
        tpl_forms = get_dict_from_text(template.forms)
        tpl_params = get_dict_from_text(template.params)
        for key, value in tpl_params.items():
            tpl_params[key] = universal_process_template(value, call_params)
        for key, value in tpl_forms.items():
            value = universal_process_template(value, call_params)
            value = process_template(value, tpl_params, key, morph)
            value = divide_words(value)
            tpl_forms[key] = value

        form_results = dict()

        for key, values in tpl_forms.items():
            if morph == u'сущ':
                if key == 'nom-sg':
                    value = values[0]
                    if remove_stress(value) and page.title != remove_stress(value):
                        pass  # todo: мсправить их все-таки
                        # print
                        # print "https://ru.wiktionary.org/wiki/%s" % urllib.quote_plus(page.title.encode('utf-8'))
                        # print page.title
                        # print remove_stress(value)
                        # print repr(page.title)
                        # print repr(remove_stress(value))

        for key, values in tpl_forms.items():
            # if re.search('[a-z]', ' '.join(values)):  #todo! find them!
            #     print values
            if not values or len(values) == 1 and not values[0]:
                continue
            form_params = get_form_params(morph, key, tpl_params)
            if not form_params:
                continue
            for value in values:
                if remove_stress(value) == page.title:
                    continue
                form_results.setdefault(value, list())
                form_results[value].append(form_params.copy())

        # todo: если полностью совпали, то тоже удалять
        join_form_results(morph, form_results)

        db_forms = list()
        for value, items in form_results.items():
            for form_params in items:
                if not form_params:
                    continue
                form_template = u"{{Форма-%s\n|язык=ru\n|база=%s\n" % (morph, page.title)
                for param_name, param_value in form_params.items():
                    form_template += "|%s=%s\n" % (param_name, param_value)
                form_template += u"|слоги={{по-слогам|%s}}\n" % value
                form_template += "}}"

                # output
                # # print "'''[[%s]]'''" % remove_stress(value)
                # # print form_template
                # self.content += "'''[[%s]]'''\n" % remove_stress(value)
                # self.content += "%s\n" % form_template
                # # print

                # db_form, created = WordForm.objects.get_or_create(
                # # if created:

                # WordForm.objects.create(
                #     title=remove_stress(value),
                #     base=page.title,
                #     value=value,
                #     template=form_template
                # )

                db_form = WordForm(
                    title=remove_stress(value),
                    base=page.title,
                    value=value,
                    template=form_template
                )
                db_forms.append(db_form)

                # db_counter += 1
                # if len(db_forms) > 1000:
                #     WordForm.objects.bulk_create(db_forms)
                #     print dt(), '> forms added:', db_counter
                #     db_forms = []
        WordForm.objects.bulk_create(db_forms)
Beispiel #3
0
def process_slovoforms_new():
    print "process_slovoforms()"
    i = 0
    db_forms = []
    db_counter = 0
    for page in Page.objects.iterate():
        i += 1
        if not i % 1000:
            print dt(), "processed pages:", i
        # try:
        #    content = PageContent.objects.get(page=page).content
        # except ObjectDoesNotExist:
        #    print u'× does not exist'
        #    continue
        content = page.content
        p = re.compile(
            u"""(\{\{
                (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)  # заголовок
                (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
                \}\})""",
            flags=re.UNICODE + re.DOTALL + re.VERBOSE,
        )
        parts = p.findall(content)
        # if page.title != u'житься':
        #    continue
        # if page.title != u'плавни':
        #    continue
        # print page.title
        # for part in parts:
        #    print '==='
        #    for i in part:
        #        print i
        #        print '---'
        #    print '==='
        for part in parts:
            # print part[0]
            m = p.search(part[0])
            if m:
                title = m.group("title").strip()
                empty_templates = [
                    u"сущ ru m ina",
                    u"сущ ru f ina",
                    u"сущ ru n ina",
                    u"сущ ru m a",
                    u"сущ ru f a",
                    u"сущ ru n a",
                ]
                if title in empty_templates:
                    print page.title

                morph = m.group("morph")
                # if morph != u'сущ':
                #    continue
                # print page.title
                # continue
                call_params = process_call_params(m.group("params"))
                try:
                    template = TemplateInflectionData.objects.get(title=title)
                except ObjectDoesNotExist:
                    continue  # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
                tpl_forms = get_dict_from_text(template.forms)
                tpl_params = get_dict_from_text(template.params)
                for key, value in tpl_params.items():
                    tpl_params[key] = universal_process_template(value, call_params)
                for key, value in tpl_forms.items():
                    value = universal_process_template(value, call_params)
                    value = process_template(value, tpl_params, key, morph)
                    value = divide_words(value)
                    tpl_forms[key] = value

                form_results = dict()

                for key, values in tpl_forms.items():
                    if morph == u"сущ":
                        if key == "nom-sg":
                            value = values[0]
                            if remove_stress(value) and page.title != remove_stress(value):
                                pass  # todo: мсправить их все-таки
                                # print
                                # print "https://ru.wiktionary.org/wiki/%s" % urllib.quote_plus(page.title.encode('utf-8'))
                                # print page.title
                                # print remove_stress(value)
                                # print repr(page.title)
                                # print repr(remove_stress(value))

                for key, values in tpl_forms.items():
                    # if re.search('[a-z]', ' '.join(values)):  #todo! find them!
                    #     print values
                    if not values or len(values) == 1 and not values[0]:
                        continue
                    form_params = get_form_params(morph, key, tpl_params)
                    if not form_params:
                        continue
                    for value in values:
                        if remove_stress(value) == page.title:
                            continue
                        form_results.setdefault(value, list())
                        form_results[value].append(form_params.copy())

                # todo: если полностью совпали, то тоже удалять
                join_form_results(morph, form_results)

                for value, items in form_results.items():
                    for form_params in items:
                        if not form_params:
                            continue
                        form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title)
                        for param_name, param_value in form_params.items():
                            form_template += "|%s=%s\n" % (param_name, param_value)
                        form_template += "}}"
                        # print remove_stress(value)
                        # print form_template
                        # print
                        # db_form, created = WordForm.objects.get_or_create(
                        db_form = WordForm(
                            title=remove_stress(value), base=page.title, value=value, template=form_template
                        )
                        # if created:
                        db_forms.append(db_form)
                        db_counter += 1
                        if len(db_forms) > 1000:
                            WordForm.objects.bulk_create(db_forms)
                            print dt(), "> forms added:", db_counter
                            db_forms = []
    WordForm.objects.bulk_create(db_forms)