Exemple #1
0
def process_call_params(call_params):
    """Turn the raw parameter text of a template call into parsed parameters.

    Every ``{{по-слогам ...}}`` / ``{{по слогам ...}}`` sub-template is removed
    from the text, the remainder is split on ``|``, each piece is stripped of
    surrounding whitespace and empty pieces are dropped.  The surviving pieces
    are handed to ``get_dict_from_lines`` with ``get_numeric=True``.

    Returns the two values produced by ``get_dict_from_lines`` as a tuple
    (presumably the named parameters and the numeric/positional ones --
    confirm against that helper).
    """
    without_syllables = re.sub(u"\{\{по[- ]слогам[^}]+\}\}", "", call_params)
    stripped_pieces = (piece.strip() for piece in without_syllables.split("|"))
    # filter(len, ...) keeps only the pieces with a non-zero length.
    non_empty_pieces = filter(len, stripped_pieces)
    named, numeric = get_dict_from_lines(non_empty_pieces, get_numeric=True)
    return named, numeric
Exemple #2
0
def process_slovoforms_old():
    # Scan every Page, find the first Russian noun ("сущ ru ...") or verb
    # ("гл ru ...") inflection template call in its content, expand the stored
    # inflection template with the call's parameters and build the text of a
    # "{{Форма-...}}" template for every resulting word form.
    #
    # Takes no arguments.  Within the lines visible here nothing is returned
    # and the generated template text is not stored anywhere; progress and
    # diagnostics are printed to stdout.
    #
    # This is Python 2 code (print statements, list-returning dict.keys()).
    print "process_slovoforms()"
    i = 0  # number of pages visited so far (used for progress output)
    keys = set()  # every parameter name seen in any template call
    for page in Page.objects.iterate():
        i += 1
        # if i > 100:
        #    break
        # Progress report every 10000 pages.
        if not i % 10000:
            print dt(), i
        try:
            content = PageContent.objects.get(page=page).content
        except ObjectDoesNotExist:
            print u"× does not exist"
            continue
        # morph = u'сущ'
        # m = re.search(u'(\{\{(?P<title>гл ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})', content, flags=re.UNICODE + re.DOTALL)
        # todo: there may be several of these blocks (inflection templates)!!
        # Only the first matching template call is taken (re.search).  The
        # "params" group tolerates at most one nested {{...}} template.
        m = re.search(
            u"(\{\{(?P<title>(?P<morph>сущ|гл) ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})",
            content,
            flags=re.UNICODE + re.DOTALL,
        )
        # print page.pk, m
        if m:
            title = m.group("title").strip()
            morph = m.group("morph")  # u"сущ" (noun) or u"гл" (verb)
            source_params = m.group("params")
            # Drop the nested syllable-split template, then split the call
            # into stripped, non-empty "|"-separated pieces.
            source_params = re.sub(u"\{\{по[- ]слогам[^}]+\}\}", "", source_params)
            source_params = source_params.split("|")
            source_params = map(lambda x: x.strip(), source_params)
            source_params = filter(len, source_params)
            # print
            # print '=' * 20
            # print page.title
            # print title
            source_params = get_dict_from_lines(source_params)
            keys |= set(source_params.keys())
            # for key, value in source_params.items():
            #    print key, '=', value
            # print '-' * 20

            # Look up the stored inflection template by the title used in the
            # call; pages whose template is unknown are skipped.
            try:
                template = TemplateInflectionData.objects.get(title=title)
            except ObjectDoesNotExist:
                # print ('#' * 120 + '\n') * 20
                continue  # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
            forms = get_dict_from_text(template.forms)
            params = get_dict_from_text(template.params)
            # print template.forms
            # print '-' * 20
            # for key, value in forms.items():
            #     print key, '=', value
            # print '-' * 20

            # Expand each template parameter using the call's parameters.
            for param, value in params.items():
                params[param] = process_template(value, source_params)
            # for key, value in params.items():
            #      print key, '=', value
            # print '-' * 20

            # Expand each form and store the result of divide_words() for it.
            for form, value in forms.items():
                value = process_template(value, source_params)
                forms[form] = divide_words(value)
                # todo: is "буду" always followed by the infinitive?
                # todo: do not add them to the results!!
                # Prefixes of analytic future forms.  The check below only
                # touches the local `value` and ends in `pass`, so it has no
                # effect on `forms`; it is a disabled diagnostic.
                future = [
                    u"буду/будешь… ",
                    u"буду/будешь... ",
                    u"буду, будешь, будем… ",
                    u"буду, будешь, будет… ",
                    u"буду, будешь, будет ",
                    u"будет ",
                    u"будет… ",
                    u"буду ",
                ]
                value = value.strip()
                for prefix in future:
                    if value[: len(prefix)] == prefix:
                        value = value[len(prefix) :]
                        if page.title != value.replace(u"́", ""):
                            pass
                            # print
                            # print page.title
                            # print value.replace(u'́', '')
                            # print repr(page.title)
                            # print repr(value.replace(u'́', ''))

            # For verbs, `sv` is the first element of the template's "вид"
            # (aspect) parameter; the page is skipped if it is missing.
            sv = ""
            if morph == u"гл":
                try:
                    sv = params[u"вид"][0]
                except KeyError:
                    continue  # todo: how it is possible?

            # Maps each word form to the list of parameter dicts (one per
            # form key that produced this word form).
            results = dict()

            for key, values in forms.items():
                # if re.search('[a-z]', ' '.join(values)):  #todo! find them!
                #     print values
                # Skip form keys with no values or with a single empty value.
                if not values or len(values) == 1 and not values[0]:
                    continue
                # print key, '=', ' | '.join(values).replace(u'́', '')

                # Parameters of the "Форма-..." template derived from the
                # form key.
                dest_params = dict()
                if morph == u"гл":
                    # todo: if "Будущее" -> continue
                    # todo: person 1/2/3
                    # todo: gender by order of appearance (in the past tense)
                    if key in [u"Прич", u"ПричНаст", u"ПричПрош", u"ПричСтрад", u"ПричСтрадПрош"]:
                        # todo: take the tense into account as well?
                        dest_params[u"прич"] = 1
                    if key in [u"Деепр", u"ДеепрНаст", u"ДеепрПрош", u"ДеепрНастПрош"]:
                        # todo: take the tense into account as well?
                        dest_params[u"деепр"] = 1
                    if key in [u"Будущее"]:
                        pass
                    if u"(прош.)" in key:
                        dest_params[u"время"] = u"пр"
                    if u"(повел.)" in key:
                        dest_params[u"накл"] = u"п"
                    # Number ("число") and person ("лицо") are derived from
                    # the pronoun that appears in the form key.
                    if u"Мы" in key or u"Вы" in key or u"Они" in key:
                        dest_params[u"число"] = u"мн"
                    if u"Я" in key or u"Ты" in key or u"она" in key:
                        dest_params[u"число"] = u"ед"
                    if u"Я" in key or u"Мы" in key:
                        dest_params[u"лицо"] = u"1"
                    if u"Ты" in key or u"Вы" in key:
                        dest_params[u"лицо"] = u"2"
                    # NOTE: substring test, so this is also true for keys
                    # containing u"Они".
                    if u"Он" in key:
                        dest_params[u"лицо"] = u"3"
                    if params.get(u"возвратный", ""):
                        dest_params[u"залог"] = u"возвр"
                    # Bare pronoun keys: the tense depends on `sv`.
                    if key in [u"Я", u"Мы", u"Ты", u"Вы", u"Он/она/оно", u"Они"]:
                        if sv == u"н":
                            dest_params[u"время"] = u"наст"
                        elif sv == u"с":
                            dest_params[u"время"] = u"буд"
                        else:
                            pass  # todo: question - what if the value there is "2"?
                    dest_params[u"вид"] = sv  # todo: question - what if the value there is "2"?
                elif morph == u"сущ":
                    # Singular by default; switched to plural below when the
                    # key's second part is "pl" or "pl2".
                    dest_params[u"число"] = u"ед"
                    special_cases = {u"П": u"притяжательного", u"Пр": u"превратительного", u"Сч": u"(счетная форма?)"}
                    if key in special_cases:
                        dest_params[u"падеж"] = special_cases[key]
                    else:
                        cases = {
                            "nom": u"именительного",
                            "gen": u"родительного",
                            "dat": u"дательного",
                            "acc": u"винительного",
                            "ins": u"творительного",
                            "prp": u"предложного",
                            "loc": u"местного",
                            "voc": u"звательного",
                            "prt": u"разделительного",
                        }
                        plurals = ["pl", "pl2"]
                        # NOTE(review): the unpacking raises ValueError unless
                        # the key contains exactly one "-", and cases[case]
                        # raises KeyError for an unlisted case; neither is
                        # caught here.
                        case, plural = key.split("-")
                        if plural in plurals:
                            dest_params[u"число"] = u"мн"
                        dest_params[u"падеж"] = cases[case]
                    if key == "nom-sg":
                        value = values[0]
                        if page.title != remove_stress(value):
                            pass  # errors!
                            # print
                            # print page.title
                            # print remove_stress(value)
                            # print repr(page.title)
                            # print repr(remove_stress(value))

                # Record the parameters for every word form that differs from
                # the page title once stress marks are removed.
                for value in values:
                    if remove_stress(value) == page.title:
                        continue
                    results.setdefault(value, list())
                    results[value].append(dest_params.copy())

            # Parameter names ignored when deciding whether two entries of the
            # same word form can be merged; their values are combined instead.
            exceptions = {u"сущ": [u"падеж"], u"гл": [u"лицо"]}

            # Debug helper; it is not called anywhere in the visible code.
            # NOTE(review): the `items` argument is immediately shadowed by
            # the loop variable, so the helper always dumps `results` from the
            # enclosing scope.
            def print_items(items):
                print
                for value, items in results.items():
                    print "— value:", remove_stress(value)
                    for i in range(len(items)):
                        item = items[i]
                        if item:
                            print "—  ", i, item.get(u"лицо", "-")
                        else:
                            print "—  ", i, item
                print

            # Merge entries of one word form that agree on every parameter
            # except the excluded one: entry i is folded into entry j and its
            # slot is set to None.
            for value, items in results.items():
                # print 'value =', remove_stress(value)
                # NOTE(review): this loop reuses `i`, the page counter of the
                # outer loop, so the progress output above no longer reflects
                # the number of pages processed.
                for i in range(len(items)):
                    #    print i, 'start'
                    #    print_items(items)
                    dest_params = items[i]
                    # print ' dest_params', i
                    # print '', get_text_from_dict(dest_params).replace('\n', ' ')
                    for j in range(len(items)):
                        if i == j:
                            continue
                        another_params = items[j]
                        # Skips entries already merged away (None).
                        if not another_params:
                            continue
                        # print '  another_params', i
                        # print ' ', get_text_from_dict(another_params).replace('\n', ' ')
                        found = True
                        # List concatenation of keys() works on Python 2 only.
                        for key in set(dest_params.keys() + another_params.keys()):
                            if key not in exceptions[morph]:
                                if dest_params.get(key) != another_params.get(key):
                                    found = False
                                    break
                        if found:
                            # NOTE(review): both branches index
                            # another_params (and, for nouns, dest_params)
                            # directly, so a KeyError is raised if the
                            # excluded key is absent from one of them.
                            if morph == u"гл":
                                if u"лицо" in dest_params:
                                    another_params[u"лицо"] += dest_params[u"лицо"]
                            elif morph == u"сущ":
                                another_params[u"падеж"] += ", " + dest_params[u"падеж"]
                            items[i] = None
                            break
                    # print i, 'finish'
                    # print_items(items)

            # Build the "{{Форма-<morph> ...}}" template text for every entry
            # that survived the merge.
            for value, items in results.items():
                for dest_params in items:
                    if not dest_params:
                        continue
                    form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title)
                    if morph == u"гл":
                        # Validate the merged person string: three characters
                        # must be exactly 1, 2 and 3 in any order (normalised
                        # to "123"); any other length than 1 is reported.
                        if u"лицо" in dest_params:
                            v = dest_params[u"лицо"]
                            if len(v) == 3:
                                if "1" in v and "2" in v and "3" in v:
                                    dest_params[u"лицо"] = "123"
                                else:
                                    print "### BAD VALUE FOR u'лицо':", v
                            elif len(v) != 1:
                                print "### BAD VALUE FOR u'лицо':", v
                                print page.title
                                print remove_stress(value)
                                print form_template
                    elif morph == u"сущ":
                        pass  # todo: sort the cases!
                    for param_name, param_value in dest_params.items():
                        form_template += "|%s=%s\n" % (param_name, param_value)
                    # NOTE(review): form_template is complete here but is not
                    # used in the lines visible in this chunk.
                    form_template += "}}"