def process_call_params(call_params):
    """Turn the raw parameter text of a template call into parsed values.

    Any nested ``{{по-слогам ...}}`` / ``{{по слогам ...}}`` template is cut
    out of the text first, the remainder is split on ``|``, every piece is
    stripped of surrounding whitespace and empty pieces are discarded.

    :param call_params: raw parameter text (the part between the template
        name and the closing braces).
    :return: the two values produced by
        ``get_dict_from_lines(pieces, get_numeric=True)``, unchanged and in
        the same order.
    """
    without_syllables = re.sub(u"\{\{по[- ]слогам[^}]+\}\}", "", call_params)
    stripped_pieces = [piece.strip() for piece in without_syllables.split("|")]
    non_empty_pieces = [piece for piece in stripped_pieces if len(piece)]
    parsed_params, parsed_numeric = get_dict_from_lines(
        non_empty_pieces, get_numeric=True
    )
    return parsed_params, parsed_numeric
def process_slovoforms_old():
    """Walk every page and assemble ``{{Форма-...}}`` template text for its word forms.

    NOTE(review): this function reached review with its line breaks and
    indentation collapsed.  The nesting below is reconstructed from the
    statement order; confirm it against version control before relying on it.

    Per page, the visible code does the following:

    1. Loads the page content; pages without content are reported and skipped.
    2. Finds the FIRST ``{{сущ ru ...}}`` (noun) or ``{{гл ru ...}}`` (verb)
       inflection template call in the content and parses its parameters.
    3. Looks up the matching ``TemplateInflectionData`` row and expands its
       ``params`` and ``forms`` with ``process_template`` using the parsed
       call parameters; forms are then passed through ``divide_words``.
    4. For every non-empty form, builds a ``dest_params`` dict of grammatical
       attributes derived from the form key, and records it in ``results``
       under each form value that differs from the page title (stress marks
       ignored).
    5. Merges entries of the same form value that differ only in the
       "exception" attribute (``лицо`` for verbs, ``падеж`` for nouns).
    6. Builds the ``{{Форма-<morph> ...}}`` text for every remaining entry.

    Within the visible code ``keys`` is only accumulated and ``form_template``
    is only built; neither is read afterwards.  The function has no ``return``
    statement in the visible code.
    """
    print "process_slovoforms()"
    i = 0  # number of pages visited so far, used for progress output
    keys = set()  # union of all parameter names seen in template calls
    for page in Page.objects.iterate():
        i += 1
        # if i > 100:
        #     break
        if not i % 10000:
            # progress line every 10000 pages
            print dt(), i
        try:
            content = PageContent.objects.get(page=page).content
        except ObjectDoesNotExist:
            print u"× does not exist"
            continue
        # morph = u'сущ'
        # m = re.search(u'(\{\{(?P<title>гл ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})', content, flags=re.UNICODE + re.DOTALL)
        # todo: there may be several of these blocks (inflection templates)!!
        # Groups: "title" is the template name ("сущ ru ..." / "гл ru ..."),
        # "morph" is its first word, "params" is everything up to the closing
        # braces (allowing one nested {{...}} inside).
        m = re.search(
            u"(\{\{(?P<title>(?P<morph>сущ|гл) ru [^|]+)(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)\}\})",
            content,
            flags=re.UNICODE + re.DOTALL,
        )
        # print page.pk, m
        if m:
            title = m.group("title").strip()
            morph = m.group("morph")
            # Same clean-up sequence as process_call_params(): drop the nested
            # "по-слогам" template, split on "|", strip, discard empty pieces.
            source_params = m.group("params")
            source_params = re.sub(u"\{\{по[- ]слогам[^}]+\}\}", "", source_params)
            source_params = source_params.split("|")
            source_params = map(lambda x: x.strip(), source_params)
            source_params = filter(len, source_params)
            # print
            # print '=' * 20
            # print page.title
            # print title
            source_params = get_dict_from_lines(source_params)
            keys |= set(source_params.keys())
            # for key, value in source_params.items():
            #     print key, '=', value
            # print '-' * 20
            try:
                template = TemplateInflectionData.objects.get(title=title)
            except ObjectDoesNotExist:
                # print ('#' * 120 + '\n') * 20
                continue
            # todo: process templates redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
            forms = get_dict_from_text(template.forms)
            params = get_dict_from_text(template.params)
            # print template.forms
            # print '-' * 20
            # for key, value in forms.items():
            #     print key, '=', value
            # print '-' * 20
            for param, value in params.items():
                params[param] = process_template(value, source_params)
            # for key, value in params.items():
            #     print key, '=', value
            # print '-' * 20
            for form, value in forms.items():
                value = process_template(value, source_params)
                forms[form] = divide_words(value)
                # todo: is "буду" always followed by the infinitive?
                # todo: do not add them to the results!!

                # NOTE(review): the prefix stripping below only changes the
                # local `value`; forms[form] was already stored above and the
                # comparison that follows has an empty body, so this section
                # has no visible effect.  Its placement inside this loop is
                # part of the reconstruction -- confirm.
                future = [
                    u"буду/будешь… ",
                    u"буду/будешь... ",
                    u"буду, будешь, будем… ",
                    u"буду, будешь, будет… ",
                    u"буду, будешь, будет ",
                    u"будет ",
                    u"будет… ",
                    u"буду ",
                ]
                value = value.strip()
                for prefix in future:
                    if value[: len(prefix)] == prefix:
                        value = value[len(prefix) :]
                if page.title != value.replace(u"́", ""):
                    pass
                    # print
                    # print page.title
                    # print value.replace(u'́', '')
                    # print repr(page.title)
                    # print repr(value.replace(u'́', ''))
            # First element of the "вид" parameter; it is compared with u"н"
            # and u"с" further down (presumably imperfective / perfective
            # aspect -- confirm).
            sv = ""
            if morph == u"гл":
                try:
                    sv = params[u"вид"][0]
                except KeyError:
                    continue  # todo: how it is possible?
            # form value -> list of attribute dicts (one per form key that
            # produced this value)
            results = dict()
            for key, values in forms.items():
                # if re.search('[a-z]', ' '.join(values)):  #todo! find them!
                #     print values
                # Skip forms with no values or with a single empty value.
                if not values or len(values) == 1 and not values[0]:
                    continue
                # print key, '=', ' | '.join(values).replace(u'́', '')
                dest_params = dict()
                if morph == u"гл":
                    # todo: if "Будущее" -> continue
                    # todo: 1st/2nd/3rd person
                    # todo: gender by order of appearance (in the past tense)
                    if key in [u"Прич", u"ПричНаст", u"ПричПрош", u"ПричСтрад", u"ПричСтрадПрош"]:  # todo: take tense into account as well?
                        dest_params[u"прич"] = 1
                    if key in [u"Деепр", u"ДеепрНаст", u"ДеепрПрош", u"ДеепрНастПрош"]:  # todo: take tense into account as well?
                        dest_params[u"деепр"] = 1
                    if key in [u"Будущее"]:
                        pass
                    if u"(прош.)" in key:
                        dest_params[u"время"] = u"пр"
                    if u"(повел.)" in key:
                        dest_params[u"накл"] = u"п"
                    # Number ("число") is derived from the pronoun in the key;
                    # the lowercase "она" test matches keys such as "Он/она/оно".
                    if u"Мы" in key or u"Вы" in key or u"Они" in key:
                        dest_params[u"число"] = u"мн"
                    if u"Я" in key or u"Ty" in key or u"она" in key:
                        dest_params[u"число"] = u"ед"
                    # Person ("лицо"); note that `u"Он" in key` is a substring
                    # test and therefore also matches "Они".
                    if u"Я" in key or u"Мы" in key:
                        dest_params[u"лицо"] = u"1"
                    if u"Ты" in key or u"Вы" in key:
                        dest_params[u"лицо"] = u"2"
                    if u"Он" in key:
                        dest_params[u"лицо"] = u"3"
                    if params.get(u"возвратный", ""):
                        dest_params[u"залог"] = u"возвр"
                    # Plain personal forms take their tense from `sv`:
                    # u"н" -> u"наст", u"с" -> u"буд".
                    if key in [u"Я", u"Мы", u"Ты", u"Вы", u"Он/она/оно", u"Они"]:
                        if sv == u"н":
                            dest_params[u"время"] = u"наст"
                        elif sv == u"с":
                            dest_params[u"время"] = u"буд"
                        else:
                            pass  # todo: question: what if it is "2" there?
                    dest_params[u"вид"] = sv  # todo: question: what if it is "2" there?

                elif morph == u"сущ":
                    # Singular by default; switched to plural below when the
                    # key's second part is "pl" or "pl2".
                    dest_params[u"число"] = u"ед"
                    special_cases = {u"П": u"притяжательного", u"Пр": u"превратительного", u"Сч": u"(счетная форма?)"}
                    if key in special_cases:
                        dest_params[u"падеж"] = special_cases[key]
                    else:
                        # Case code -> case name as written into the "падеж"
                        # attribute.
                        cases = {
                            "nom": u"именительного",
                            "gen": u"родительного",
                            "dat": u"дательного",
                            "acc": u"винительного",
                            "ins": u"творительного",
                            "prp": u"предложного",
                            "loc": u"местного",
                            "voc": u"звательного",
                            "prt": u"разделительного",
                        }
                        plurals = ["pl", "pl2"]
                        # Keys outside special_cases are expected to look like
                        # "<case>-<number>"; anything else raises ValueError
                        # (unpacking) or KeyError (unknown case) here.
                        case, plural = key.split("-")
                        if plural in plurals:
                            dest_params[u"число"] = u"мн"
                        dest_params[u"падеж"] = cases[case]
                    # NOTE(review): nesting of this check under the noun
                    # branch is reconstructed; its body is empty either way.
                    if key == "nom-sg":
                        value = values[0]
                        if page.title != remove_stress(value):
                            pass  # errors!
                            # print
                            # print page.title
                            # print remove_stress(value)
                            # print repr(page.title)
                            # print repr(remove_stress(value))
                for value in values:
                    # A form identical to the page title (ignoring stress) is
                    # not recorded.
                    if remove_stress(value) == page.title:
                        continue
                    results.setdefault(value, list())
                    results[value].append(dest_params.copy())
            # Per part of speech: the attribute that may differ between two
            # entries that are still merged into one.
            exceptions = {u"сущ": [u"падеж"], u"гл": [u"лицо"]}

            # Debug helper; only referenced from commented-out calls below.
            # It ignores its argument and dumps the whole `results` mapping.
            def print_items(items):
                print
                for value, items in results.items():
                    print "— value:", remove_stress(value)
                    for i in range(len(items)):
                        item = items[i]
                        if item:
                            print "— ", i, item.get(u"лицо", "-")
                        else:
                            print "— ", i, item
                print

            # Merge pass: for each entry i, look for another live entry j that
            # agrees on every attribute except the "exception" one; fold i
            # into j and blank out slot i.
            for value, items in results.items():
                # print 'value =', remove_stress(value)
                for i in range(len(items)):
                    # print i, 'start'
                    # print_items(items)
                    dest_params = items[i]
                    # print '  dest_params', i
                    # print '', get_text_from_dict(dest_params).replace('\n', ' ')
                    for j in range(len(items)):
                        if i == j:
                            continue
                        another_params = items[j]
                        if not another_params:
                            # slot j was already merged away (set to None)
                            continue
                        # print '  another_params', i
                        # print '  ', get_text_from_dict(another_params).replace('\n', ' ')
                        found = True
                        for key in set(dest_params.keys() + another_params.keys()):
                            if key not in exceptions[morph]:
                                if dest_params.get(key) != another_params.get(key):
                                    found = False
                                    break
                        if found:
                            if morph == u"гл":
                                # person strings are concatenated, e.g. "1" + "2"
                                if u"лицо" in dest_params:
                                    another_params[u"лицо"] += dest_params[u"лицо"]
                            elif morph == u"сущ":

                                another_params[u"падеж"] += ", " + dest_params[u"падеж"]
                            # NOTE(review): placement of the next two lines
                            # directly under `if found:` is reconstructed.
                            items[i] = None
                            break
                    # print i, 'finish'
                    # print_items(items)
            # Output pass: build the template text for every surviving entry.
            for value, items in results.items():
                for dest_params in items:
                    if not dest_params:
                        continue
                    form_template = u"{{Форма-%s\n|база=%s\n" % (morph, page.title)
                    if morph == u"гл":
                        if u"лицо" in dest_params:
                            v = dest_params[u"лицо"]
                            if len(v) == 3:
                                # all three persons merged: normalise the order
                                if "1" in v and "2" in v and "3" in v:
                                    dest_params[u"лицо"] = "123"
                                else:
                                    print "### BAD VALUE FOR u'лицо':", v
                            elif len(v) != 1:
                                # NOTE(review): the three context prints are
                                # assumed to belong to this branch -- confirm.
                                print "### BAD VALUE FOR u'лицо':", v
                                print page.title
                                print remove_stress(value)
                                print form_template
                    elif morph == u"сущ":
                        pass  # todo: sort the cases!
                    for param_name, param_value in dest_params.items():
                        form_template += "|%s=%s\n" % (param_name, param_value)
                    form_template += "}}"