def tpl_action(self, page, tpl, title, morph, lang, params):
    """Expand one inflection-template call into ``WordForm`` rows and store them.

    The template record is looked up by ``title`` in ``TemplateInflection``;
    its ``forms`` and ``params`` texts are parsed with ``get_dict_from_text``
    and expanded with the call parameters.  Every resulting form whose
    unstressed spelling differs from ``page.title`` becomes one ``WordForm``
    carrying a ready-made ``{{Форма-<morph> ...}}`` wiki template.  All rows
    are written with a single ``bulk_create``.

    Args:
        page: object exposing ``title`` (the base word the forms belong to).
        tpl: not used by this method.
        title: name of the inflection template; surrounding whitespace is
            stripped before the lookup.
        morph: morphology tag; it is passed to the helper functions and
            interpolated into the generated ``Форма-<morph>`` template name.
        lang: not used by this method.
        params: raw template-call parameters, handed to
            ``process_call_params``.

    Returns:
        None.  Returns early (creating nothing) when no template record
        exists for ``title`` or when the record has no ``forms`` text.

    Side effects:
        Prints ``title`` (or a "maybe wrong template" notice) to stdout and
        inserts rows through ``WordForm.objects.bulk_create``.
    """
    title = title.strip()

    # process_call_params yields two values; only the first one is consumed here.
    call_params, _call_numeric = process_call_params(params)

    try:
        template = TemplateInflection.objects.get(title=title)
    except ObjectDoesNotExist:
        # TODO: process template redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
        return

    if not template.forms:
        # Single-argument form: prints the same text under the Python 2 print
        # statement and remains valid syntax for Python 3.
        print("%s - maybe wrong template" % title)
        return
    print(title)

    tpl_forms = get_dict_from_text(template.forms)
    tpl_params = get_dict_from_text(template.params)

    # Values are replaced in place (no keys added or removed); list() keeps
    # the iteration independent of the dict being updated.
    for key, value in list(tpl_params.items()):
        tpl_params[key] = universal_process_template(value, call_params)
    for key, value in list(tpl_forms.items()):
        value = universal_process_template(value, call_params)
        value = process_template(value, tpl_params, key, morph)
        tpl_forms[key] = divide_words(value)

    # TODO: report pages whose title differs from the unstressed "nom-sg"
    # form.  No such check is performed at the moment; note that the form
    # list may be empty, so any future check must not index values[0] blindly.

    # Map each form spelling to the list of parameter dicts describing it.
    form_results = {}
    for key, values in tpl_forms.items():
        # TODO: find values that contain Latin letters.
        if not values or (len(values) == 1 and not values[0]):
            continue
        form_params = get_form_params(morph, key, tpl_params)
        if not form_params:
            continue
        for value in values:
            # The base word itself is not stored as a word form.
            if remove_stress(value) == page.title:
                continue
            form_results.setdefault(value, []).append(form_params.copy())

    # TODO: entries that coincide completely should be removed as well.
    # The return value is ignored, so this presumably edits form_results in place.
    join_form_results(morph, form_results)

    db_forms = []
    for value, items in form_results.items():
        for form_params in items:
            if not form_params:
                continue
            pieces = [u"{{Форма-%s\n|язык=ru\n|база=%s\n" % (morph, page.title)]
            pieces.extend(
                "|%s=%s\n" % (param_name, param_value)
                for param_name, param_value in form_params.items()
            )
            pieces.append(u"|слоги={{по-слогам|%s}}\n" % value)
            pieces.append("}}")
            db_forms.append(WordForm(
                title=remove_stress(value),
                base=page.title,
                value=value,
                template=u"".join(pieces),
            ))

    WordForm.objects.bulk_create(db_forms)
def process_slovoforms_new():
    """Generate ``WordForm`` rows for every inflection-template call on every page.

    For each page from ``Page.objects.iterate()`` the function scans
    ``page.content`` for ``{{сущ ru ...}}`` / ``{{гл ru ...}}`` template calls,
    looks the template up in ``TemplateInflectionData`` and expands its
    ``forms`` / ``params`` texts with the call parameters.  Each resulting
    form whose unstressed spelling differs from ``page.title`` becomes a
    ``WordForm`` with a generated ``{{Форма-<morph> ...}}`` wiki template.

    Rows are accumulated and written with ``WordForm.objects.bulk_create``
    whenever more than 1000 are pending, and once more at the end.

    Returns:
        None.

    Side effects:
        Prints a start banner, a progress line every 1000 pages, the title of
        every page that uses one of the parameterless noun templates, and a
        line after each intermediate batch insert.
    """
    print("process_slovoforms()")

    # Both objects are loop-invariant, so they are built once up front.
    # Named groups: "title" is the template name, "morph" its leading tag,
    # "params" everything up to the closing braces (one nested {{...}} allowed).
    call_re = re.compile(
        u"""(\{\{
            (?P<title>(?P<morph>сущ|гл)[ ]ru[ ][^|]+)  # заголовок
            (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
        \}\})""",
        flags=re.UNICODE | re.DOTALL | re.VERBOSE,
    )
    empty_templates = (
        u"сущ ru m ina", u"сущ ru f ina", u"сущ ru n ina",
        u"сущ ru m a", u"сущ ru f a", u"сущ ru n a",
    )

    db_forms = []
    db_counter = 0
    for i, page in enumerate(Page.objects.iterate(), 1):
        if not i % 1000:
            print("%s processed pages: %s" % (dt(), i))

        # finditer yields the match objects directly; re-searching each
        # matched substring with the same pattern would give the same groups.
        for m in call_re.finditer(page.content):
            title = m.group("title").strip()
            morph = m.group("morph")
            if title in empty_templates:
                # Only reported; processing of the call continues below.
                print(page.title)

            # NOTE(review): unpacked as a pair to agree with tpl_action in
            # this file, which consumes process_call_params the same way —
            # confirm against the helper's definition.
            call_params, _call_numeric = process_call_params(m.group("params"))

            try:
                template = TemplateInflectionData.objects.get(title=title)
            except ObjectDoesNotExist:
                # TODO: process template redirects ("гл ru 4b-бСВ" → "гл ru 4b-лСВ")
                continue

            tpl_forms = get_dict_from_text(template.forms)
            tpl_params = get_dict_from_text(template.params)

            # Values are replaced in place (no keys added or removed); list()
            # keeps the iteration independent of the dict being updated.
            for key, value in list(tpl_params.items()):
                tpl_params[key] = universal_process_template(value, call_params)
            for key, value in list(tpl_forms.items()):
                value = universal_process_template(value, call_params)
                value = process_template(value, tpl_params, key, morph)
                tpl_forms[key] = divide_words(value)

            # TODO: report pages whose title differs from the unstressed
            # "nom-sg" form.  No such check is performed at the moment; the
            # form list may be empty, so a future check must not index
            # values[0] blindly.

            # Map each form spelling to the parameter dicts describing it.
            form_results = {}
            for key, values in tpl_forms.items():
                # TODO: find values that contain Latin letters.
                if not values or (len(values) == 1 and not values[0]):
                    continue
                form_params = get_form_params(morph, key, tpl_params)
                if not form_params:
                    continue
                for value in values:
                    # The base word itself is not stored as a word form.
                    if remove_stress(value) == page.title:
                        continue
                    form_results.setdefault(value, []).append(form_params.copy())

            # TODO: entries that coincide completely should be removed as well.
            # The return value is ignored, so this presumably edits
            # form_results in place.
            join_form_results(morph, form_results)

            for value, items in form_results.items():
                for form_params in items:
                    if not form_params:
                        continue
                    pieces = [u"{{Форма-%s\n|база=%s\n" % (morph, page.title)]
                    pieces.extend(
                        "|%s=%s\n" % (param_name, param_value)
                        for param_name, param_value in form_params.items()
                    )
                    pieces.append("}}")
                    db_forms.append(WordForm(
                        title=remove_stress(value),
                        base=page.title,
                        value=value,
                        template=u"".join(pieces),
                    ))
                    db_counter += 1
                    # Write in batches so the pending list stays bounded.
                    if len(db_forms) > 1000:
                        WordForm.objects.bulk_create(db_forms)
                        print("%s > forms added: %s" % (dt(), db_counter))
                        db_forms = []

    # Write whatever is left after the last full batch.
    WordForm.objects.bulk_create(db_forms)