def process(self): i = 0 for chunk in chunks(self.words, self.chunk_len): i += self.chunk_len if not i % 10000: print i self._process_chunk(chunk)
def bulk(self, items, model=None, chunk_size=1000): if not model: model = self.model processed = 0 for chunk in chunks(items, chunk_size): processed += len(model.objects.bulk_create(chunk)) print dt(), '-> Processed:', processed
def prepare_langs(self):
    """Split ``self.content`` on language headers.

    Records the order of language codes in ``self.langs`` and the body of
    each language section in ``self.lang_contents``; the text before the
    first header is stored under the empty key ''.
    """
    pieces = P.lang_header.split(self.content)
    self.langs.append('')
    self.lang_contents[''] = pieces.pop(0)
    # After the preamble is removed, pieces alternate header / body.
    for pair in chunks(pieces, 2):
        code = pair[0]
        self.langs.append(code)
        self.lang_contents[code] = pair[1]
def lang_action(self, page, lang, content):
    """Insert the missing morphology header into a word's section.

    Skips service titles (starting/ending with '-', starting with '*') and
    pages whose level-2 headers are not {{з...}}/{{заголовок...}} templates.
    For single-word titles whose section lacks the
    '=== Морфологические и синтаксические свойства ===' header and starts
    with '<b>' or '{{падежи ', the header is prepended to that markup.
    Returns the (possibly modified) wikitext.
    """
    title = page.title
    # Affix entries and special pages are left untouched.
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    # Split on level-2 '==...==' header lines; the capture group makes the
    # odd elements of ``parts`` the headers themselves.
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        # Abort (return unchanged) unless every level-2 header is a
        # {{з|...}} / {{заголовок|...}} template.
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            return content
    new_content = content
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if not headers:
            continue
        if ' ' not in title and u'=== Морфологические и синтаксические свойства ===' not in headers:
            if section_content.strip().startswith(u'<b>'):
                new_section_content = \
                    re.sub(u'^\s*<b>', u'\n=== Морфологические и синтаксические свойства ===\n<b>', section_content)
                # NOTE: str.replace swaps the whole section body inside the
                # page text; assumes the body occurs once — TODO confirm.
                new_content = new_content.replace(section_content, new_section_content)
            elif section_content.strip().startswith(u'{{падежи '):
                new_section_content = \
                    re.sub(u'^\s*\{\{падежи ', u'\n=== Морфологические и синтаксические свойства ===\n{{падежи ', section_content)
                new_content = new_content.replace(section_content, new_section_content)
    return new_content
def update_words_details(self): words = self.words.values_list('word', flat=True) field_name = self.details_field_name() kwargs = {field_name: 1} print len(words), 'by', 1000 i = 0 for chunk in chunks(words, 1000): i += 1 print i Word_Stats.objects.filter(word__in=chunk).update(**kwargs)
def prepare_sections(self):
    """Split each language's content on second-level headers.

    For every language fills ``self.sections[lang]`` (header -> body) and
    ``self.second_headers[lang]`` (header order); the text before the
    first header is keyed by ''.
    """
    for lang, lang_content in self.lang_contents.items():
        bodies = self.sections[lang] = dict()
        header_order = self.second_headers[lang] = list()
        pieces = P.second_header.split(lang_content)
        header_order.append('')
        bodies[''] = pieces.pop(0)
        for pair in chunks(pieces, 2):
            header = pair[0]
            header_order.append(header)
            bodies[header] = pair[1]
def parse_block_content(content): parts = re.split('\n=== ([^=]*) ===\n', content) content = parts[0] sub_data = dict() for part in chunks(parts[1:], 2): sub_title = part[0] sub_content = part[1] if sub_title in sub_data: print '×' * 20, 'Subtitle duplicate: %s' % sub_title sub_data[sub_title] = sub_content return content, sub_data
def content_action(self, page, content, redirect, **kwargs):
    """Split page *content* into per-language sections and dispatch them
    to ``langs_action``; the text before the first language header goes
    under the empty lang code ''."""
    super(BaseLanguageIterator, self).content_action(page, content, redirect,
                                                    **kwargs)
    pieces = P.lang_header.split(content)
    sections = [{'lang': '', 'content': pieces.pop(0)}]
    for pair in chunks(pieces, 2):
        sections.append({'lang': pair[0], 'content': pair[1]})
    self.langs_action(page, sections)
def lang_action(self, page, lang, content):
    """Split a language section into sub-sections on its '==...=='-style
    header lines and forward them to ``sub_sections_action``."""
    super(BasePageSubSectionIterator, self).lang_action(page, lang, content)
    pieces = re.split("^((?:==)(?:.*)(?:==) *)$", content, flags=re.MULTILINE)
    sub_sections = [{'sub_header': '', 'content': pieces.pop(0)}]
    for pair in chunks(pieces, 2):
        sub_sections.append({'sub_header': pair[0], 'content': pair[1]})
    return self.sub_sections_action(page, lang, sub_sections, content)
def process(self):
    """Persist the word count on ``self.dictionary`` and then process
    ``self.words`` chunk by chunk, tracking progress in ``self.i``
    (chunks done) and ``self.n`` (total chunk count)."""
    self.dictionary.words_count = len(self.words)
    self.dictionary.save()
    self.i = 0
    # Python 2 integer division: whole number of chunks, rounded down.
    self.n = len(self.words) / self.chunk_len
    for batch in chunks(self.words, self.chunk_len):
        self.i += 1
        self._process_chunk(batch)
def lang_action(self, page, lang, content):
    """Split *content* on second-level headers, validate the headers and
    delegate to ``sections_action``; returns silently when the header
    check fails."""
    super(BasePageSectionIterator, self).lang_action(page, lang, content)
    pieces = P.second_header.split(content)
    sections = [{'header2': '', 'content': pieces.pop(0)}]
    for pair in chunks(pieces, 2):
        sections.append({'header2': pair[0], 'content': pair[1]})
    # NOTE: "headera" is a typo in the helper's name, kept as-is because
    # the method is defined elsewhere in the project.
    if not self.check_if_all_headera_are_correct(sections):
        return
    self.sections_action(page, lang, sections, content)
def prepare_sub_sections(self):
    """Split every second-level section body into sub-sections.

    Fills ``self.sub_sections[lang][header2]`` (sub-header -> body) and
    ``self.sub_headers[lang][header2]`` (sub-header order).  Sub-headers
    are normalized to '<equals> <text> <equals>' with single spaces around
    the text; the text before the first sub-header is stored under ''.
    """
    for lang, lang_data in self.sections.items():
        self.sub_headers[lang] = dict()
        self.sub_sections[lang] = dict()
        for header2, content2 in lang_data.items():
            contents = self.sub_sections[lang][header2] = dict()
            order = self.sub_headers[lang][header2] = list()
            # The capture group makes odd split elements the header lines.
            parts = re.split("^((?:==)(?:.*)(?:==) *)$", content2,
                             flags=re.MULTILINE)
            order.append('')
            contents[''] = parts.pop(0)
            for part in chunks(parts, 2):
                sub_header = part[0]
                # Normalize whitespace: '===Foo ===' -> '=== Foo ==='.
                m = re.match(u"^(=+)([^=]+)(=+) *$", sub_header)
                sub_header = u"%s %s %s" % \
                    (m.group(1), m.group(2).strip(), m.group(3))
                order.append(sub_header)
                contents[sub_header] = part[1]
def parse_mass_edit(content): parts = re.split('\n== ([^=]*) ==\n', content) blocks = dict() sub_blocks = dict() for part in chunks(parts[1:], 2): title = part[0].strip() content = part[1] content, sub_data = parse_block_content(content) m = re.match(u'^\[\[([^]]+)\]\]$', title) if not m: print '×' * 20, 'Wrong title: need [[...]], found: %s' % title continue title = m.group(1) if title in blocks: print '×' * 20, 'Title duplicate: %s' % title continue blocks[title] = content sub_blocks[title] = sub_data # print '-' * 100 # print title # print '-' * 100 # print content return blocks, sub_blocks
def get_unknown_inflection_full():
    """Build and publish full inflection reports for words whose gender
    kind is still unknown ('f ?', 'm ?', 'n ?').

    Groups words by gender + detected inflection number, writes each group
    in pages of 100 words both to a local debug file and to a wiki page
    under ``wiki_prefix``, and mirrors changed pages into
    ``WordInflectionMassEdit`` rows.
    """
    items = WordInflection.objects.filter(kind__in=[u'f ?', u'm ?', u'n ?'])
    # Sort by reversed word so words with common endings are adjacent.
    words = sorted(items, key=lambda item: item.word[::-1])
    report = dict()
    for word in words:
        print word.word
        valid_num = check_correct_inflection(word.word, word.gender)
        # Negative result means no valid inflection pattern was found.
        if valid_num < 0:
            continue
        key = "%s%d" % (word.gender, valid_num)
        report.setdefault(key, list())
        report[key].append((word.word, word.content))
    file_path = join(PAGES_DIR, u'Массовое редактирование')
    wiki_prefix = u"Участник:Vitalik/Массовое редактирование/Словоизменение/сущ"
    desc = u"Полное обновление данных"
    for key in report.keys():
        print key
        items = report.get(key)
        i = 1
        # Python 2 integer division; +1 so a partial last page is counted.
        page_count = len(items) / 100 + 1
        for chunk in chunks(items, 100):
            # key[0] = gender letter, key[1] = inflection digit.
            # NOTE(review): key[1] takes only the first character — breaks
            # if valid_num ever has two digits; TODO confirm range.
            content = gen_report_full(chunk, key[0], key[1], page_count)
            filename = "%s_%d.txt" % (key, i)
            print filename
            debug_write(file_path, filename, content)
            wiki_title = "%s/%s/%s" % (wiki_prefix, key, i)
            changed = save_wiki_page(wiki_title, content, desc)
            if changed:
                # Keep a DB copy of every page actually changed on-wiki.
                db_title = u'сущ/%s/%s' % (key, i)
                item, created = WordInflectionMassEdit.objects.get_or_create(
                    title=db_title)
                item.content = content
                item.save()
            i += 1
def lang_action(self, page, lang, content):
    """Rewrite italic label abbreviations inside the 'Значение' (meaning)
    sub-section into {{label.|lang}} templates.

    Skips service titles and pages whose level-2 headers are not
    {{з...}}/{{заголовок...}} templates.  For every section it extracts
    the 'Значение' block, converts ``''label.''`` markup into the
    corresponding template call, and classifies remaining lines (the
    classification branches are diagnostic no-ops).  Returns the modified
    wikitext.
    """
    title = page.title
    # Affix entries and special pages are left untouched.
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    # Split on level-2 '==...==' header lines (headers are captured).
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        # Abort unless every header is a {{з|...}}/{{заголовок|...}} template.
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            return content
    # Choose the expected header template by language and phrase-ness.
    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            pass
        else:
            # Diagnostics: which expected headers are absent, whether the
            # present ones are out of order, and which are unknown.  The
            # results are computed but not acted upon below.
            wrong_order = False
            has_unknown_header = False
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            # (A large block of commented-out diagnostic printing from the
            # original was condensed here.)
            # Hard-coded exception: this Armenian entry is left untouched.
            if title in [u'օժանդակ բայ']:
                return content
            if True:
                # Capture the whole 'Значение' block up to the next header.
                m = re.search(u'(==== *Значение *==== *(.*?)'
                              u'\n)=',
                              # u'\n)===',
                              section_content, re.UNICODE | re.DOTALL)
                if not m:
                    continue
                semantic_section = m.group(1)
                new_semantic_section = semantic_section
                mining = m.group(2)
                has_strange = False
                for line in mining.split('\n'):
                    line = line.strip()
                    if not line.strip():
                        continue
                    # Lines already formatted as definitions are skipped.
                    if re.match('^#', line):
                        continue
                    if "''" in line:
                        # Convert ''abbr.'' label markup into {{abbr.|lang}};
                        # the quadruple braces produce literal '{{' / '}}'
                        # through str.format.
                        new_line = \
                            re.sub(u"''(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)\.''",
                                   u'{{{{\\1.|{}}}}}'.format(lang), line)
                        if line != new_line:
                            # Fully converted lines are promoted to '# ' items.
                            if "''" not in new_line:
                                new_line = u'# ' + new_line
                            print line
                            print new_line
                            print
                            new_semantic_section = \
                                new_semantic_section.replace(line, new_line)
                    # Everything below is line classification only; it does
                    # not modify the section any further.
                    if "{" in line or "''" in line or "<i>" in line or line.startswith("|"):
                        continue
                    if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^\[\[\]\]$', line, re.UNICODE):
                        continue
                    if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод *(\|\w+)?\}\}$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^\{\{(длина слова)\|[^}]+\}\}$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^\{\{(илл\.?)\|[^}]+\}\}$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^\[\[(Файл:|File:|Image:|Изображение:)[^]]+\]\]$', line,
                                re.UNICODE):
                        # todo: handle [[Файл: and other image prefixes
                        continue
                    ok = False
                    if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( *\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        ok = True
                    if ok:
                        continue
                    has_strange = True
                    # Diagnostic classification of "strange" lines — every
                    # branch is a no-op (prints were commented out).
                    if lang == 'la':
                        pass
                    elif "''" in line:
                        pass
                    elif "<i>" in line:
                        pass
                    elif line.startswith(u'}} {{пример') \
                            or line.startswith(u'{{списки семантических связей')\
                            or line.startswith(u'|'):
                        pass
                    elif "{" in line:
                        pass
                    elif "[" in line:
                        pass
                    elif u'Аналогично русскому' in line:
                        pass
                    else:
                        pass
                    if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        continue
                # Apply the accumulated label conversions for this section.
                new_content = new_content.replace(semantic_section,
                                                  new_semantic_section)
    return new_content
def lang_action(self, page, lang, content):
    """Insert a missing '=== Семантические свойства ===' header above an
    existing '==== Значение ====' header.

    Skips service titles and pages whose level-2 headers are not
    {{з...}}/{{заголовок...}} templates; bails out when a section has more
    than one 'Значение' header.  Returns the (possibly modified) wikitext.
    """
    title = page.title
    # Affix entries and special pages are left untouched.
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    # Split on level-2 '==...==' header lines (headers are captured).
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        # Abort unless every header is a {{з|...}}/{{заголовок|...}} template.
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            print header2, '==', '#' * 120
            return content
    # Choose the expected header template by language and phrase-ness.
    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            pass
        elif headers == template:
            # Section already matches the expected layout.
            pass
        else:
            # Diagnostics: absent / out-of-order / unknown headers.  The
            # results are computed but not used below.
            wrong_order = False
            has_unknown_header = False
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            # Refuse pages with more than one 'Значение' header in one
            # section — ambiguous to fix automatically.
            c = 0
            for header in headers:
                if header == u'==== Значение ====':
                    c += 1
            if c > 1:
                print title, '$' * 200
                return content
                # raise Exception('c > 1')
            if u'=== Семантические свойства ===' not in headers \
                    and u'==== Значение ====' in headers:
                page_content = page.content
                lst = re.findall(u'==== Значение ====', page_content)
                # Only act when the whole page has several 'Значение'
                # headers (single-occurrence pages were handled elsewhere —
                # presumably; TODO confirm).
                if len(lst) > 1:
                    new_section_content = section_content.replace(
                        u'\n==== Значение ====\n',
                        u'\n=== Семантические свойства ===\n\n==== Значение ====\n',
                    )
                    new_content = new_content.replace(section_content,
                                                      new_section_content)
    return new_content
def lang_action(self, page, lang, content):
    """Dry-run classifier for lines inside the 'Значение' sub-section.

    Walks every section of a page (after the same title / header-template
    guards as its sibling methods), extracts the 'Значение' block and
    classifies each of its lines against a set of known wiki-markup
    patterns.  All classification branches are no-ops or prints — the
    method never modifies ``new_content`` and effectively returns the
    input unchanged.
    """
    title = page.title
    # Affix entries and special pages are left untouched.
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    # Split on level-2 '==...==' header lines (headers are captured).
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        # Abort unless every header is a {{з|...}}/{{заголовок|...}} template.
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            return content
    # Choose the expected header template by language and phrase-ness.
    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            pass
        else:
            # Diagnostics: absent / out-of-order / unknown headers.  The
            # results are computed but not used below.
            wrong_order = False
            has_unknown_header = False
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            # (A large block of commented-out diagnostic printing from the
            # original was condensed here.)
            if True:
                # Capture the 'Значение' block up to the next '===' header.
                m = re.search(u'(==== *Значение *==== *(.*?)'
                              u'\n)===',
                              section_content, re.UNICODE | re.DOTALL)
                if not m:
                    continue
                    # raise Exception(u'title={}, lang={}'.format(title, lang))
                semantic_section = m.group(1)
                new_semantic_section = semantic_section
                mining = m.group(2)
                has_strange = False
                for line in mining.split('\n'):
                    line = line.strip()
                    # (Several commented-out re.findall probes condensed.)
                    if line.startswith('#'):
                        print line
                    if not line.strip():
                        continue
                    if re.match('^#', line):
                        continue
                    if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^\{\{(длина слова|илл\.?)\|[^}]+\}\}$', line,
                                re.UNICODE):
                        continue
                    # todo: handle [[Файл: and other image prefixes
                    if re.match(u'^\[\[\]\]$', line, re.UNICODE):
                        continue
                    if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод *(\|\w+)?\}\}$', line,
                                re.UNICODE):
                        continue
                    ok = False
                    if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        ok = True
                    if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        ok = True
                    if ok:
                        continue
                    has_strange = True
                    # Diagnostic classification of "strange" lines — every
                    # branch is a no-op (prints were commented out).
                    if lang == 'la':
                        pass
                    elif "''" in line:
                        pass
                    elif "<i>" in line:
                        pass
                    elif "{" in line:
                        pass
                    elif "[" in line:
                        pass
                    else:
                        pass
                    if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        continue
                    if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                re.UNICODE):
                        continue
                # if not has_strange:
                #     new_content = new_content.replace(semantic_section,
                #                                       new_semantic_section)
    return new_content
def lang_action(self, page, lang, content):
    """Append the standard tail sections (Родственные слова, Этимология,
    Фразеологизмы, Перевод, Библиография — per language / phrase-ness)
    after the 'Гипонимы' header when the full semantic block is present
    but the tail is missing.

    Same title and {{з...}}/{{заголовок...}} header guards as the sibling
    methods.  Returns the (possibly modified) wikitext.
    """
    title = page.title
    # Affix entries and special pages are left untouched.
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    # Split on level-2 '==...==' header lines (headers are captured).
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                     flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        # Abort unless every header is a {{з|...}}/{{заголовок|...}} template.
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            print header2, '==', '#' * 120
            return content
    # Choose the expected header template by language and phrase-ness.
    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            pass
        elif headers == template:
            # Section already matches the expected layout.
            pass
        else:
            # Diagnostics: absent / out-of-order / unknown headers.
            wrong_order = False
            has_unknown_header = False
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            if absent_semantic_headers or wrong_order_error:
                # Too broken to fix automatically — leave this section alone.
                pass
            else:
                # Present full semantic chain means the section is fine.
                m = re.search(u'==== *Значение *==== *\n(.*?)'
                              u'==== *Синонимы *==== *\n(.*?)'
                              u'==== *Антонимы *==== *\n(.*?)'
                              u'==== *Гиперонимы *==== *\n(.*?)'
                              u'==== *Гипонимы *==== *(.*?)'
                              u'\n===[^=]',
                              section_content, re.UNICODE | re.DOTALL)
                if not m:
                    # Build the expected tail for this language/title kind
                    # from the canned template snippets.
                    if lang == 'ru':
                        if ' ' in title:
                            tail_contains = "\n".join([
                                template_contents[u'Этимология/phrase'],
                                template_contents[u'Перевод'],
                                template_contents[u'Библиография'],
                            ])
                        else:
                            tail_contains = "\n".join([
                                template_contents[u'Родственные слова'],
                                template_contents[u'Этимология/ru'],
                                template_contents[u'Фразеологизмы'],
                                template_contents[u'Перевод'],
                                template_contents[u'Библиография'],
                            ])
                    else:
                        if ' ' in title:
                            tail_contains = "\n".join([
                                template_contents[u'Этимология/phrase'],
                                template_contents[u'Библиография'],
                            ])
                        else:
                            tail_contains = "\n".join([
                                template_contents[u'Родственные слова'],
                                template_contents[u'Этимология/xx'].format(lang),
                                template_contents[u'Фразеологизмы'],
                                template_contents[u'Библиография'],
                            ])
                    # Insertion point: right after the Гипонимы header and
                    # any plain text before the first link/template.
                    p = re.compile(u'(==== *Гипонимы *====\n[^[{]*)')
                    m2 = re.search(u'(==== *Гипонимы *====(.*))', section_content,
                                   flags=re.DOTALL | re.UNICODE)
                    if m2:
                        # A '# [' item after Гипонимы means real content
                        # follows — skip rather than risk splitting it.
                        if '# [' in m2.group(1):
                            print title, '%' * 200
                            print m2.group(1)
                            continue
                    else:
                        print title, '!' * 100
                    new_section_content = p.sub('\\1' + '\n' + tail_contains + '\n',
                                                section_content)
                    new_section_content = new_section_content.replace('\n\n\n', '\n\n')
                    new_content = new_content.replace(section_content,
                                                      new_section_content)
    return new_content
def lang_action(self, page, lang, content):
    """Validate one language section of a wiki page and renumber its lists.

    Splits ``content`` on level-2 ("== ... ==") headers, checks the
    sub-section headers of each part against the expected ``templates``
    sequence for this language/word kind, and -- when the layout is clean --
    rewrites "*"/":" bullets and bare lines inside the four "-онимы"
    sub-sections of a local copy of the semantic block into "#" items.

    NOTE(review): ``new_semantic_section`` is built but never merged back
    into ``new_content``, so this variant effectively only validates and
    reports; it returns ``content`` unchanged.  Raises ``Exception`` when a
    section passes the header checks but the semantic block regex does not
    match.
    """
    title = page.title
    # Affix entries ("-suffix" / "prefix-") and service titles ("*...")
    # are never touched.
    if title.startswith("-") or title.endswith("-"):
        return content
    if title.startswith("*"):
        return content
    new_content = content
    # Split into level-2 sections; because the header is a capture group,
    # odd elements are header lines and even elements are section bodies.
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
    sections = [{"header2": "", "content": parts.pop(0)}]
    sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)]
    # Bail out unless every level-2 header is a {{з|...}} / {{заголовок|...}}
    # template -- anything else means an unexpected page layout.
    for data in sections:
        header2 = data["header2"]
        if not header2:
            continue
        p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE)
        m = p.match(header2)
        if not m:
            return content
    # Expected header sequence depends on the language ("ru" vs. any other)
    # and on whether the title is a phrase (contains a space) or one word.
    if lang == "ru":
        if " " in title:
            template = templates["ru"]["phrase"]
        else:
            template = templates["ru"]["word"]
    else:
        if " " in title:
            template = templates["xx"]["phrase"]
        else:
            template = templates["xx"]["word"]
    for data in sections:
        section_content = data["content"]
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            # No headers at all -- nothing to check in this part.
            pass
        elif headers == template:
            # Already matches the template exactly.
            pass
        else:
            wrong_order = False
            has_unknown_header = False
            # Template headers missing from the page.
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            # Walk template and page headers in lockstep: a page header that
            # is neither the expected one nor explained by an absent
            # template header means the order is wrong.
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            # Page headers the template does not know about.
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            if absent_semantic_headers or wrong_order_error:
                # Layout too broken to process safely -- skip this section.
                pass
            elif unknown_headers:
                pass
            else:
                # Capture the whole semantic block (group 1) and its parts:
                # Значение(2), Синонимы(3), Антонимы(4), Гиперонимы(5),
                # Гипонимы(6).
                m = re.search(
                    u"(==== *Значение *==== *\n(.*?)"
                    u"==== *Синонимы *==== *\n(.*?)"
                    u"==== *Антонимы *==== *\n(.*?)"
                    u"==== *Гиперонимы *==== *\n(.*?)"
                    u"==== *Гипонимы *==== *(.*?)"
                    u"\n)===[^=]",
                    section_content,
                    re.UNICODE | re.DOTALL,
                )
                if not m:
                    # Header checks passed but the semantic block is absent:
                    # treated as a hard error in this variant.
                    print "#" * 200
                    print u"title={}, lang={}".format(title, lang)
                    print "#" * 200
                    # continue
                    raise Exception(u"title={}, lang={}".format(title, lang))
                semantic_section = m.group(1)
                new_semantic_section = semantic_section
                # Scan the "Значение" body line by line; every recognised
                # shape is skipped, unmatched shapes simply fall through.
                mining = m.group(2)
                for line in mining.split("\n"):
                    line = line.strip()
                    if not line.strip():
                        continue
                    if re.match("^#", line):
                        continue
                    if re.match(u"^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$", line, re.UNICODE):
                        continue
                    if re.match(u"^\{\{(длина слова|илл\.?)\|[^}]+\}\}$", line, re.UNICODE):
                        # Report service templates found among meanings.
                        print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line)
                        continue
                    if re.match(u"^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$", line, re.UNICODE):
                        continue
                    if re.match(u"^\[\[\]\]$", line, re.UNICODE):
                        continue
                    if re.match(u"^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE):
                        continue
                    if re.match(
                        u"^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$",
                        line, re.UNICODE
                    ):
                        continue
                    # (debug prints removed; wider link-shape fallbacks below)
                    if re.match(u"^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE):
                        continue
                    if re.match(
                        u"^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$",
                        line,
                        re.UNICODE,
                    ):
                        continue
                # Promote bullets / bare lines of the four "-онимы" bodies
                # to "#" items inside the local semantic-section copy.
                bodies = [m.group(3), m.group(4), m.group(5), m.group(6)]
                for body in bodies:
                    lines = body.split("\n")
                    fake = False
                    for line in lines:
                        if not line.strip():
                            continue
                        if re.match("^#", line):
                            continue
                        if re.match("^[*:]", line):
                            # "*"/":" bullet -> "#" item (drop the marker).
                            new_semantic_section = new_semantic_section.replace(
                                u"\n{}\n".format(line), u"\n#{}\n".format(line[1:])
                            )
                        elif not line.startswith("<!--"):
                            # Bare line -> "# " item (HTML comments kept).
                            new_semantic_section = new_semantic_section.replace(
                                u"\n{}\n".format(line), u"\n# {}\n".format(line)
                            )
                        if re.match("^[*:]", line):
                            continue
                        if line in ["-", "?"]:
                            continue
                        if re.match("^\[\[[^]]+\]\]$", line):
                            continue
                        if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line):
                            continue
                        if re.match(u"^[a-zа-я !]+$", line, re.UNICODE | re.IGNORECASE):
                            pass
                        if re.match(u"^([a-zа-я !]+([,;] )?)+$", line, re.UNICODE | re.IGNORECASE):
                            pass
                        # Unexpected line shape: flag and stop scanning this
                        # body.
                        fake = True
                        break
    return new_content
def lang_action(self, page, lang, content):
    """Validate one language section and report non-standard list lines.

    Same splitting/validation pipeline as the sibling ``lang_action``
    variants: split ``content`` on level-2 headers, verify each header is a
    {{з|...}}/{{заголовок|...}} template, compare sub-section headers to the
    expected ``templates`` sequence, then renumber bullets in a local copy
    of the semantic block and print every line whose shape is not
    recognised.

    NOTE(review): the write-back of ``new_semantic_section`` into
    ``new_content`` is commented out below, so this variant is report-only
    and returns ``content`` unchanged.  Unlike the raising variant, a
    missing semantic block is silently skipped (``continue``).
    """
    title = page.title
    # Affix entries ("-suffix" / "prefix-") and service titles ("*...")
    # are never touched.
    if title.startswith("-") or title.endswith("-"):
        return content
    if title.startswith("*"):
        return content
    new_content = content
    # Split into level-2 sections; odd elements are header lines, even
    # elements are section bodies (the header is a capture group).
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
    sections = [{"header2": "", "content": parts.pop(0)}]
    sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)]
    # Bail out unless every level-2 header is a heading template.
    for data in sections:
        header2 = data["header2"]
        if not header2:
            continue
        p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE)
        m = p.match(header2)
        if not m:
            return content
    # Expected header sequence: by language and word-vs-phrase title.
    if lang == "ru":
        if " " in title:
            template = templates["ru"]["phrase"]
        else:
            template = templates["ru"]["word"]
    else:
        if " " in title:
            template = templates["xx"]["phrase"]
        else:
            template = templates["xx"]["word"]
    for data in sections:
        section_content = data["content"]
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            # No headers at all -- nothing to check.
            pass
        elif headers == template:
            # Already matches the template exactly.
            pass
        else:
            wrong_order = False
            has_unknown_header = False
            # Template headers missing from the page.
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            # Lockstep walk: detect page headers that appear out of the
            # template order (absent template headers are skipped over).
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            # Page headers the template does not know about.
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            if absent_semantic_headers or wrong_order_error:
                # Layout too broken to process safely -- skip this section.
                pass
            elif unknown_headers:
                pass
            else:
                # Capture the semantic block (group 1) and its parts:
                # Значение(2), Синонимы(3), Антонимы(4), Гиперонимы(5),
                # Гипонимы(6).
                m = re.search(
                    u"(==== *Значение *==== *\n(.*?)"
                    u"==== *Синонимы *==== *\n(.*?)"
                    u"==== *Антонимы *==== *\n(.*?)"
                    u"==== *Гиперонимы *==== *\n(.*?)"
                    u"==== *Гипонимы *==== *(.*?)"
                    u"\n)===[^=]",
                    section_content,
                    re.UNICODE | re.DOTALL,
                )
                if not m:
                    # No semantic block here: skip silently (the raising
                    # alternative is deliberately commented out).
                    # raise Exception('!!!')
                    continue
                semantic_section = m.group(1)
                new_semantic_section = semantic_section
                # Renumber the four "-онимы" bodies in the local copy and
                # report lines with unrecognised shapes.
                bodies = [m.group(3), m.group(4), m.group(5), m.group(6)]
                for body in bodies:
                    lines = body.split("\n")
                    fake = False
                    for line in lines:
                        if not line.strip():
                            continue
                        if re.match("^#", line):
                            continue
                        if re.match("^[*:]", line):
                            # "*"/":" bullet -> "#" item (drop the marker).
                            new_semantic_section = new_semantic_section.replace(
                                u"\n{}\n".format(line), u"\n#{}\n".format(line[1:])
                            )
                        elif not line.startswith("<!--"):
                            # Bare line -> "# " item (HTML comments kept).
                            new_semantic_section = new_semantic_section.replace(
                                u"\n{}\n".format(line), u"\n# {}\n".format(line)
                            )
                        if re.match("^[*:]", line):
                            continue
                        if line in ["-", "?"]:
                            continue
                        if re.match("^\[\[[^]]+\]\]$", line):
                            continue
                        if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line):
                            continue
                        # (looser letters-only shape checks are commented
                        # out in this variant)
                        fake = True
                        # Report the first unexpected line of this body,
                        # then stop scanning it.
                        print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line)
                        break
                # NOTE(review): the block that replaced semantic_section
                # with new_semantic_section inside new_content is commented
                # out here -- rewrites are computed but never applied.
    return new_content
def lang_action(self, page, lang, content):
    """Report sections whose synonym-type lists outnumber the meanings.

    Same splitting/validation pipeline as the sibling ``lang_action``
    variants.  For each clean section it counts the meaningful lines of
    "Значение" (``mining_len``) and of the four "-онимы" bodies
    (``onim_lens``), printing a wiki-formatted report line whenever an
    "-онимы" list has more entries than there are meanings, and also
    printing every "Значение" line with an unrecognised shape.

    NOTE(review): ``new_content`` is never reassigned, so this variant is
    report-only and returns ``content`` unchanged.
    """
    title = page.title
    # Affix entries ("-suffix" / "prefix-") and service titles ("*...")
    # are never touched.
    if title.startswith('-') or title.endswith('-'):
        return content
    if title.startswith('*'):
        return content
    new_content = content
    # Split into level-2 sections; odd elements are header lines, even
    # elements are section bodies (the header is a capture group).
    parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
    sections = [
        {'header2': '', 'content': parts.pop(0)}
    ]
    sections += [
        {'header2': part[0], 'content': part[1]}
        for part in chunks(parts, 2)
    ]
    # Bail out unless every level-2 header is a heading template.
    for data in sections:
        header2 = data['header2']
        if not header2:
            continue
        p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        m = p.match(header2)
        if not m:
            return content
    # Expected header sequence: by language and word-vs-phrase title.
    if lang == 'ru':
        if ' ' in title:
            template = templates['ru']['phrase']
        else:
            template = templates['ru']['word']
    else:
        if ' ' in title:
            template = templates['xx']['phrase']
        else:
            template = templates['xx']['word']
    for data in sections:
        section_content = data['content']
        headers = convert_headers(self.get_headers(section_content))
        if headers == []:
            # No headers at all -- nothing to check.
            pass
        elif headers == template:
            # Already matches the template exactly.
            pass
        else:
            wrong_order = False
            has_unknown_header = False
            # Template headers missing from the page.
            absent = list()
            for header in template:
                if header not in headers:
                    absent.append(header)
                    pass
            # Lockstep walk: detect page headers out of template order
            # (absent template headers are skipped over).
            t = 0
            s = 0
            wrong_order_error = None
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                else:
                    if template[t] in absent:
                        t += 1
                    else:
                        wrong_order_error = headers[s]
                        wrong_order = True
                        break
            # Page headers the template does not know about.
            unknown_headers = list()
            for header in headers:
                if header not in template:
                    unknown_headers.append(header)
                    has_unknown_header = True
            absent_semantic_headers = False
            for h in semantic_headers:
                if h in absent:
                    absent_semantic_headers = True
            if absent_semantic_headers or wrong_order_error:
                # Layout too broken to process safely -- skip this section.
                pass
            elif unknown_headers:
                pass
            else:
                # Capture the semantic block (group 1) and its parts:
                # Значение(2), Синонимы(3), Антонимы(4), Гиперонимы(5),
                # Гипонимы(6).
                m = re.search(u'(==== *Значение *==== *\n(.*?)'
                              u'==== *Синонимы *==== *\n(.*?)'
                              u'==== *Антонимы *==== *\n(.*?)'
                              u'==== *Гиперонимы *==== *\n(.*?)'
                              u'==== *Гипонимы *==== *(.*?)'
                              u'\n)===[^=]',
                              section_content, re.UNICODE | re.DOTALL)
                if not m:
                    # No semantic block: log loudly but keep going.
                    print '#' * 200
                    print u'title={}, lang={}'.format(title, lang)
                    print '#' * 200
                    continue
                    # raise Exception(u'title={}, lang={}'.format(title, lang))
                semantic_section = m.group(1)
                new_semantic_section = semantic_section
                # Count meaningful "Значение" lines.  "#"-items and bare
                # content lines count; service templates and blanks do not.
                mining = m.group(2)  # .strip().split('\n')
                mining_len = 0
                for line in mining.split('\n'):
                    line = line.strip()
                    if not line.strip():
                        continue
                    if line.strip() in ['#']:
                        continue
                    if re.match('^#', line):
                        mining_len += 1
                        continue
                    if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line, re.UNICODE):
                        continue
                    if re.match(u'^\{\{(длина слова|илл)\|[^}]+\}\}$', line, re.UNICODE):
                        continue
                    mining_len += 1
                    if re.match(u'^\[\[\]\]$', line, re.UNICODE):
                        continue
                    if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$', line, re.UNICODE):
                        continue
                    # Recognised "link list (+ optional example)" shapes
                    # are acceptable and skipped.
                    ok = False
                    if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE):
                        ok = True
                    if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE):
                        ok = True
                    if ok:
                        continue
                    # Unrecognised line shape: flag and report it.
                    has_strange = True
                    print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                        format(title, lang, line)
                    if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE):
                        continue
                    if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line, re.UNICODE):
                        continue
                # Count entries of each "-онимы" body (blank lines and
                # bare "#" placeholders excluded).
                onim_lens = [0, 0, 0, 0]
                bodies = [m.group(3), m.group(4), m.group(5), m.group(6), ]
                for i, body in enumerate(bodies):
                    body = body.strip()
                    lines = body.split('\n')
                    fake = False
                    for line in lines:
                        if not line.strip():
                            continue
                        if line.strip() in ['#']:
                            continue
                        onim_lens[i] += 1
                        if re.match('^#', line):
                            continue
                        if re.match('^[*:]', line):
                            continue
                        if line in ['-', '?', ]:
                            continue
                        if re.match('^\[\[[^]]+\]\]$', line):
                            continue
                        if re.match('^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$', line):
                            continue
                        if re.match(u'^[a-zа-я !]+$', line, re.UNICODE | re.IGNORECASE):
                            pass
                        if re.match(u'^([a-zа-я !]+([,;] )?)+$', line, re.UNICODE | re.IGNORECASE):
                            pass
                        # Unexpected line shape; flagged but scanning
                        # continues in this variant ("break" is disabled).
                        fake = True
                        # break
                # Report every list that has more entries than meanings.
                for i, onim_len in enumerate(onim_lens):
                    if onim_len > mining_len:
                        onim_type = [u'синонимов', u'антонимов', u'гиперонимов', u'гипонимов', ]
                        print u"# [[{}]] (секция \"{}\"): '''{}''' значений, '''{}''' {}".\
                            format(title, lang, mining_len, onim_len, onim_type[i])
    return new_content
def run(self):
    """Main bot driver: apply queued data-page items to wiki articles.

    Workflow, as visible in the code:
      1. Require ``self.wikt_data_page`` and ``self.description``.
      2. Check the bot's on-switch line on the operator page; exit if off.
      3. Parse ``self.wikt_data_page`` into a ``{title: values}`` dict
         (items separated by blank lines, via ``self.get_item``).
      4. For each title (sorted), fetch the article, extract its Russian
         ("ru") level-1 section, skip homonym pages (more than one level-3
         block) and pages without Russian headers, otherwise replace the
         section via ``self.get_new_section_content`` and save.
      5. Report progress and finally clear the data page.

    Raises NotImplementedError when the subclass did not configure the
    data page or description.
    """
    if not self.wikt_data_page or not self.description:
        raise NotImplementedError()
    # Derive the task name from the data-page title and check the
    # operator's on-switch list for "* [[page|name]] = on".
    m = re.search(u':Cinemantique/(.+)', self.wikt_data_page)
    name = m.group(1)
    on_value = u'* [[%s|%s]] = on' % (self.wikt_data_page, name)
    c = get_wiki_page_content(u'Участник:Cinemantique/bot')
    if on_value not in c:
        print u'bot offline -> exit'
        return
    c = get_wiki_page_content(self.wikt_data_page).strip()
    if not c:
        # Nothing queued.
        return
    self.add_report(u'Бот запущен', 'silver')
    self.save_report('started')
    # Items on the data page are separated by blank lines.
    data = {}
    items = c.split('\n\n')
    for item in items:
        title, values = self.get_item(item, data)
        if values is None:
            continue
        data[title] = values
    i = 0
    for title, values in sorted(data.items(), key=lambda x: x[0]):
        i += 1
        print i
        content = self.get_page_content(title)
        if content is None:
            continue
        # Split the article into level-1 language sections:
        # "= {{-lang-}} =" headers; capture group 1 is the language code.
        parts = re.split('^= *\{\{-([-\w]+)-(?:\|[^}]*)?\}\} *=$', content,
                         flags=re.MULTILINE)
        parts.pop(0)
        sections = [
            {'lang': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        # Keep only the Russian section (last one wins if repeated).
        section_content = ''
        for section in sections:
            if section['lang'] == 'ru':
                section_content = section['content']
        res = re.findall('\n== *[^=].*[^=] *==\n', section_content)
        if len(res) > 1:
            # Several level-2 headers: split further and count how many
            # parts contain level-3 headers; more than one means the page
            # holds homonyms, which this bot does not handle.
            parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", section_content,
                             flags=re.MULTILINE)
            sections2 = [
                {'header2': '', 'content': parts.pop(0)}
            ]
            sections2 += [
                {'header2': part[0], 'content': part[1]}
                for part in chunks(parts, 2)
            ]
            count_third = 0
            # NOTE(review): this loop variable shadows the outer ``data``
            # dict; harmless here because the dict is not used again, but
            # worth renaming.
            for data in sections2:
                if '===' in data['content']:
                    count_third += 1
                if count_third > 1:
                    self.add_report(u'В статье "[[%s]]" содержатся омонимы, пропускаем.' % title, 'maroon')
                    section_content = None
                    break
        if not section_content:
            # '' -> no Russian section found; None -> homonyms detected
            # (already reported above).
            if section_content == '':
                self.add_report(u'В статье "[[%s]]" не найдены русские заголовки, пропускаем.' % title, 'maroon')
            continue
        new_section_content = \
            self.get_new_section_content(title, values, section_content)
        if new_section_content is None:
            continue
        new_content = content.replace(section_content, new_section_content)
        self.make_change(title, content, new_content)
        self.add_report(u'Статья "[[%s]]" успешно обновлена.' % title, 'green')
    self.add_report(u'Бот завершён', 'silver')
    self.save_report('finished')
    # Clear the processed queue, linking to the report page in the summary.
    save_wiki_page(self.wikt_data_page, '',
                   u'Удаление обработанного содержимого - [[%s/report|Отчёт]]' % self.wikt_data_page)