def section_action(self, page, lang, section_content):
    """Record sections with a "Семантические свойства" header but no "Значение" header.

    The parent ``section_action`` is called first.  Pages whose title
    contains a colon are ignored.  A hit is stored in
    ``self.without_mining`` under one of two keys, depending on whether the
    section contains a ``# {{значение`` line.
    """
    super(WithoutMining, self).section_action(page, lang, section_content)
    title = page.title
    if ':' in title:
        return

    if not re.search(u'=== *Семантические свойства *===',
                     section_content, re.UNICODE):
        return
    if re.search(u'====? *Значение *====?', section_content, re.UNICODE):
        return

    if u'# {{значение\n' in section_content:
        key = u'Шаблон "значение"'
    else:
        # A temporary auto-fix (insert the missing "==== Значение ===="
        # header and save the page) used to be sketched for this branch;
        # it is disabled, so the section is only reported.
        key = u'Возможно ошибка'
    append_dict_list(self.without_mining[key], title, lang)
def lang_action(self, page, lang, content):
    """Collect templates whose name is not in the known template/label lists.

    The parent ``lang_action`` is called first.  Pages in the ``Шаблон:``
    namespace are skipped.  For every match of ``P.section_mining`` the
    templates found by ``P.template_any`` are inspected; a template whose
    stripped name is in neither ``G.mining.templates.all`` nor
    ``G.mining.labels.all`` is stored in ``self.mining_templates``.
    """
    super(TemplatesMining, self).lang_action(page, lang, content)
    title = page.title
    if title.startswith(u'Шаблон:'):
        return

    for section in P.section_mining.findall(content):
        for match in P.template_any.findall(section[1]):
            template_text = match[0]
            # The template name is everything before the first "|".
            template_name = match[1].split('|')[0].strip()
            known = G.mining.templates.all + G.mining.labels.all
            if template_name in known:
                continue
            append_dict_list(
                self.mining_templates[u'Шаблоны в "Значение"'],
                template_name, (title, template_text))
def content_action(self, page, content, redirect, **kwargs):
    """Report ``{{морфо…}}`` templates that have a hyphen in a parameter value.

    Every template matched anywhere in ``content`` is split on ``|``; for
    ``name=value`` parameters only the value part is checked.  The first
    parameter containing ``-`` causes the whole template text to be stored
    in ``self.morpho_wrong_hyphens_2`` (at most once per template).
    """
    title = page.title
    # Verbose pattern: whitespace and "#" comments inside it are ignored.
    # Groups: 0 = whole template, 1 = template name, 2 = parameters.
    morpho_re = re.compile(
        u"""(\{\{
        (?P<title>морфо[^|]*)  # заголовок
        (?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
        \}\})""",
        flags=re.UNICODE | re.DOTALL | re.VERBOSE)

    for found in morpho_re.findall(content):
        whole_template = found[0]
        for chunk in found[2].strip().split('|'):
            chunk = chunk.strip()
            if '=' in chunk:
                # Named parameter: keep only what follows the first "=".
                chunk = chunk.split('=', 1)[1]
            if '-' in chunk:
                append_dict_list(self.morpho_wrong_hyphens_2, title,
                                 whole_template)
                break
def section_action(self, page, lang, section_content):
    """Report a stress mark placed on a consonant (or ь/ъ) in a Russian section.

    The parent ``section_action`` is called first.  Only sections with
    ``lang == 'ru'`` are checked.  The first occurrence of a lowercase
    consonant, ``ь`` or ``ъ`` immediately followed by the combining acute
    accent is stored in ``self.wrong_stress`` under the page title.
    """
    super(WrongStress, self).section_action(page, lang, section_content)
    title = page.title
    if lang != 'ru':
        return
    # The character class previously lacked "ж" and "з", so a stress mark on
    # those consonants was never detected.  The combining acute accent is
    # written as an explicit \u0301 escape because it is invisible in most
    # editors when typed literally after "]".
    m = re.search(u'([бвгджзйклмнпрстфхцчшщьъ]\u0301)', section_content,
                  re.UNICODE)
    if m:
        append_dict_list(self.wrong_stress, title, m.group(1))
def section_action(self, page, lang, section_content):
    """Flag multi-word titles that use the single-word morphology header.

    The parent ``section_action`` is called first.  When the title contains
    a space and the section has a "Морфологические и синтаксические
    свойства" header followed later by another ``=`` line, the title is
    stored in ``self.wrong_header_collocation``.
    """
    super(WrongHeaderCollocation, self).section_action(page, lang, section_content)
    title = page.title
    if ' ' in title:
        pattern = u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)='
        if re.search(pattern, section_content, re.UNICODE | re.DOTALL):
            append_dict_list(self.wrong_header_collocation, title, '')
def section_action(self, page, lang, section_content):
    """Flag single-word titles that use the collocation header.

    The parent ``section_action`` is called first.  When the title has no
    space and the section has a "Тип и синтаксические свойства сочетания"
    header followed later by another ``=`` line, the title is stored in
    ``self.wrong_header_word``.
    """
    super(WrongHeaderWord, self).section_action(page, lang, section_content)
    title = page.title
    if ' ' not in title:
        pattern = u'(=== *Тип и синтаксические свойства сочетания *=== *(.*?)\n)='
        if re.search(pattern, section_content, re.UNICODE | re.DOTALL):
            append_dict_list(self.wrong_header_word, title, '')
def lang_action(self, page, lang, content):
    """Report language sections that contain exactly one second-level header.

    The parent ``lang_action`` is called first.  Titles with a colon and
    the main page are skipped.  When exactly one ``== … ==`` line is found
    in ``content``, ``(lang, header)`` is stored in
    ``self.second_level_only_one``.
    """
    super(SecondLevelOnlyOne, self).lang_action(page, lang, content)
    title = page.title
    if ":" in title or title == u"Заглавная страница":
        return
    second_level = re.findall("^==[^=].+== *$", content, flags=re.MULTILINE)
    if len(second_level) != 1:
        return
    append_dict_list(self.second_level_only_one, title,
                     (lang, second_level[0]))
def header_action(self, page, before, header, after): super(SyntaxErrors, self).header_action(page, before, header, after) title = page.title if ':' in title: return if title == u"Заглавная страница": # todo: move it to HeadersIterator return full_header = self.get_header(before, header, after) if before != after: append_dict_list(self.syntax_errors, title, full_header) if settings.ALLOW_CYR_PRINT: print '#syntax_errors:', title, u'—', full_header
def section_action(self, page, lang, section_content):
    """Report "Значение" blocks with unbalanced curly braces.

    The parent ``section_action`` is called first.  The text between the
    "==== Значение ====" header and the next line starting with ``=`` is
    extracted; when the numbers of ``{`` and ``}`` differ, that text is
    stored in ``self.wront_figure_brackets``.
    """
    super(MiningWrongFigureBrackets, self).section_action(page, lang, section_content)
    title = page.title
    found = re.search(u'(==== *Значение *==== *(.*?)\n)=', section_content,
                      re.UNICODE | re.DOTALL)
    if found is None:
        return
    body = found.group(2)
    opened = body.count('{')
    closed = body.count('}')
    if opened != closed:
        append_dict_list(self.wront_figure_brackets, title, body)
def section_action(self, page, lang, section_content):
    """Report all-uppercase Russian titles lacking an abbreviation template.

    The parent ``section_action`` is called first.  Only ``ru`` sections
    with a "Морфологические и синтаксические свойства" block are
    considered, and only when the title equals its upper-cased form.  If
    the block contains neither ``{{abbrev`` nor ``{{сокращ`` the title is
    stored in ``self.without_abbrev``.
    """
    super(WithoutAbbrevRu, self).section_action(page, lang, section_content)
    title = page.title
    if lang != 'ru':
        return
    found = re.search(
        u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
        section_content, re.UNICODE | re.DOTALL)
    if not found or title != title.upper():
        return
    body = found.group(2)
    has_marker = u'{{abbrev' in body or u'{{сокращ' in body
    if not has_marker:
        append_dict_list(self.without_abbrev, title, '')
def section_action(self, page, lang, section_content):
    """Collect whitespace-delimited tokens that mix Cyrillic and Latin text.

    The parent ``section_action`` is called first.  For every match of
    ``P.mixed_cyrl_latn`` the surrounding whitespace-delimited tokens are
    located and stored in ``self.content_mixed_cyrl_latn`` under a key
    chosen by the token's first character.  Matches containing ``j`` or
    ``ʲ`` and tokens matching ``[a-fX]СВ`` are skipped.
    """
    super(ContentMixedCyrlLatn, self).section_action(page, lang, section_content)
    title = page.title
    results = P.mixed_cyrl_latn.findall(section_content)
    if not results:
        return

    # First-character prefix -> report bucket; anything else goes to the
    # catch-all bucket.
    prefix_buckets = (
        ('|', u'Начинается с "верт. черта"'),
        ('[', u'Начинается с "квадр. скобка"'),
        ('{', u'Начинается с "фигурн. скобка"'),
    )

    for found in results:
        # NOTE(review): ``found`` is interpolated into the pattern without
        # re.escape(); presumably it only ever contains letters - confirm
        # against the definition of P.mixed_cyrl_latn.
        tokens = re.findall(u'\s\S*%s\S*\s' % found, section_content)
        if u'j' in found or u'ʲ' in found:
            continue
        for token in tokens:
            token = token.strip()
            if re.search(u'.*[a-fX]СВ.*', token):
                continue
            bucket = u'Остальные результаты'
            for prefix, name in prefix_buckets:
                if token.startswith(prefix):
                    bucket = name
                    break
            append_dict_list(self.content_mixed_cyrl_latn[bucket], title, token)
def header_action(self, page, before, header, after): super(WrongLevel, self).header_action(page, before, header, after) title = page.title if ":" in title: return if title == u"Заглавная страница": # todo: move it to HeadersIterator return full_header = self.get_header(before, header, after) level = len(before) header = header.strip() correct_level = header_levels.get(header, 0) if correct_level: correct_levels = [correct_level] else: correct_levels = [] has_affix = False for s in [u"{{suffix ", u"{{morph", u"Словообразовательная единица"]: if s in page.content: has_affix = True break if title.startswith("-") or title.endswith("-"): # это точно словообразовательная единица if header in [u"Значение", u"Антонимы", u"Синонимы"]: correct_levels = [3, 4] elif title.startswith("*"): # это точно праязык if header == u"Значение": correct_levels = [3, 4] elif has_affix: if header in [u"Значение", u"Антонимы", u"Синонимы"]: correct_levels = [3, 4] elif len(title) == 1: if header in [u"Значение"]: correct_levels = [3, 4] if correct_levels and level > 1: if level not in correct_levels: # skips = [ # # u'= {{-Hani-}} =', u'= {{-hani-}} =', u'= {{-hanzi-', # # u'= {{-mul-}} =', u'= {{-INT-}} =', u'= {{-Zmth-}} =', # # u'{{suffix ', u'{{morph', u'== Буква ==', # u'Словообразовательная единица', # ] # for skip in skips: # if skip in page.content: # return append_dict_list(self.wrong_level, title, full_header) if settings.ALLOW_CYR_PRINT: print "#wrong_level:", title, u"—", full_header
def mining_line_action(self, title, lang, line):
    """Classify lines that contain no ``{`` and do not start with ``|``.

    The parent ``mining_line_action`` is called first.  Lines rejected by
    ``ignore_start`` or ``ignore_italic`` are skipped.  The remaining lines
    are stored in ``self.mining_other`` under a key describing their use of
    square brackets.
    """
    super(MiningOther, self).mining_line_action(title, lang, line)
    if ignore_start(line) or ignore_italic(line):
        return
    if "{" in line or line.startswith("|"):
        return

    file_link = re.match(
        u"^\[\[(Файл:|File:|Image:|Изображение:).+\]\]$", line, re.UNICODE)
    if file_link:
        key = u"Файл или Image"
    elif line == "[[]]":
        key = u"только пустые квадратные скобки"
    elif "[" in line:
        key = u"квадратные скобки"
    else:
        key = u"остальное"
    append_dict_list(self.mining_other, key, (title, lang, line))
def section_action(self, page, lang, section_content):
    """Report Russian single-word sections without the morphology header.

    The parent ``section_action`` is called first.  Skipped: non-``ru``
    sections, titles with a space, titles starting/ending with ``-`` or
    starting with ``*``, sections containing ``{{Форма-`` and sections with
    no ``===`` at all.  Otherwise, if no "Морфологические и синтаксические
    свойства" header is present the title is stored in
    ``self.without_morpho_header``.
    """
    super(WithoutMorphoHeaderRu, self).section_action(page, lang, section_content)
    title = page.title
    if lang != 'ru':
        return
    special_title = (' ' in title or title.startswith('-')
                     or title.endswith('-') or title.startswith('*'))
    if special_title:
        return
    if u'{{Форма-' in section_content:
        return
    if '===' not in section_content:
        return
    has_header = re.search(
        u'=== *Морфологические и синтаксические свойства *===',
        section_content, re.UNICODE | re.DOTALL)
    if has_header:
        return
    append_dict_list(self.without_morpho_header, title, '')
def header_action(self, page, before, header, after): super(SecondLevelWrong, self).header_action(page, before, header, after) title = page.title if ':' in title: return if title == u"Заглавная страница": return full_header = self.get_header(before, header, after) header = header.strip() if before == '==' and after == '==': p = re.compile(u'^\{\{(з|заголовок)[^}]*\}\}$', re.UNICODE) m = p.match(header) if not m: if settings.ALLOW_CYR_PRINT: print '#second_level_wrong:', title, u'—', full_header append_dict_list(self.second_level_wrong, title, full_header)
def section_action(self, page, lang, section_content): super(HeadersDuplicates, self).section_action(page, lang, section_content) title = page.title if ':' in title: return for header, level in header_levels.items(): found = re.findall( u'{} *{} *{}'.format('=' * level, header, '=' * level), section_content ) if len(found) > 1: append_dict_list(self.headers_duplicates, title, (header, lang)) if settings.ALLOW_CYR_PRINT: print '#headers_duplicates:', title, u'—', header
def lang_action(self, page, lang, content):
    """Report known templates whose name is written with extra whitespace.

    The parent ``lang_action`` is called first.  For every template found
    by ``P.template_any`` inside a ``P.section_mining`` match, the raw name
    (text before the first ``|``) is compared with the combined list
    ``G.mining.templates.sure_sure + G.mining.labels.all``.  A name that is
    only in the list after stripping, and is not ``семантика``, is stored
    in ``self.mining_templates_spaces``.
    """
    super(MiningTemplatesSpaces, self).lang_action(page, lang, content)
    title = page.title
    for section in P.section_mining.findall(content):
        for match in P.template_any.findall(section[1]):
            template_text = match[0]
            raw_name = match[1].split('|')[0]
            known = G.mining.templates.sure_sure + G.mining.labels.all
            if raw_name in known:
                continue
            clean_name = raw_name.strip()
            if clean_name not in known:
                continue
            if clean_name == u'семантика':
                continue
            append_dict_list(self.mining_templates_spaces, raw_name,
                             (title, template_text))
def section_action(self, page, lang, section_content):
    """Report ``{{морфо|…}}`` templates in the morphology block with a hyphen in a value.

    The parent ``section_action`` is called first.  Only the text between
    the "Морфологические и синтаксические свойства" header and the next
    line starting with ``=`` is examined.  Each template's parameters are
    split on ``|``; for ``name=value`` parameters only the value is
    checked.  The first parameter containing ``-`` causes the whole
    template to be stored in ``self.morpho_wrong_hyphens``.
    """
    super(MorphoWrongHyphens, self).section_action(page, lang, section_content)
    title = page.title
    block = re.search(
        u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
        section_content, re.UNICODE | re.DOTALL)
    if block is None:
        return

    # Verbose pattern: whitespace and "#" comments inside it are ignored.
    # Groups: 0 = whole template, 1 = template name, 2 = parameters.
    morpho_re = re.compile(
        u"""(\{\{
        (?P<title>морфо\s*)  # заголовок
        \|(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
        \}\})""",
        flags=re.UNICODE | re.DOTALL | re.VERBOSE)

    for found in morpho_re.findall(block.group(2)):
        whole_template = found[0]
        for chunk in found[2].strip().split('|'):
            chunk = chunk.strip()
            if '=' in chunk:
                # Named parameter: keep only what follows the first "=".
                chunk = chunk.split('=', 1)[1]
            if '-' in chunk:
                append_dict_list(self.morpho_wrong_hyphens, title,
                                 whole_template)
                break
def content_action(self, page, content, redirect, **kwargs): super(OtherSyntaxErrors, self).content_action(page, content, redirect, **kwargs) title = page.title if ':' in title: return if title == u"Заглавная страница": # todo: move it to HeadersIterator return m = re.search('^(=+.*=+)([^=\n]+)$', content, re.MULTILINE) if m: ok = m.group(1) wrong = m.group(2) wrong = wrong.replace(' ', ' ') wrong = wrong.replace('\t', ' ' * 4) line = u"<nowiki>{}</nowiki><span style='background-color: #FFBBBB'><nowiki>{}</nowiki></span>".format(ok, wrong) append_dict_list(self.other_syntax_errors, title, line) if settings.ALLOW_CYR_PRINT: print '#syntax_errors:', title, u'—', line
def section_action(self, page, lang, section_content):
    """Report Russian morphology blocks that contain no ``{{морфо}}`` template.

    The parent ``section_action`` is called first.  Only ``ru`` sections
    with a "Морфологические и синтаксические свойства" block are checked.
    Blocks mentioning ``{{abbrev``, ``{{сокращ`` or ``Аббревиатура`` are
    skipped.  If neither ``{{морфо|…}}`` nor ``{{морфо-ru|…}}`` is found,
    the title is stored in ``self.without_morpho``.
    """
    super(WithoutMorphoRu, self).section_action(page, lang, section_content)
    title = page.title
    if lang != 'ru':
        return
    block = re.search(
        u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
        section_content, re.UNICODE | re.DOTALL)
    if block is None:
        return
    body = block.group(2)
    for marker in (u'{{abbrev', u'{{сокращ', u'Аббревиатура'):
        if marker in body:
            return

    # Verbose pattern: whitespace and "#" comments inside it are ignored.
    morpho_re = re.compile(
        u"""(\{\{
        (?P<title>морфо(-ru)?\s*)  # заголовок
        \|[^}]*  # параметры
        \}\})""",
        flags=re.UNICODE | re.DOTALL | re.VERBOSE)
    if morpho_re.search(body) is None:
        append_dict_list(self.without_morpho, title, '')
def lang_action(self, page, lang, content):
    """Validate the ``язык`` parameter of every category template in a section.

    The parent ``lang_action`` is called first.  For each match of
    ``P.template_category`` one problem at most is stored in
    ``self.category_template``: the parameter is missing, is not first, is
    empty, or names a language other than ``lang`` (with separate keys
    depending on whether ``{{-<value>-`` occurs in ``page.content``).
    """
    super(CategoryTemplate, self).lang_action(page, lang, content)
    title = page.title
    for item in P.template_category.findall(content):
        tpl, args = item[0], item[1]
        value = u"%s: %s" % (lang, tpl)

        problem = None
        if u'|язык=' not in args:
            problem = u'Параметр "язык" отсутствует'
        elif not args.startswith(u'|язык='):
            problem = u'Параметр "язык" не первый'
        else:
            lang_value = re.search(u'\|язык=([^|}]*)', args,
                                   re.UNICODE).group(1)
            if not lang_value:
                problem = u'Параметр "язык" пустой'
            elif lang_value != lang:
                if u'{{-%s-' % lang_value in page.content:
                    problem = u'Язык не соответствует (другая секция)'
                else:
                    problem = u'Язык не соответствует (секции нет)'

        if problem is not None:
            append_dict_list(self.category_template[problem], title, value)
def mining_line_action(self, title, lang, line):
    """Classify a line by the kind of character or marker it starts with.

    The parent ``mining_line_action`` is called first.  The checks are
    tried in order and the first one that matches decides the key under
    which ``(title, lang, line)`` is stored in ``self.mining_start``.
    Lines matching none of the checks are ignored.
    """
    super(MiningSpecificStart, self).mining_line_action(title, lang, line)

    # Ordered (predicate, key) pairs; order matters, e.g. "<!--" must be
    # tested before the more general "<".
    checks = (
        (lambda s: re.match(r'^[0-9]', s), u'цифра'),
        (lambda s: s.startswith(':'), u'двоеточие'),
        (lambda s: s.startswith('*'), u'звёздочка'),
        (lambda s: s.startswith('<!--'), u'комментарий'),
        (lambda s: s.startswith('<'), u'тег'),
        (lambda s: re.match(u'^[А-Я]', s, re.UNICODE), u'большая буква'),
        (lambda s: re.match(r'^[a-z]', s, re.IGNORECASE), u'латиница'),
        (lambda s: re.match(u'^[-«¤,.■"—]', s, re.UNICODE),
         u'необычный символ'),
    )
    for matches, key in checks:
        if matches(line):
            append_dict_list(self.mining_start, key, (title, lang, line))
            return
def lang_action(self, page, lang, content):
    """Validate the arguments of every word-length template in a section.

    The parent ``lang_action`` is called first.  Titles containing a space
    are skipped.  For each match of ``P.template_length`` the arguments
    must look like ``|<digits>|<language>``; otherwise the template is
    reported as non-standard.  A language different from ``lang`` and a
    length different from ``len(title)`` are each reported under their own
    key in ``self.length_template``.
    """
    super(LengthTemplate, self).lang_action(page, lang, content)
    title = page.title
    if u' ' in title:
        return

    for item in P.template_length.findall(content):
        tpl, args = item[0], item[1]
        value = u"%s: %s" % (lang, tpl)

        if re.match(u'\|\d+\|[^}|+]', args) is None:
            append_dict_list(
                self.length_template[u'Параметры расположены нестандартно'],
                title, value)
            continue

        parsed = re.search(u'\|(\d+)\|(lang=)?([^|}]+)', args, re.UNICODE)
        len_value = parsed.group(1)
        lang_value = parsed.group(3)

        if lang_value != lang:
            if u'{{-%s-' % lang_value in page.content:
                lang_key = u'Язык не соответствует (другая секция)'
            else:
                lang_key = u'Язык не соответствует (секции нет)'
            append_dict_list(self.length_template[lang_key], title, value)

        # The length check runs even when the language was already reported.
        len_title = len(title)
        if len_title != int(len_value):
            append_dict_list(
                self.length_template[u'Длина возможно неверная'],
                title, value + (u" -> %s" % len_title))
def mining_line_action(self, title, lang, line):
    """Classify lines that contain templates (curly braces).

    The parent ``mining_line_action`` is called first.  Lines rejected by
    ``ignore_start`` or ``ignore_italic`` and lines starting with
    ``{{прото|`` are skipped.  The remaining lines are matched against a
    fixed sequence of patterns; the first match decides the key under which
    ``(title, lang, line)`` is stored in ``self.mining_figure``.  Lines
    matching nothing and containing no ``{`` are ignored.
    """
    super(MiningFigure, self).mining_line_action(title, lang, line)
    if ignore_start(line) or ignore_italic(line):
        return
    if re.match(u'^\{\{прото\|.*$', line, re.UNICODE):
        return

    # Whole-line single-template patterns, tried first and in this order.
    single_templates = (
        (u'^\{\{Нужен перевод *(\|\w+)?\}\}$$', u'Нужен перевод'),
        (u'^\{\{(длина слова)\|[^}]+\}\}$', u'длина слова'),
        (u'^\{\{(илл\.?)\|[^}]+\}\}$', u'шаблон илл'),
    )
    key = None
    for pattern, name in single_templates:
        if re.match(pattern, line, re.UNICODE):
            key = name
            break

    if key is None:
        list_prefixes = (u'}} {{пример',
                         u'{{списки семантических связей',
                         u'|')
        if line.startswith(list_prefixes):
            key = u'списки семантических связей'
        elif re.match(u'^\{\{(музы|месяцы|неделя) \w+\}\}$', line, re.UNICODE):
            key = u'музы, месяцы, неделя'
        elif "{" in line:
            key = u'другие'
        else:
            return
    append_dict_list(self.mining_figure, key, (title, lang, line))
def section_action(self, page, lang, section_content):
    """Collect every ``{{морфо|…}}`` template in a Russian morphology block.

    The parent ``section_action`` is called first.  Only ``ru`` sections
    are processed.  Each template found between the "Морфологические и
    синтаксические свойства" header and the next line starting with ``=``
    is stored (whole text) in ``self.old_morpho``.
    """
    super(OldMorphoRu, self).section_action(page, lang, section_content)
    title = page.title
    if lang != 'ru':
        return
    block = re.search(
        u'(=== *Морфологические и синтаксические свойства *=== *(.*?)\n)=',
        section_content, re.UNICODE | re.DOTALL)
    if block is None:
        return

    # Verbose pattern: whitespace and "#" comments inside it are ignored.
    # Group 0 of each findall() tuple is the whole template text.
    morpho_re = re.compile(
        u"""(\{\{
        (?P<title>морфо\s*)  # заголовок
        \|(?P<params>[^{}]+?(\{\{[^{}]+?\}\})?[^{}]+?)  # параметры
        \}\})""",
        flags=re.UNICODE | re.DOTALL | re.VERBOSE)
    for found in morpho_re.findall(block.group(2)):
        append_dict_list(self.old_morpho, title, found[0])
def section_action(self, page, lang, section_content):
    """Collect tokens around a lone Latin letter adjacent to a Cyrillic one.

    The parent ``section_action`` is called first.  Every occurrence of a
    Cyrillic character, whitespace, a Latin character and whitespace (or
    the mirrored order) is looked up again together with its surrounding
    non-whitespace text.  Each resulting token is stored in
    ``self.content_mixed_cyrl_latn_extra`` under a key chosen from the
    letters present in the original match.
    """
    super(ContentMixedCyrlLatnExtra, self).section_action(page, lang, section_content)
    title = page.title
    results = regex.findall(
        u'\p{IsCyrillic}\s\p{isLatin}\s|\s\p{isLatin}\s\p{IsCyrillic}',
        section_content, re.UNICODE)
    if not results:
        return

    for found in results:
        tokens = re.findall(u'\s\S*%s\S*\s' % found, section_content)
        for token in tokens:
            # The bucket depends on the match itself, not on the token.
            if check_in([u'I', u'X', u'V'], found):
                bucket = u'Roman'
            elif check_in([u'c', u'o'], found):
                bucket = u'Wrong?'
            elif re.search(u'[bdfghkmnqrstuvwzDFGLNQRSUVWYZ]', found):
                bucket = u'Okay?'
            else:
                bucket = u'Other'
            append_dict_list(self.content_mixed_cyrl_latn_extra[bucket],
                             title, token)
def lang_action(self, page, lang, content):
    """Validate label templates found inside ``P.section_mining`` matches.

    The parent ``lang_action`` is called first.  Only templates whose
    stripped name is in ``G.mining.labels.all`` are checked.  At most one
    problem per template is stored in ``self.labels_language_IM``: too many
    parameters for the label's arity list, an empty language parameter, or
    a language parameter different from ``lang``.
    """
    super(LabelsLanguageInMining, self).lang_action(page, lang, content)
    title = page.title
    labels = G.mining.labels
    for section in P.section_mining.findall(content):
        for item in P.template_any.findall(section[1]):
            tpl = item[0]
            values = item[1].split('|')
            report_value = u"%s: %s" % (lang, tpl)
            name = values[0].strip()
            if name not in labels.all:
                continue

            # TODO (from the original): also check "перен." for lead-in
            # and language?
            count = len(values)
            too_many = (
                (name in labels.args_2 and count > 3
                 and name not in labels.args_3)
                or (name in labels.args_1 and count > 2)
                or (name in labels.args_0 and count > 1)
            )
            if too_many:
                append_dict_list(
                    self.labels_language_IM[u'Лишние параметры'],
                    title, report_value)
                continue

            if name not in (labels.args_1 + labels.args_2):
                continue
            lang_value = values[1] if count > 1 else ''
            if not lang_value:
                append_dict_list(
                    self.labels_language_IM[u'Параметр языка пустой'],
                    title, report_value)
            elif lang_value != lang:
                append_dict_list(
                    self.labels_language_IM[u'Язык не соответствует'],
                    title, report_value)
def sub_section_action(self, page, lang, sub_header, sub_section_content):
    """Validate label templates in sub-sections other than "Значение".

    The parent ``sub_section_action`` is called first.  Sub-sections whose
    header contains ``Значение`` are skipped.  Only templates whose
    stripped name is in ``G.mining.labels.all`` are checked.  At most one
    problem per template is stored in ``self.labels_language_NIM``: too
    many parameters, an empty language parameter, or a language parameter
    other than ``-``.
    """
    super(LabelsLanguageNotInMining, self).sub_section_action(
        page, lang, sub_header, sub_section_content)
    title = page.title
    if u'Значение' in sub_header:
        return

    labels = G.mining.labels
    for item in P.template_any.findall(sub_section_content):
        tpl = item[0]
        values = item[1].split('|')
        report_value = (tpl, lang, sub_header.replace('=', '').strip())
        name = values[0].strip()
        if name not in labels.all:
            continue

        # TODO (from the original): also check "перен." for lead-in and
        # language?
        count = len(values)
        too_many = (
            (name in labels.args_2 and count > 3
             and name not in labels.args_3)
            or (name in labels.args_1 and count > 2)
            or (name in labels.args_0 and count > 1)
        )
        if too_many:
            append_dict_list(
                self.labels_language_NIM[u'Лишние параметры'],
                title, report_value)
            continue

        if name not in (labels.args_1 + labels.args_2):
            continue
        lang_value = values[1] if count > 1 else ''
        if not lang_value:
            append_dict_list(
                self.labels_language_NIM[u'Параметр языка пустой'],
                title, report_value)
        elif lang_value != '-':
            # Outside "Значение" the language slot is expected to be "-".
            append_dict_list(
                self.labels_language_NIM[u'Язык не соответствует'],
                title, report_value)
def section_action(self, page, lang, section_content):
    """Report single-word sections that lack the morphology header.

    The parent ``section_action`` is called first.  Skipped: titles
    starting/ending with ``-`` or starting with ``*``, a fixed list of
    language codes, and sections for which ``convert_headers`` returns
    nothing.  When the title has no space and the converted headers do not
    include "=== Морфологические и синтаксические свойства ===", the
    section is stored in ``self.no_morphology`` under "Все результаты" and
    under one more key describing how the section text begins.
    """
    super(NoMorphology, self).section_action(page, lang, section_content)
    title = page.title
    if title.startswith('-') or title.endswith('-'):
        return
    if title.startswith('*'):
        return
    if lang in ['INT', 'mul', 'Zmth', 'Hani', 'hani', 'hanzi']:
        return
    headers = convert_headers(self.get_headers(section_content))
    if not headers:
        return
    if ' ' in title:
        return
    if u'=== Морфологические и синтаксические свойства ===' in headers:
        return

    append_dict_list(self.no_morphology[u'Все результаты'], title, lang)

    stripped = section_content.strip()
    pos_words = (u'Существительное', u'Прилагательное', u'Глагол', u'Наречие')
    if stripped.startswith(pos_words):
        key = u'Часть речи'
    # The pattern must be a unicode literal: as a Python 2 byte string its
    # Cyrillic alternatives could never match unicode section text.
    elif re.search(u'^\{\{(сущ|прил|гл|adv|падежи|нар|interj) ', stripped,
                   re.UNICODE):
        key = u'Шаблон часть речи'
    elif stripped.startswith(u'{{Форма'):
        key = u'Шаблон форма'
    elif stripped.startswith(u'{{длина слова'):
        key = u'Шаблон длина слова'
    elif stripped.startswith(u'<b>'):
        key = u'Жирность1'
    elif stripped.startswith(u"'''"):
        key = u'Жирность2'
    elif u'Тип и синтаксические свойства сочетания' in section_content:
        key = u'Словосочетания'
    elif stripped.startswith(u'==='):
        key = u'Пусто'
    else:
        key = u'Остальное'
    append_dict_list(self.no_morphology[key], title, lang)

    if settings.ALLOW_CYR_PRINT:
        print(u'absent # [[{}]] (секция "{}")'.format(title, lang))
def headers_action(self, page, headers, redirect):
    """Record pages whose only structural problem is missing headers.

    Titles with a colon, the main page and redirects are skipped.  The
    headers are grouped per language with ``self.group_headers`` and each
    group is compared with the matching entry of ``templates`` (``ru`` or
    ``xx``, ``word`` or ``phrase`` depending on whether the title contains
    a space).  Template headers missing from a section are collected per
    language and added to ``self.absent_headers``.  The page is stored in
    ``self.header_absent`` only if something is missing and no section has
    an unknown header or headers out of template order.  Progress is
    printed depending on ``self.i``.
    """
    # NOTE(review): this forwards to the parent's ``header_action``
    # (singular) and passes ``page`` twice; it looks like it was meant to
    # be ``headers_action(page, headers, redirect)``.  Left unchanged
    # because the parent class is not visible here - confirm.
    super(WrongStructure, self).header_action(page, page, headers, redirect)
    title = page.title
    if ':' in title:
        return
    # TODO: move the main-page exclusion to HeadersIterator.
    if title == u"Заглавная страница":
        return
    if redirect:
        return

    current_absent = dict()
    has_unknown_header = False
    wrong_order = False

    langs, lang_sections = self.group_headers(headers)
    kind = 'phrase' if ' ' in title else 'word'
    for lang in langs:
        converted_section = convert_headers(lang_sections[lang])
        lang_key = 'ru' if lang == u'{{-ru-}}' else 'xx'
        template = templates[lang_key][kind]

        if converted_section == [] or converted_section == template:
            # Empty or exactly as expected: nothing to report.
            continue

        # Template headers missing from this section.  This list was
        # previously created but never filled, so the order check below
        # could not skip over absent headers.
        absent_here = [h for h in template if h not in converted_section]
        for header in absent_here:
            append_dict_list(current_absent, lang, header)
            self.absent_headers.add(header)

        # Walk template and section in parallel, skipping template entries
        # known to be absent; any other mismatch means wrong order.
        t = 0
        s = 0
        while t < len(template) and s < len(converted_section):
            if template[t] == converted_section[s]:
                t += 1
                s += 1
            elif template[t] in absent_here:
                t += 1
            else:
                wrong_order = True
                # Stop here: neither index advances on this branch, so
                # without the break the loop would never terminate.
                break

        if any(header not in template for header in converted_section):
            has_unknown_header = True

    if current_absent and not has_unknown_header and not wrong_order:
        self.header_absent[title] = current_absent

    if not self.i % 1000:
        print(len(self.header_absent))
    if not self.i % 10000:
        for header in self.absent_headers:
            print(header)