Beispiel #1
0
 def process(self):
     """Feed ``self.words`` to ``self._process_chunk`` chunk by chunk.

     A running counter grows by ``self.chunk_len`` for every chunk (the
     nominal size, not the actual length of the chunk) and is printed each
     time it is a multiple of 10000.
     """
     counted = 0
     for batch in chunks(self.words, self.chunk_len):
         counted += self.chunk_len
         if counted % 10000 == 0:
             print(counted)
         self._process_chunk(batch)
Beispiel #2
0
 def bulk(self, items, model=None, chunk_size=1000):
     if not model:
         model = self.model
     processed = 0
     for chunk in chunks(items, chunk_size):
         processed += len(model.objects.bulk_create(chunk))
         print dt(), '-> Processed:', processed
Beispiel #3
0
 def prepare_langs(self):
     """Split ``self.content`` with ``P.lang_header`` into language parts.

     The text before the first header is stored under the empty key ``''``.
     The rest of the split result is consumed in pairs: the first item of
     each pair is the key, the second is its text.  Keys are appended to
     ``self.langs`` in order and texts are stored in ``self.lang_contents``.
     """
     pieces = P.lang_header.split(self.content)
     self.langs.append('')
     self.lang_contents[''] = pieces.pop(0)
     for pair in chunks(pieces, 2):
         key = pair[0]
         self.langs.append(key)
         self.lang_contents[key] = pair[1]
Beispiel #4
0
    def lang_action(self, page, lang, content):
        """Insert a missing morphology header at the top of level-2 sections.

        ``content`` is returned untouched when the page title starts or ends
        with ``-`` or starts with ``*``, or when any level-2 header is not a
        ``{{з...}}`` / ``{{заголовок...}}`` template.  Otherwise, for titles
        without a space, every section whose converted headers are non-empty
        and lack ``=== Морфологические и синтаксические свойства ===`` and
        whose text begins (ignoring leading whitespace) with ``<b>`` or
        ``{{падежи `` gets that header inserted in front of the marker.
        """
        title = page.title
        if title.startswith('-') or title.endswith('-') \
                or title.startswith('*'):
            return content

        pieces = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                          flags=re.MULTILINE)
        sections = [{'header2': '', 'content': pieces.pop(0)}]
        for pair in chunks(pieces, 2):
            sections.append({'header2': pair[0], 'content': pair[1]})

        # Give up as soon as one level-2 header is not a title template.
        title_template = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$',
                                    re.UNICODE)
        for section in sections:
            header2 = section['header2']
            if header2 and not title_template.match(header2):
                return content

        # (opening marker, pattern matching it after optional whitespace,
        #  replacement that puts the morphology header before the marker)
        rewrites = (
            (u'<b>',
             u'^\s*<b>',
             u'\n=== Морфологические и синтаксические свойства ===\n<b>'),
            (u'{{падежи ',
             u'^\s*\{\{падежи ',
             u'\n=== Морфологические и синтаксические свойства ===\n{{падежи '),
        )

        single_word = ' ' not in title
        result = content
        for section in sections:
            text = section['content']
            headers = convert_headers(self.get_headers(text))
            if not headers or not single_word:
                continue
            if u'=== Морфологические и синтаксические свойства ===' in headers:
                continue

            stripped = text.strip()
            for marker, pattern, replacement in rewrites:
                if stripped.startswith(marker):
                    result = result.replace(
                        text, re.sub(pattern, replacement, text))
                    break
        return result
Beispiel #5
0
 def update_words_details(self):
     words = self.words.values_list('word', flat=True)
     field_name = self.details_field_name()
     kwargs = {field_name: 1}
     print len(words), 'by', 1000
     i = 0
     for chunk in chunks(words, 1000):
         i += 1
         print i
         Word_Stats.objects.filter(word__in=chunk).update(**kwargs)
Beispiel #6
0
 def prepare_sections(self):
     """Split every language text into second-level sections.

     For each entry of ``self.lang_contents`` the text is split with
     ``P.second_header``.  The text before the first header is stored under
     the empty key ``''``; the remaining pieces are consumed in pairs
     (header, text).  ``self.sections[lang]`` maps header to text and
     ``self.second_headers[lang]`` lists the headers in order of appearance.
     """
     for lang, lang_content in self.lang_contents.items():
         bodies = {}
         self.sections[lang] = bodies
         ordering = []
         self.second_headers[lang] = ordering

         pieces = P.second_header.split(lang_content)
         ordering.append('')
         bodies[''] = pieces.pop(0)
         for pair in chunks(pieces, 2):
             header = pair[0]
             ordering.append(header)
             bodies[header] = pair[1]
Beispiel #7
0
def parse_block_content(content):
    parts = re.split('\n=== ([^=]*) ===\n', content)
    content = parts[0]
    sub_data = dict()
    for part in chunks(parts[1:], 2):
        sub_title = part[0]
        sub_content = part[1]
        if sub_title in sub_data:
            print '×' * 20, 'Subtitle duplicate: %s' % sub_title
        sub_data[sub_title] = sub_content
    return content, sub_data
Beispiel #8
0
 def content_action(self, page, content, redirect, **kwargs):
     """Split the page text into language sections and pass them on.

     Calls the parent ``content_action`` first, then splits ``content`` with
     ``P.lang_header``.  The text before the first header becomes a section
     with an empty ``lang``; the remaining pieces are consumed in pairs as
     (``lang``, ``content``).  The list of section dicts is handed to
     ``self.langs_action``.
     """
     super(BaseLanguageIterator, self).content_action(page, content,
                                                      redirect, **kwargs)
     pieces = P.lang_header.split(content)
     sections = [{'lang': '', 'content': pieces.pop(0)}]
     for pair in chunks(pieces, 2):
         sections.append({'lang': pair[0], 'content': pair[1]})
     self.langs_action(page, sections)
Beispiel #9
0
 def lang_action(self, page, lang, content):
     """Split a language section on header lines and delegate the result.

     Calls the parent ``lang_action`` first.  ``content`` is then split on
     every line that starts with ``==`` and ends with ``==`` (optionally
     followed by spaces).  The leading text gets an empty ``sub_header``;
     the remaining pieces are consumed in pairs (header line, text).
     Returns whatever ``self.sub_sections_action`` returns.
     """
     super(BasePageSubSectionIterator, self).lang_action(page, lang, content)
     pieces = re.split("^((?:==)(?:.*)(?:==) *)$", content,
                       flags=re.MULTILINE)
     sub_sections = [{'sub_header': '', 'content': pieces.pop(0)}]
     for pair in chunks(pieces, 2):
         sub_sections.append({'sub_header': pair[0], 'content': pair[1]})
     return self.sub_sections_action(page, lang, sub_sections, content)
Beispiel #10
0
 def process(self):
     """Record the word count on the dictionary, then process every chunk.

     Stores ``len(self.words)`` in ``self.dictionary.words_count`` and saves
     the dictionary.  ``self.i`` is reset to 0 and incremented before each
     call to ``self._process_chunk``; ``self.n`` is set to the number of
     full chunks of ``self.chunk_len`` words.
     """
     self.dictionary.words_count = len(self.words)
     self.dictionary.save()

     self.i = 0
     # Explicit floor division: identical to Python 2's integer ``/`` but
     # stays an integer if true division is ever in effect.
     self.n = len(self.words) // self.chunk_len
     for chunk in chunks(self.words, self.chunk_len):
         self.i += 1
         self._process_chunk(chunk)
Beispiel #11
0
    def lang_action(self, page, lang, content):
        """Split a language section into level-2 sections and delegate.

        Calls the parent ``lang_action`` first, then splits ``content`` with
        ``P.second_header``.  The leading text gets an empty ``header2``; the
        remaining pieces are consumed in pairs (header, text).
        ``self.sections_action`` is called only when
        ``self.check_if_all_headera_are_correct`` accepts the sections.
        Always returns ``None``.
        """
        super(BasePageSectionIterator, self).lang_action(page, lang, content)
        pieces = P.second_header.split(content)
        sections = [{'header2': '', 'content': pieces.pop(0)}]
        for pair in chunks(pieces, 2):
            sections.append({'header2': pair[0], 'content': pair[1]})

        # Process the sections only when every header passes the check.
        if self.check_if_all_headera_are_correct(sections):
            self.sections_action(page, lang, sections, content)
Beispiel #12
0
 def prepare_sub_sections(self):
     """Split every second-level section into sub-sections.

     For each ``lang`` / ``header2`` pair in ``self.sections`` the text is
     split on lines that start with ``==`` and end with ``==`` (optionally
     followed by spaces).  The text before the first such line is stored
     under the empty key ``''``.  Header lines are normalised to
     ``"<equals> <title> <equals>"`` with the title stripped of surrounding
     whitespace.  ``self.sub_sections[lang][header2]`` maps header to text
     and ``self.sub_headers[lang][header2]`` lists headers in order.

     A header line that cannot be normalised (for example one made only of
     ``=`` characters, or whose title contains ``=``) is kept verbatim
     instead of raising ``AttributeError``.
     """
     header_line = re.compile("^((?:==)(?:.*)(?:==) *)$", flags=re.MULTILINE)
     header_parts = re.compile(u"^(=+)([^=]+)(=+) *$")
     for lang, lang_data in self.sections.items():
         self.sub_headers[lang] = dict()
         self.sub_sections[lang] = dict()
         for header2, content2 in lang_data.items():
             contents = self.sub_sections[lang][header2] = dict()
             order = self.sub_headers[lang][header2] = list()
             parts = header_line.split(content2)
             order.append('')
             contents[''] = parts.pop(0)
             for part in chunks(parts, 2):
                 sub_header = part[0]
                 m = header_parts.match(sub_header)
                 if m:
                     sub_header = u"%s %s %s" % \
                                  (m.group(1), m.group(2).strip(), m.group(3))
                 # else: the split pattern is looser than the normaliser,
                 # so keep the raw header rather than crash on it.
                 order.append(sub_header)
                 contents[sub_header] = part[1]
Beispiel #13
0
def parse_mass_edit(content):
    parts = re.split('\n== ([^=]*) ==\n', content)
    blocks = dict()
    sub_blocks = dict()
    for part in chunks(parts[1:], 2):
        title = part[0].strip()
        content = part[1]
        content, sub_data = parse_block_content(content)
        m = re.match(u'^\[\[([^]]+)\]\]$', title)
        if not m:
            print '×' * 20, 'Wrong title: need [[...]], found: %s' % title
            continue
        title = m.group(1)
        if title in blocks:
            print '×' * 20, 'Title duplicate: %s' % title
            continue
        blocks[title] = content
        sub_blocks[title] = sub_data
        # print '-' * 100
        # print title
        # print '-' * 100
        # print content
    return blocks, sub_blocks
def get_unknown_inflection_full():
    """Build and publish paged reports for words with unknown inflection.

    Loads ``WordInflection`` rows whose ``kind`` is ``f ?``, ``m ?`` or
    ``n ?``, sorted by the reversed word.  Each word is checked with
    ``check_correct_inflection``; negative results are skipped and the rest
    are grouped under the key ``"<gender><result>"``.  Every group is split
    into pages of 100 entries.  Each page is rendered with
    ``gen_report_full``, written with ``debug_write`` and saved with
    ``save_wiki_page``; when that reports a change, the page text is also
    stored in ``WordInflectionMassEdit``.
    """
    page_size = 100

    items = WordInflection.objects.filter(kind__in=[u'f ?', u'm ?', u'n ?'])
    words = sorted(items, key=lambda item: item.word[::-1])
    report = dict()
    for word in words:
        print(word.word)
        valid_num = check_correct_inflection(word.word, word.gender)
        if valid_num < 0:
            continue
        key = "%s%d" % (word.gender, valid_num)
        report.setdefault(key, []).append((word.word, word.content))

    file_path = join(PAGES_DIR, u'Массовое редактирование')
    wiki_prefix = u"Участник:Vitalik/Массовое редактирование/Словоизменение/сущ"
    desc = u"Полное обновление данных"
    for key, entries in report.items():
        print(key)
        # Ceiling division: `len / 100 + 1` counted one page too many
        # whenever the group size was an exact multiple of the page size.
        page_count = (len(entries) + page_size - 1) // page_size
        for i, chunk in enumerate(chunks(entries, page_size), 1):
            # NOTE(review): key[1] is only the first character after the
            # gender; confirm the check result never exceeds one digit.
            content = gen_report_full(chunk, key[0], key[1], page_count)
            filename = "%s_%d.txt" % (key, i)
            print(filename)
            debug_write(file_path, filename, content)
            wiki_title = "%s/%s/%s" % (wiki_prefix, key, i)
            changed = save_wiki_page(wiki_title, content, desc)
            if changed:
                db_title = u'сущ/%s/%s' % (key, i)
                item, created = WordInflectionMassEdit.objects.get_or_create(
                    title=db_title)
                item.content = content
                item.save()
Beispiel #15
0
    def lang_action(self, page, lang, content):
        """Rewrite italic usage labels in "Значение" blocks as templates.

        ``content`` is returned untouched when the page title starts or ends
        with ``-`` or starts with ``*``, when any level-2 header is not a
        ``{{з...}}`` / ``{{заголовок...}}`` template, or, for the title
        ``օժանդակ բայ``, as soon as a section with headers is reached.

        Otherwise, in every section whose converted header list is not
        ``[]``, the text from ``==== Значение ====`` up to the next line
        starting with ``=`` is scanned.  Lines that are non-empty, do not
        start with ``#`` and contain ``''`` have each known label of the
        form ``''abbr.''`` replaced with ``{{abbr.|<lang>}}``.  If no ``''``
        is left after the replacement the line is also prefixed with ``# ``.
        Each changed line is printed (old line, new line, empty line) and
        substituted in the block.  Returns the updated text.
        """
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]

        # Every level-2 header must be a title template, else leave the page.
        header_re = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$',
                               re.UNICODE)
        for data in sections:
            header2 = data['header2']
            if header2 and not header_re.match(header2):
                return content

        # Captures the "Значение" block (group 1) and its body (group 2).
        meaning_re = re.compile(u'(==== *Значение *==== *(.*?)'
                                u'\n)=',
                                re.UNICODE | re.DOTALL)
        # Known usage labels written in wiki italics, e.g. ''разг.''
        label_re = re.compile(
            u"''(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)\.''")

        new_content = content
        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))
            if headers == []:
                continue

            # Hard-coded exclusion: this page is never modified.
            if title in [u'օժանդակ բայ']:
                return content

            m = meaning_re.search(section_content)
            if not m:
                continue

            semantic_section = m.group(1)
            new_semantic_section = semantic_section
            for line in m.group(2).split('\n'):
                line = line.strip()
                if not line or line.startswith('#') or "''" not in line:
                    continue

                new_line = label_re.sub(u'{{{{\\1.|{}}}}}'.format(lang), line)
                if new_line == line:
                    continue
                if "''" not in new_line:
                    # No wiki italics left: turn the line into a list item.
                    new_line = u'# ' + new_line
                print(line)
                print(new_line)
                print('')
                new_semantic_section = \
                    new_semantic_section.replace(line, new_line)

            new_content = new_content.replace(semantic_section,
                                              new_semantic_section)

        return new_content
Beispiel #16
0
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        new_content = content

        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        for data in sections:
            header2 = data['header2']
            if not header2:
                continue
            p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
            m = p.match(header2)
            if not m:
                print header2, '==', '#' * 120
                return content

        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == 'ru':
            if ' ' in title:
                template = templates['ru']['phrase']
            else:
                template = templates['ru']['word']
        else:
            if ' ' in title:
                template = templates['xx']['phrase']
            else:
                template = templates['xx']['word']

        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                c = 0
                for header in headers:
                    if header == u'==== Значение ====':
                        c += 1
                if c > 1:
                    print title, '$' * 200
                    return content
                    # raise Exception('c > 1')

                if u'=== Семантические свойства ===' not in headers \
                        and u'==== Значение ====' in headers:
                    # print title, '/', lang
                    # print '\n'.join(headers)

                    page_content = page.content
                    lst = re.findall(u'==== Значение ====', page_content)
                    # if len(lst) == 1:
                    #     print title

                    if len(lst) > 1:
                        new_section_content = section_content.replace(
                            u'\n==== Значение ====\n',
                            u'\n=== Семантические свойства ===\n\n==== Значение ====\n',
                        )
                        new_content = new_content.replace(section_content,
                                                          new_section_content)
        return new_content
Beispiel #17
0
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        new_content = content

        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        for data in sections:
            # print '-' * 80
            header2 = data['header2']
            # print header2
            if not header2:
                continue
            p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
            m = p.match(header2)
            if not m:
                # print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == 'ru':
            if ' ' in title:
                template = templates['ru']['phrase']
            else:
                template = templates['ru']['word']
        else:
            if ' ' in title:
                template = templates['xx']['phrase']
            else:
                template = templates['xx']['word']

        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            # elif headers == template:
            #     # print 'OK'
            #     pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                # if absent_semantic_headers or wrong_order_error:
                #     # print u'{} #{} {}'.format(title, lang, data['header2'])
                #     # print '\n'.join(headers)
                #     # if absent:
                #     #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                #     #                      for header in absent])
                #     # if wrong_order_error:
                #     #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                #     # if unknown_headers:
                #     #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                #     #                      for header in unknown_headers])
                #     # print
                #     pass
                # elif unknown_headers:
                #     pass
                # else:
                if True:
                    m = re.search(u'(==== *Значение *==== *(.*?)'
                                  u'\n)===',
                                  section_content, re.UNICODE | re.DOTALL)
                    if not m:
                        # print '#' * 100
                        # print u'title={}, lang={}'.format(title, lang)
                        # print '#' * 100
                        continue
                        # raise Exception(u'title={}, lang={}'.format(title, lang))

                    semantic_section = m.group(1)
                    new_semantic_section = semantic_section

                    mining = m.group(2)
                    # if mining.strip() == u'[[]]\n{{Нужен перевод}}':
                    #     print section_content
                    #     print '=' * 120

                    has_strange = False

                    for line in mining.split('\n'):
                        line = line.strip()

                        # items = re.findall('\{\{[^}]+\}\}', line)
                        # for item in items:
                        #     if not item.startswith(u'{{пример|'):
                        #         print item

                        # items = re.findall(u'\{\{помета\|[^}]+\}\}', line)
                        # items = re.findall(u'\{\{помета\|[^}|]*\|[^}]*\}\}', line)
                        # for item in items:
                        #     print item

                        # items = re.findall(u'\{\{спорт.\|[^}]*вид[^}]*\}\}', line)
                        # for item in items:
                        #     print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                        #         format(title, lang, item)

                        # items = re.findall(u'\{\{субстантивир\.\|[^}]*\|[^}]*\}\}', line)
                        # for item in items:
                        #     print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                        #         format(title, lang, item)

                        # items = re.findall(u'\{\{ласк\..*\}\}', line)
                        # for item in items:
                        #     print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                        #         format(title, lang, item)

                        if line.startswith('#'):
                            print line

                        if not line.strip():
                            continue
                        if re.match('^#', line):
                            continue

                        if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line,
                                    re.UNICODE):
                            continue
                        if re.match(u'^\{\{(длина слова|илл\.?)\|[^}]+\}\}$', line,
                                    re.UNICODE):
                        # if re.match(u'^\{\{(длина слова)\|[^}]+\}\}$', line,
                        #             re.UNICODE):
                        # if re.match(u'^\{\{(илл\.?)\|[^}]+\}\}$', line,
                        #             re.UNICODE):
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            continue

                        # todo: [[Файл: и прочие IMG

                        if re.match(u'^\[\[\]\]$', line,
                                    re.UNICODE):
                            continue

                        if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод *(\|\w+)?\}\}$', line,
                                    re.UNICODE):
                            continue

                        ok = False
                        if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            ok = True

                        if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            ok = True
                        if ok:
                            # new_semantic_section = \
                            #     new_semantic_section.replace(
                            #         u"\n{}\n".format(line),
                            #         u'\n# {}\n'.format(line)
                            #     )
                            continue

                        has_strange = True

                        if lang == 'la':
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            pass
                        elif "''" in line:
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            pass
                        elif "<i>" in line:
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            pass
                        elif "{" in line:
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            pass
                        elif "[" in line:
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            pass
                        else:
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            pass

                        if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            continue
                        if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            continue

                    # if not has_strange:
                    #     new_content = new_content.replace(semantic_section,
                    #                                       new_semantic_section)

        return new_content
Beispiel #18
0
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        new_content = content

        # print page.title, '=' * 100
        # print content
        # print '=' * 100
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        for data in sections:
            # print '-' * 80
            header2 = data['header2']
            # print header2
            if not header2:
                continue
            p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
            m = p.match(header2)
            if not m:
                print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == 'ru':
            if ' ' in title:
                template = templates['ru']['phrase']
            else:
                template = templates['ru']['word']
        else:
            if ' ' in title:
                template = templates['xx']['phrase']
            else:
                template = templates['xx']['word']

        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                if absent_semantic_headers or wrong_order_error:
                    # print u'{} #{} {}'.format(title, lang, data['header2'])
                    # print '\n'.join(headers)
                    # if absent:
                    #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                    #                      for header in absent])
                    # if wrong_order_error:
                    #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                    # if unknown_headers:
                    #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                    #                      for header in unknown_headers])
                    # print
                    pass
                else:
                    m = re.search(u'==== *Значение *==== *\n(.*?)'
                                  u'==== *Синонимы *==== *\n(.*?)'
                                  u'==== *Антонимы *==== *\n(.*?)'
                                  u'==== *Гиперонимы *==== *\n(.*?)'
                                  u'==== *Гипонимы *==== *(.*?)'
                                  u'\n===[^=]',
                                  section_content, re.UNICODE | re.DOTALL)
                    if not m:
                        # print title, '|', lang, '=' * 40
                        # print section_content
                        # print '-' * 80
                        if lang == 'ru':
                            if ' ' in title:
                                tail_contains = "\n".join([
                                    template_contents[u'Этимология/phrase'],
                                    template_contents[u'Перевод'],
                                    template_contents[u'Библиография'],
                                ])
                            else:
                                tail_contains = "\n".join([
                                    template_contents[u'Родственные слова'],
                                    template_contents[u'Этимология/ru'],
                                    template_contents[u'Фразеологизмы'],
                                    template_contents[u'Перевод'],
                                    template_contents[u'Библиография'],
                                ])
                        else:
                            if ' ' in title:
                                tail_contains = "\n".join([
                                    template_contents[u'Этимология/phrase'],
                                    template_contents[u'Библиография'],
                                ])
                            else:
                                tail_contains = "\n".join([
                                    template_contents[u'Родственные слова'],
                                    template_contents[u'Этимология/xx'].format(lang),
                                    template_contents[u'Фразеологизмы'],
                                    template_contents[u'Библиография'],
                                ])

                        p = re.compile(u'(==== *Гипонимы *====\n[^[{]*)')

                        m2 = re.search(u'(==== *Гипонимы *====(.*))',
                                       section_content, flags=re.DOTALL | re.UNICODE)
                        if m2:
                            if '# [' in m2.group(1):
                                print title, '%' * 200
                                print m2.group(1)
                                continue
                        else:
                            print title, '!' * 100
                        new_section_content = p.sub('\\1' + '\n' + tail_contains + '\n',
                                                    section_content)
                        new_section_content = new_section_content.replace('\n\n\n', '\n\n')
                        new_content = new_content.replace(section_content,
                                                          new_section_content)

                        # print new_content
                        # print '-' * 120
                        # print '\n'.join(headers)
                        # print '-' * 120
                        # print section_content
                        # print '-' * 120
                        # print

                # if has_unknown_header or wrong_order:
                #     return content
                # if u'=== Морфологические и синтаксические свойства ===' in absent:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content
                # if absent and headers and absent[0] == headers[0]:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content

            # print '-' * 40

        return new_content
Beispiel #19
0
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith("-") or title.endswith("-"):
            return content
        if title.startswith("*"):
            return content

        new_content = content

        # print page.title, '=' * 100
        # print content
        # print '=' * 100
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [{"header2": "", "content": parts.pop(0)}]
        sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)]
        for data in sections:
            # print '-' * 80
            header2 = data["header2"]
            # print header2
            if not header2:
                continue
            p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE)
            m = p.match(header2)
            if not m:
                # print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == "ru":
            if " " in title:
                template = templates["ru"]["phrase"]
            else:
                template = templates["ru"]["word"]
        else:
            if " " in title:
                template = templates["xx"]["phrase"]
            else:
                template = templates["xx"]["word"]

        for data in sections:
            section_content = data["content"]
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                if absent_semantic_headers or wrong_order_error:
                    # print u'{} #{} {}'.format(title, lang, data['header2'])
                    # print '\n'.join(headers)
                    # if absent:
                    #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                    #                      for header in absent])
                    # if wrong_order_error:
                    #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                    # if unknown_headers:
                    #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                    #                      for header in unknown_headers])
                    # print
                    pass
                elif unknown_headers:
                    pass
                else:
                    m = re.search(
                        u"(==== *Значение *==== *\n(.*?)"
                        u"==== *Синонимы *==== *\n(.*?)"
                        u"==== *Антонимы *==== *\n(.*?)"
                        u"==== *Гиперонимы *==== *\n(.*?)"
                        u"==== *Гипонимы *==== *(.*?)"
                        u"\n)===[^=]",
                        section_content,
                        re.UNICODE | re.DOTALL,
                    )
                    if not m:
                        print "#" * 200
                        print u"title={}, lang={}".format(title, lang)
                        print "#" * 200
                        # continue
                        raise Exception(u"title={}, lang={}".format(title, lang))

                    # print title, '|', lang
                    semantic_section = m.group(1)
                    new_semantic_section = semantic_section
                    # print '=' * 40
                    # print semantic_section
                    # print '-' * 40

                    mining = m.group(2)
                    for line in mining.split("\n"):
                        line = line.strip()
                        if not line.strip():
                            continue
                        if re.match("^#", line):
                            continue

                        if re.match(u"^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$", line, re.UNICODE):
                            continue
                        if re.match(u"^\{\{(длина слова|илл\.?)\|[^}]+\}\}$", line, re.UNICODE):
                            print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line)
                            continue

                        if re.match(u"^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$", line, re.UNICODE):
                            continue

                        if re.match(u"^\[\[\]\]$", line, re.UNICODE):
                            continue

                        if re.match(u"^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE):
                            continue

                        if re.match(
                            u"^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE
                        ):
                            continue

                        # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                        #     format(title, lang, line)
                        # print line

                        if re.match(u"^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$", line, re.UNICODE):
                            continue
                        if re.match(
                            u"^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$",
                            line,
                            re.UNICODE,
                        ):
                            continue

                    bodies = [m.group(3), m.group(4), m.group(5), m.group(6)]
                    for body in bodies:
                        lines = body.split("\n")
                        fake = False
                        for line in lines:
                            if not line.strip():
                                continue
                            if re.match("^#", line):
                                continue

                            # print title, '|', lang, ' -> ', line

                            if re.match("^[*:]", line):
                                new_semantic_section = new_semantic_section.replace(
                                    u"\n{}\n".format(line), u"\n#{}\n".format(line[1:])
                                )
                            elif not line.startswith("<!--"):
                                new_semantic_section = new_semantic_section.replace(
                                    u"\n{}\n".format(line), u"\n# {}\n".format(line)
                                )

                            if re.match("^[*:]", line):
                                continue
                            if line in ["-", "?"]:
                                continue
                            if re.match("^\[\[[^]]+\]\]$", line):
                                continue
                            if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line):
                                continue
                            if re.match(u"^[a-zа-я !]+$", line, re.UNICODE | re.IGNORECASE):
                                pass
                            if re.match(u"^([a-zа-я !]+([,;] )?)+$", line, re.UNICODE | re.IGNORECASE):
                                pass
                            fake = True
                            # print title, '|', lang
                            # print '->', line
                            # print line
                            break

        return new_content
Beispiel #20
0
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith("-") or title.endswith("-"):
            return content
        if title.startswith("*"):
            return content

        new_content = content

        # print title, lang
        # if title == u'высокопарность':
        #     print 'ok'

        # print page.title, '=' * 100
        # print content
        # print '=' * 100
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [{"header2": "", "content": parts.pop(0)}]
        sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)]
        for data in sections:
            # print '-' * 80
            header2 = data["header2"]
            # print header2
            if not header2:
                continue
            p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE)
            m = p.match(header2)
            if not m:
                # print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == "ru":
            if " " in title:
                template = templates["ru"]["phrase"]
            else:
                template = templates["ru"]["word"]
        else:
            if " " in title:
                template = templates["xx"]["phrase"]
            else:
                template = templates["xx"]["word"]

        for data in sections:
            section_content = data["content"]
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                if absent_semantic_headers or wrong_order_error:
                    # print u'{} #{} {}'.format(title, lang, data['header2'])
                    # print '\n'.join(headers)
                    # if absent:
                    #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                    #                      for header in absent])
                    # if wrong_order_error:
                    #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                    # if unknown_headers:
                    #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                    #                      for header in unknown_headers])
                    # print
                    pass
                elif unknown_headers:
                    pass
                else:
                    m = re.search(
                        u"(==== *Значение *==== *\n(.*?)"
                        u"==== *Синонимы *==== *\n(.*?)"
                        u"==== *Антонимы *==== *\n(.*?)"
                        u"==== *Гиперонимы *==== *\n(.*?)"
                        u"==== *Гипонимы *==== *(.*?)"
                        u"\n)===[^=]",
                        section_content,
                        re.UNICODE | re.DOTALL,
                    )
                    if not m:
                        # print title, '|', lang, '=' * 120
                        # print '\n'.join(headers)
                        # print
                        # raise Exception('!!!')
                        continue

                    # print title, '|', lang
                    semantic_section = m.group(1)
                    new_semantic_section = semantic_section
                    # print '=' * 40
                    # print semantic_section
                    # print '-' * 40
                    bodies = [m.group(3), m.group(4), m.group(5), m.group(6)]
                    for body in bodies:
                        # print body
                        # body = body.strip()
                        lines = body.split("\n")
                        fake = False
                        for line in lines:
                            if not line.strip():
                                continue
                            if re.match("^#", line):
                                continue

                            # print title, '|', lang, u' -> "{}"'.format(line)

                            if re.match("^[*:]", line):
                                new_semantic_section = new_semantic_section.replace(
                                    u"\n{}\n".format(line), u"\n#{}\n".format(line[1:])
                                )
                            elif not line.startswith("<!--"):
                                new_semantic_section = new_semantic_section.replace(
                                    u"\n{}\n".format(line), u"\n# {}\n".format(line)
                                )

                            if re.match("^[*:]", line):
                                continue
                            if line in ["-", "?"]:
                                continue
                            if re.match("^\[\[[^]]+\]\]$", line):
                                continue
                            if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line):
                                continue
                            # if re.match(u'^[a-zа-я !]+$', line, re.UNICODE | re.IGNORECASE):
                            #     pass
                            # if re.match(u'^([a-zа-я !]+([,;] )?)+$', line, re.UNICODE | re.IGNORECASE):
                            #     pass
                            fake = True
                            print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line)
                            # print title, '|', lang
                            # print '->', line
                            # print line
                            break
                        # if not fake: # or True:
                        # if semantic_section != new_semantic_section:
                        #     print title, '|', lang
                        #     print '=' * 100
                        #     print semantic_section
                        #     print '-' * 100
                        #     print new_semantic_section
                        #     print '-' * 100
                        #     print
                        # new_content = \
                        #     new_content.replace(semantic_section,
                        #                         new_semantic_section)

                        # if fake and body:
                        #     print title, '|', lang
                        #     print '=' * 120
                        #     # print '"{}"'.format(tail)
                        #     print body
                        #     print '-' * 120
                        #     print

                # if has_unknown_header or wrong_order:
                #     return content
                # if u'=== Морфологические и синтаксические свойства ===' in absent:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content
                # if absent and headers and absent[0] == headers[0]:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content

            # print '-' * 40

        return new_content
Beispiel #21
0
    def lang_action(self, page, lang, content):
        """Print a report about odd lines in the semantic block of a section.

        ``content`` is split on level-2 headers. Each resulting part whose
        header list passes ``_needs_semantic_check`` is searched for the run
        of level-4 subsections "Значение", "Синонимы", "Антонимы",
        "Гиперонимы", "Гипонимы". For such a run the method prints:

        * every meaning line with an unexpected shape (see
          ``_count_meaning_lines``);
        * one line for each of the four "-nym" lists that has more entries
          than there are meanings.

        The method never rewrites anything; it only prints.

        :param page: object whose ``title`` attribute is the article title.
        :param lang: language code of the section; ``'ru'`` selects
            ``templates['ru']``, any other value ``templates['xx']``.
        :param content: wiki text of the language section.
        :return: ``content``, unchanged.
        """
        title = page.title

        # Titles starting or ending with "-", or starting with "*", are
        # never inspected.
        if title.startswith(('-', '*')) or title.endswith('-'):
            return content

        # re.split with a capturing group yields
        # [preamble, header, body, header, body, ...].
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        sections = [{'header2': '', 'content': parts.pop(0)}]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]

        # Give up on the whole section as soon as one level-2 header is not
        # a {{з|...}} / {{заголовок|...}} template.
        header2_re = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$',
                                re.UNICODE)
        for data in sections:
            header2 = data['header2']
            if header2 and not header2_re.match(header2):
                return content

        template = templates['ru' if lang == 'ru' else 'xx'][
            'phrase' if ' ' in title else 'word']

        nym_names = [u'синонимов', u'антонимов', u'гиперонимов', u'гипонимов']
        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))
            if not self._needs_semantic_check(headers, template):
                continue

            m = re.search(u'(==== *Значение *==== *\n(.*?)'
                          u'==== *Синонимы *==== *\n(.*?)'
                          u'==== *Антонимы *==== *\n(.*?)'
                          u'==== *Гиперонимы *==== *\n(.*?)'
                          u'==== *Гипонимы *==== *(.*?)'
                          u'\n)===[^=]',
                          section_content, re.UNICODE | re.DOTALL)
            if not m:
                # All headers are in place, yet the block could not be
                # parsed: make that stand out in the output.
                print('#' * 200)
                print(u'title={}, lang={}'.format(title, lang))
                print('#' * 200)
                continue

            mining_len = self._count_meaning_lines(title, lang, m.group(2))

            # Groups 3-6 are the bodies of the four "-nym" subsections, in
            # the same order as nym_names.
            for i, body in enumerate(m.group(3, 4, 5, 6)):
                onim_len = sum(
                    1 for line in body.strip().split('\n')
                    if line.strip() not in ('', '#')
                )
                if onim_len > mining_len:
                    print(u"# [[{}]] (секция \"{}\"): '''{}''' значений, '''{}''' {}".
                          format(title, lang, mining_len, onim_len,
                                 nym_names[i]))

        return content

    def _needs_semantic_check(self, headers, template):
        """Tell whether a part's semantic block should be inspected.

        Returns ``True`` only when ``headers`` is non-empty, differs from
        ``template``, contains every header listed in ``semantic_headers``
        that the template has, contains no header unknown to the template,
        and lists its headers in template order.

        :param headers: converted headers found in the part.
        :param template: expected header sequence.
        :return: ``True`` if the part qualifies, ``False`` otherwise.
        """
        # NOTE(review): a part that matches the template exactly is skipped
        # too; this mirrors the previous behaviour - confirm it is intended.
        if not headers or headers == template:
            return False

        absent = [header for header in template if header not in headers]
        if any(header in absent for header in semantic_headers):
            return False
        if any(header not in template for header in headers):
            return False

        # Walk both sequences in parallel; template entries that are absent
        # from the part may be skipped, anything else must line up.
        t = s = 0
        while t < len(template) and s < len(headers):
            if template[t] == headers[s]:
                t += 1
                s += 1
            elif template[t] in absent:
                t += 1
            else:
                return False  # headers[s] is out of order
        return True

    def _count_meaning_lines(self, title, lang, meanings):
        """Count the entries of a "Значение" body and print the odd ones.

        Blank lines and a bare ``#`` are ignored. ``{{прото|...}}``,
        ``{{длина слова|...}}`` and ``{{илл|...}}`` lines are ignored as
        well. A line starting with ``#`` counts as one entry and is not
        checked further. Any other line counts as one entry and is printed
        unless it is an empty link ``[[]]``, a ``{{Нужен перевод}}`` template
        (optionally preceded by a link), or one or more links optionally
        followed by a ``{{пример|...}}`` template.

        :param title: article title, used in the printed message.
        :param lang: section language, used in the printed message.
        :param meanings: raw text of the "Значение" subsection.
        :return: number of counted entries.
        """
        ignored = [
            re.compile(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', re.UNICODE),
            re.compile(u'^\{\{(длина слова|илл)\|[^}]+\}\}$', re.UNICODE),
        ]
        accepted = [
            re.compile(u'^\[\[\]\]$', re.UNICODE),
            re.compile(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$',
                       re.UNICODE),
            re.compile(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$',
                       re.UNICODE),
            re.compile(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$',
                       re.UNICODE),
        ]

        count = 0
        for line in meanings.split('\n'):
            line = line.strip()
            if not line or line == '#':
                continue
            if line.startswith('#'):
                count += 1
                continue
            if any(pattern.match(line) for pattern in ignored):
                continue

            count += 1
            if any(pattern.match(line) for pattern in accepted):
                continue

            print(u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.
                  format(title, lang, line))
        return count
Beispiel #22
0
    def run(self):
        """Apply the data page's entries to their articles and report on it.

        Steps, in order:

        1. Require ``self.wikt_data_page`` and ``self.description``.
        2. Exit quietly unless the page "Участник:Cinemantique/bot" contains
           the switch line ``* [[<data page>|<name>]] = on``.
        3. Read the data page, split it on blank lines and let
           ``self.get_item`` parse every item into ``(title, values)``;
           items whose ``values`` is ``None`` are dropped.
        4. For every title (sorted), take the ``ru`` section of the article,
           skip it when more than one of its level-2 parts contains a
           ``===`` header (reported as homonyms), otherwise replace it with
           the result of ``self.get_new_section_content`` via
           ``self.make_change``.
        5. Save the report and call ``save_wiki_page`` on the data page with
           an empty string (presumably blanking it - see the edit summary).

        :raises NotImplementedError: if ``wikt_data_page`` or ``description``
            is not set.
        :raises ValueError: if ``wikt_data_page`` does not contain
            ``:Cinemantique/<name>``.
        """
        if not self.wikt_data_page or not self.description:
            raise NotImplementedError()
        m = re.search(u':Cinemantique/(.+)', self.wikt_data_page)
        if m is None:
            # Fail with a readable message instead of an AttributeError on
            # ``None.group``.
            raise ValueError(
                u'wikt_data_page must contain ":Cinemantique/<name>": %r'
                % (self.wikt_data_page,))
        name = m.group(1)
        on_value = u'* [[%s|%s]] = on' % (self.wikt_data_page, name)
        if on_value not in get_wiki_page_content(u'Участник:Cinemantique/bot'):
            print(u'bot offline -> exit')
            return
        raw_data = get_wiki_page_content(self.wikt_data_page).strip()
        if not raw_data:
            return
        self.add_report(u'Бот запущен', 'silver')
        self.save_report('started')

        data = {}
        for item in raw_data.split('\n\n'):
            title, values = self.get_item(item, data)
            if values is None:
                continue
            data[title] = values

        ordered = sorted(data.items(), key=lambda x: x[0])
        for i, (title, values) in enumerate(ordered, 1):
            print(i)  # progress counter
            content = self.get_page_content(title)
            if content is None:
                continue

            # Split the article into language sections:
            # [preamble, lang, body, lang, body, ...]; the preamble is dropped.
            parts = re.split('^= *\{\{-([-\w]+)-(?:\|[^}]*)?\}\} *=$', content,
                             flags=re.MULTILINE)
            parts.pop(0)
            sections = [
                {'lang': part[0], 'content': part[1]}
                for part in chunks(parts, 2)
            ]

            # '' means "no ru section found", None means "skipped: homonyms".
            section_content = ''
            for section in sections:
                if section['lang'] != 'ru':
                    continue
                section_content = section['content']
                level2_headers = re.findall('\n== *[^=].*[^=] *==\n',
                                            section_content)
                if len(level2_headers) <= 1:
                    continue
                sub_parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$",
                                     section_content, flags=re.MULTILINE)
                subsections = [
                    {'header2': '', 'content': sub_parts.pop(0)}
                ]
                subsections += [
                    {'header2': part[0], 'content': part[1]}
                    for part in chunks(sub_parts, 2)
                ]
                count_third = sum(
                    1 for subsection in subsections
                    if '===' in subsection['content']
                )
                if count_third > 1:
                    self.add_report(u'В статье "[[%s]]" содержатся омонимы, пропускаем.' % title, 'maroon')
                    section_content = None
                    break
            if not section_content:
                if section_content == '':
                    self.add_report(u'В статье "[[%s]]" не найдены русские заголовки, пропускаем.' % title, 'maroon')
                continue

            new_section_content = \
                self.get_new_section_content(title, values, section_content)
            if new_section_content is None:
                continue
            new_content = content.replace(section_content, new_section_content)
            self.make_change(title, content, new_content)
            self.add_report(u'Статья "[[%s]]" успешно обновлена.' % title, 'green')

        self.add_report(u'Бот завершён', 'silver')
        self.save_report('finished')
        save_wiki_page(self.wikt_data_page, '', u'Удаление обработанного содержимого - [[%s/report|Отчёт]]' % self.wikt_data_page)