コード例 #1
0
    def lang_action(self, page, lang, content):
        """Insert a missing morphology header into the sections of an entry.

        ``content`` is split on level-2 headers (``== ... ==``).  If any of
        those headers is not of the form ``== {{з...}} ==`` or
        ``== {{заголовок...}} ==`` the text is returned untouched.

        Otherwise every section that has headers, belongs to a title
        without spaces and lacks
        ``=== Морфологические и синтаксические свойства ===`` gets that
        header inserted in front of a leading ``<b>`` or ``{{падежи ``
        (sections starting with anything else are left alone).

        :param page: object exposing a ``title`` attribute.
        :param lang: language code of the entry; not used by this method.
        :param content: text of the language entry.
        :return: the (possibly modified) text.  Titles starting with ``-``
            or ``*``, or ending with ``-``, are returned unchanged.
        """
        page_title = page.title
        if page_title.startswith(('-', '*')) or page_title.endswith('-'):
            return content

        # The capturing group makes re.split keep the headers: the first
        # item is the text before any header, the rest is consumed in
        # groups of two by ``chunks`` (presumably header/body pairs --
        # ``chunks`` is defined outside this block).
        pieces = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                          flags=re.MULTILINE)
        sections = [{'header2': '', 'content': pieces.pop(0)}]
        for pair in chunks(pieces, 2):
            sections.append({'header2': pair[0], 'content': pair[1]})

        # Bail out as soon as one level-2 header has an unexpected shape.
        header2_re = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        for section in sections:
            header2 = section['header2']
            if header2 and header2_re.match(header2) is None:
                return content

        # (prefix the stripped section must start with, pattern, replacement)
        fixes = (
            (u'<b>',
             u'^\s*<b>',
             u'\n=== Морфологические и синтаксические свойства ===\n<b>'),
            (u'{{падежи ',
             u'^\s*\{\{падежи ',
             u'\n=== Морфологические и синтаксические свойства ===\n{{падежи '),
        )

        result = content
        for section in sections:
            body = section['content']

            headers = convert_headers(self.get_headers(body))
            if not headers:
                continue
            if ' ' in page_title:
                continue
            if u'=== Морфологические и синтаксические свойства ===' in headers:
                continue

            stripped = body.strip()
            for prefix, pattern, replacement in fixes:
                if stripped.startswith(prefix):
                    result = result.replace(body,
                                            re.sub(pattern, replacement, body))
                    break
        return result
コード例 #2
0
ファイル: 9_new_template_bot1.py プロジェクト: 2vitalik/words
    def section_action(self, page, lang, section_content):
        """Print the semantic block of a section when it has a simple shape.

        The section must contain ``=== Семантические свойства ===`` followed
        by the sub-headers Значение, Синонимы, Антонимы, Гиперонимы and
        Гипонимы, in that order, and then another level-3 header.  The block
        is printed between separator lines only when:

        * the text under "Значение" is one ``# {{label.|lang}} text
          {{пример|...}}`` line followed by a bare ``#``, and
        * each of the four remaining blocks is a ``#`` line holding only
          ``[[...]]`` links (possibly none) followed by a bare ``#``.

        :param page: object exposing a ``title`` attribute.
        :param lang: language code of the section; not used by this method.
        :param section_content: text of the section.
        :return: always ``None``.  Titles starting with ``-`` or ``*``, or
            ending with ``-``, and sections without headers are skipped.
        """
        title = page.title
        if title.startswith(('-', '*')) or title.endswith('-'):
            return

        if not convert_headers(self.get_headers(section_content)):
            return

        semantic = re.search(u'\n(=== *Семантические свойства *===\n+'
                             u'==== *Значение *====\n(?P<mining>.*?)'
                             u'==== *Синонимы *====\n(?P<syn>.*?)'
                             u'==== *Антонимы *====\n(?P<ant>.*?)'
                             u'==== *Гиперонимы *====\n(?P<gyper>.*?)'
                             u'==== *Гипонимы *====(?P<gyp>.*?))'
                             u'\n===[^=]',
                             section_content, re.UNICODE | re.DOTALL)
        if not semantic:
            return

        mining = semantic.group('mining').strip()
        related_blocks = [semantic.group(name).strip()
                          for name in ('syn', 'ant', 'gyper', 'gyp')]

        # The meaning must be a single labelled definition with an example.
        simple_meaning = re.search(u"^(?P<first_line># *\{\{(?P<label>(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)"
                                   u"\.)(\|(?P<lang>[a-z]{2,3}))\}\} *"
                                   u"(?P<mining>[^{}'<>\n]*) *(?P<example>\{\{пример\|[^}]*\}\}))\n#$", mining)
        if not simple_meaning:
            return

        # Every related-words block may hold nothing but wiki links.
        links_only = "^# *(?P<value>(\[\[[^]]+\]\]([,;] )?)*)\n#$"
        for block in related_blocks:
            if not re.search(links_only, block):
                return

        print('=' * 40)
        print(semantic.group(1).strip())
        print('-' * 40)
コード例 #3
0
ファイル: 7_morphology.py プロジェクト: 2vitalik/words
    def section_action(self, page, lang, section_content):
        """Report a section that lacks the morphology header.

        Prints one wiki list item naming the title and language when the
        title contains no space and the section's headers do not include
        ``=== Морфологические и синтаксические свойства ===``.

        :param page: object exposing a ``title`` attribute.
        :param lang: language code, used only in the printed line.
        :param section_content: text of the section.
        :return: always ``None``.  Titles starting with ``-`` or ``*``, or
            ending with ``-``, and sections without headers are skipped.
        """
        title = page.title
        if title.startswith(('-', '*')) or title.endswith('-'):
            return

        headers = convert_headers(self.get_headers(section_content))
        if not headers:
            return

        is_single_word = ' ' not in title
        has_morphology = \
            u'=== Морфологические и синтаксические свойства ===' in headers
        if is_single_word and not has_morphology:
            print(u'# [[{}]] (секция "{}")'.format(title, lang))
コード例 #4
0
ファイル: A_new_template_bot2.py プロジェクト: 2vitalik/words
    def section_action(self, page, lang, section_content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return
        if title.startswith('*'):
            return

        headers = convert_headers(self.get_headers(section_content))
        if not headers:
            return

        p = re.compile(u'\n(=== *Семантические свойства *===\n+'
                       u'==== *Значение *====\n\s*(?P<first_line># *\{\{(?P<label>(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)'
                       u'\.)(\|(?P<lang>[a-z]{2,3}))\}\} *'
                       u"(?P<mining>[^{}'<>\n]*) *(?P<example>\{\{пример\|[^}]*\}\}))\n#\s*"
                       u'==== *Синонимы *====\n\s*# *(?P<syn>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*'
                       u'==== *Антонимы *====\n\s*# *(?P<ant>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*'
                       u'==== *Гиперонимы *====\n\s*# *(?P<hyper>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*'
                       u'==== *Гипонимы *====\s*# *(?P<hyp>(\[\[[^]]+\]\]([,;] )?)*)\n#\s*)'
                       u'\n===(?P<tail>[^=])',
                       re.UNICODE | re.DOTALL)
        m = p.search(section_content)
        if m:
            old = m.group(1)
            new = p.sub(u"""
=== Семантические свойства ===
# {{значение
  |определение = \g<mining>
  |пометы      = [\g<label>]
  |примеры     = \g<example>
  |синонимы    = \g<syn>
  |конверсивы  =
  |антонимы    = \g<ant>
  |гиперонимы  = \g<hyper>
  |гипонимы    = \g<hyp>
  |согипонимы  =
  |холонимы    =
  |меронимы    =
  |управление  =
  |категории   =
  |якорь       =
  |язык        = \g<lang>
}}')
""", u"\n{}\n=== ".format(old))
            print title
コード例 #5
0
ファイル: absent.py プロジェクト: 2vitalik/words
    def section_action(self, page, lang, section_content):
        """Record a section that lacks the morphology header, by category.

        After delegating to the parent implementation, a section is
        recorded when its title has no space and its headers do not include
        ``=== Морфологические и синтаксические свойства ===``.  It is added
        to ``self.no_morphology[u'Все результаты']`` and to exactly one
        further bucket chosen from how the section text begins (the first
        matching rule wins; ``u'Остальное'`` is the fallback).

        :param page: object exposing a ``title`` attribute.
        :param lang: language code of the section.
        :param section_content: text of the section.
        :return: always ``None``.  Titles starting with ``-`` or ``*``, or
            ending with ``-``, the languages listed below and sections
            without headers are skipped.
        """
        super(NoMorphology, self).section_action(page, lang, section_content)
        title = page.title
        if title.startswith(('-', '*')) or title.endswith('-'):
            return

        if lang in ('INT', 'mul', 'Zmth', 'Hani', 'hani', 'hanzi'):
            return

        headers = convert_headers(self.get_headers(section_content))
        if not headers:
            return

        if ' ' in title \
                or u'=== Морфологические и синтаксические свойства ===' in headers:
            return

        append_dict_list(self.no_morphology[u'Все результаты'], title, lang)

        stripped = section_content.strip()
        if stripped.startswith((u'Существительное', u'Прилагательное',
                                u'Глагол', u'Наречие')):
            bucket = u'Часть речи'
        # The pattern must be a unicode literal: as a Python 2 byte string
        # its Cyrillic alternatives could never match unicode text.
        elif re.search(u'^\{\{(сущ|прил|гл|adv|падежи|нар|interj) ', stripped, re.UNICODE):
            bucket = u'Шаблон часть речи'
        elif stripped.startswith(u'{{Форма'):
            bucket = u'Шаблон форма'
        elif stripped.startswith(u'{{длина слова'):
            bucket = u'Шаблон длина слова'
        elif stripped.startswith(u'<b>'):
            bucket = u'Жирность1'
        elif stripped.startswith(u"'''"):
            bucket = u'Жирность2'
        elif u'Тип и синтаксические свойства сочетания' in section_content:
            bucket = u'Словосочетания'
        elif stripped.startswith(u'==='):
            bucket = u'Пусто'
        else:
            bucket = u'Остальное'
        append_dict_list(self.no_morphology[bucket], title, lang)

        if settings.ALLOW_CYR_PRINT:
            print(u'absent # [[{}]] (секция "{}")'.format(title, lang))
コード例 #6
0
ファイル: 8_mining.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        """Print the ``#`` lines found under ``==== Значение ====``.

        ``content`` is split on level-2 headers (``== ... ==``).  If any of
        those headers is not of the form ``== {{з...}} ==`` or
        ``== {{заголовок...}} ==`` nothing is printed.  Otherwise, for each
        section that has headers, the text between ``==== Значение ====``
        and the next line starting with ``===`` is scanned and every line
        that starts with ``#`` (after stripping) is printed.

        The text itself is never modified: the method always returns
        ``content`` as received.

        :param page: object exposing a ``title`` attribute.
        :param lang: language code of the entry; not used by this method.
        :param content: text of the language entry.
        :return: ``content``, unchanged.
        """
        title = page.title
        if title.startswith(('-', '*')) or title.endswith('-'):
            return content

        # The capturing group makes re.split keep the headers: the first
        # item is the text before any header, the rest is consumed in
        # groups of two by ``chunks`` (presumably header/body pairs --
        # ``chunks`` is defined outside this block).
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        sections = [{'header2': '', 'content': parts.pop(0)}]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]

        # Bail out as soon as one level-2 header has an unexpected shape.
        header2_pattern = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        for data in sections:
            header2 = data['header2']
            if header2 and not header2_pattern.match(header2):
                return content

        meaning_pattern = re.compile(u'(==== *Значение *==== *(.*?)'
                                     u'\n)===',
                                     re.UNICODE | re.DOTALL)
        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))
            if not headers:
                continue

            m = meaning_pattern.search(section_content)
            if not m:
                continue

            for line in m.group(2).split('\n'):
                line = line.strip()
                if line.startswith('#'):
                    print(line)

        return content
コード例 #7
0
ファイル: 4_semantic_absent.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        new_content = content

        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        for data in sections:
            header2 = data['header2']
            if not header2:
                continue
            p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
            m = p.match(header2)
            if not m:
                print header2, '==', '#' * 120
                return content

        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == 'ru':
            if ' ' in title:
                template = templates['ru']['phrase']
            else:
                template = templates['ru']['word']
        else:
            if ' ' in title:
                template = templates['xx']['phrase']
            else:
                template = templates['xx']['word']

        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                c = 0
                for header in headers:
                    if header == u'==== Значение ====':
                        c += 1
                if c > 1:
                    print title, '$' * 200
                    return content
                    # raise Exception('c > 1')

                if u'=== Семантические свойства ===' not in headers \
                        and u'==== Значение ====' in headers:
                    # print title, '/', lang
                    # print '\n'.join(headers)

                    page_content = page.content
                    lst = re.findall(u'==== Значение ====', page_content)
                    # if len(lst) == 1:
                    #     print title

                    if len(lst) > 1:
                        new_section_content = section_content.replace(
                            u'\n==== Значение ====\n',
                            u'\n=== Семантические свойства ===\n\n==== Значение ====\n',
                        )
                        new_content = new_content.replace(section_content,
                                                          new_section_content)
        return new_content
コード例 #8
0
ファイル: 6_check_counts.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        new_content = content

        # print page.title, '=' * 100
        # print content
        # print '=' * 100
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        for data in sections:
            # print '-' * 80
            header2 = data['header2']
            # print header2
            if not header2:
                continue
            p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
            m = p.match(header2)
            if not m:
                # print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == 'ru':
            if ' ' in title:
                template = templates['ru']['phrase']
            else:
                template = templates['ru']['word']
        else:
            if ' ' in title:
                template = templates['xx']['phrase']
            else:
                template = templates['xx']['word']

        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                if absent_semantic_headers or wrong_order_error:
                    # print u'{} #{} {}'.format(title, lang, data['header2'])
                    # print '\n'.join(headers)
                    # if absent:
                    #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                    #                      for header in absent])
                    # if wrong_order_error:
                    #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                    # if unknown_headers:
                    #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                    #                      for header in unknown_headers])
                    # print
                    pass
                elif unknown_headers:
                    pass
                else:
                    m = re.search(u'(==== *Значение *==== *\n(.*?)'
                                  u'==== *Синонимы *==== *\n(.*?)'
                                  u'==== *Антонимы *==== *\n(.*?)'
                                  u'==== *Гиперонимы *==== *\n(.*?)'
                                  u'==== *Гипонимы *==== *(.*?)'
                                  u'\n)===[^=]',
                                  section_content, re.UNICODE | re.DOTALL)
                    if not m:
                        print '#' * 200
                        print u'title={}, lang={}'.format(title, lang)
                        print '#' * 200
                        continue
                        # raise Exception(u'title={}, lang={}'.format(title, lang))

                    # print title, '|', lang
                    semantic_section = m.group(1)
                    new_semantic_section = semantic_section
                    # print '=' * 40
                    # print semantic_section
                    # print '-' * 40

                    mining = m.group(2)#.strip().split('\n')
                    # mining = filter(lambda x: x not in ['#', '# '], mining)
                    # mining_len = len(mining)
                    mining_len = 0
                    for line in mining.split('\n'):
                        line = line.strip()
                        if not line.strip():
                            continue
                        if line.strip() in ['#']:
                            continue
                        if re.match('^#', line):
                            mining_len += 1
                            continue

                        if re.match(u'^\{\{прото\|(\{\{)?[^}]+(\}\})?\}\}$', line,
                                    re.UNICODE):
                            continue
                        if re.match(u'^\{\{(длина слова|илл)\|[^}]+\}\}$', line,
                                    re.UNICODE):
                            # print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            #     format(title, lang, line)
                            continue

                        mining_len += 1
                        if re.match(u'^\[\[\]\]$', line,
                                    re.UNICODE):
                            continue

                        if re.match(u'^(\[\[[^]]*\]\] )?\{\{Нужен перевод(\|\w+)?\}\}$', line,
                                    re.UNICODE):
                            continue

                        ok = False
                        if re.match(u'^\[\[[^]]+\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            ok = True

                        if re.match(u'^(\[\[[^]]+\]\]([,;] )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            ok = True
                        if ok:
                            # new_semantic_section = \
                            #     new_semantic_section.replace(
                            #         u"\n{}\n".format(line),
                            #         u'\n# {}\n'.format(line)
                            #     )
                            continue

                        has_strange = True

                        print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.\
                            format(title, lang, line)
                        # print line

                        if re.match(u'^\[\[[^]]*\]\](\s*\{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            continue
                        if re.match(u'^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;]? )?)+( \{\{пример\|.*(\|перевод=.*)?\}\})?$', line,
                                    re.UNICODE):
                            continue

                    onim_lens = [0, 0, 0, 0]
                    bodies = [m.group(3), m.group(4), m.group(5), m.group(6), ]
                    for i, body in enumerate(bodies):
                        # onim_lens[i] = len(lines)
                        # print body
                        body = body.strip()
                        lines = body.split('\n')
                        fake = False
                        for line in lines:
                            if not line.strip():
                                continue
                            if line.strip() in ['#']:
                                continue

                            onim_lens[i] += 1

                            if re.match('^#', line):
                                continue

                            # print title, '|', lang, u' -> "{}"'.format(line)

                            if re.match('^[*:]', line):
                                continue
                            if line in ['-', '?', ]:
                                continue
                            if re.match('^\[\[[^]]+\]\]$', line):
                                continue
                            if re.match('^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$', line):
                                continue
                            if re.match(u'^[a-zа-я !]+$', line, re.UNICODE | re.IGNORECASE):
                                pass
                            if re.match(u'^([a-zа-я !]+([,;] )?)+$', line, re.UNICODE | re.IGNORECASE):
                                pass
                            fake = True
                            # print title, '|', lang
                            # print '->', line
                            # print line
                            # break

                    for i, onim_len in enumerate(onim_lens):
                        if onim_len > mining_len:
                            onim_type = [u'синонимов', u'антонимов', u'гиперонимов', u'гипонимов', ]
                            print u"# [[{}]] (секция \"{}\"): '''{}''' значений, '''{}''' {}".\
                                format(title, lang, mining_len, onim_len, onim_type[i])

        return new_content
コード例 #9
0
ファイル: 5_mining.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        """Turn italic usage labels in "Значение" blocks into label templates.

        The text is split on level-2 headers (``== ... ==``).  Inside the
        ``==== Значение ====`` block of every section for which headers are
        found, each line that contains a known label written as ``''label.''``
        is rewritten to ``{{label.|<lang>}}``; when no ``''`` is left on the
        line afterwards, the line is additionally prefixed with ``# ``.
        Every rewritten line is reported on stdout (old line, new line,
        blank line).

        :param page: object exposing the page name as ``page.title``.
        :param lang: language code inserted as the template argument.
        :param content: wiki text to process.
        :return: the rewritten text, or ``content`` unchanged when the page
            is skipped: its title starts or ends with ``-``, starts with
            ``*``, is the single explicitly excluded title, or the text has a
            level-2 header that is not a ``{{з...}}``/``{{заголовок...}}``
            template.
        """
        title = page.title

        # Title-based exclusions; none of them depends on the page text.
        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content
        if title in [u'օժանդակ բայ']:
            return content

        # re.split with one capturing group yields
        # [preamble, header, body, header, body, ...].
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        sections = [{'header2': '', 'content': parts.pop(0)}]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]

        # Refuse to touch pages with an unexpected level-2 header.
        header2_re = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
        for data in sections:
            header2 = data['header2']
            if header2 and not header2_re.match(header2):
                return content

        # Group 1 is the whole "Значение" block (up to the next line that
        # starts with '='), group 2 is its body.
        meaning_re = re.compile(u'(==== *Значение *==== *(.*?)'
                                u'\n)=',
                                re.UNICODE | re.DOTALL)
        # Known usage labels written in italics, e.g. ''разг.''
        label_re = re.compile(u"''(авиац|австрал|автомоб|автомоб. жарг|агрон|алхим|альп|амер|анат|антроп|артилл|археол|архит|астрол|астрон|безл|библейск|биол|биохим|бирж|болг|ботан|браз|бранн|брит|бухг|вет|вин|военн|военн. жарг|вульг|высок|гастрон|генет|геогр|геод|геол|геометр|геофиз|геральд|гидрол|горн|грам|груб|детск|диал|дигорск|дипл|дисфм|доминик|дор|ед. ч|ест|ж.-д|жарг|живоп|зоол|игр|интернет|информ|ион|ирл|ирон|искаж|искусств|исп|истор|исч|ихтиол|йогич|канадск|канц|карт|картеж. жарг|керам|кинол|книжн|комп|комп. жарг|косм|космет|крим|крим. жарг|кубан|кулин|культурол|лес|лингв|лог|матем|машин|мед|металл|метеорол|метон|мех|микол|микробиол|милиц. жарг|минер|мифол|мол|морск|муз|муз. жарг|нар.-поэт|нар.-разг|научн|неисч|нем|неодобр|неодуш|неол|неофиц|неперех|неправ|нескл|нефтегаз|нидерл|нов.-зел|нумизм|образ|обсц|одобр|одуш|океан|оккульт|опт|орнитол|оскорб|офиц|охотн|палеонт|паразит|парикмах|перен|перех|плотн|полигр|полит|полит. жарг|полиц. жарг|порт|портн|поэт|презр|пренебр|прогр|прост|проф|психиатр|психол|публиц|разг|редк|рекл|религ|ритор|рыбол|с.-х|сад|сексол|сниж|собир|совет|социол|спелеол|спец|спорт|старин|стат|статив|стекловарн|стих|столярн|строит|студ. жарг|тайв|театр|текст|техн|техн. жарг|тж|типогр|тлв|торг|торж|трад.-поэт|тракторн|трансп|уважит|укр|управл|усеч|устар|фам|фант|фарм|физ|физиол|филат|филол|филос|фин|фолькл|фотогр|хим|хоз|хореогр|худ.пр|худож|церк|церк.-слав|цирк|цитол|шахм|швейн|школьн|шотл|шутл|эвф|экол|экон|эл.-техн|эл.-энерг|энтомол|эол|этногр|этнолог|ювел|юр)\.''")
        # Expands to ``{{\1.|<lang>}}``: the matched label becomes a template
        # call that takes the language code as its argument.
        label_repl = u'{{{{\\1.|{}}}}}'.format(lang)

        new_content = content
        for data in sections:
            section_content = data['content']
            # Sections for which no headers are found are left alone.
            if not convert_headers(self.get_headers(section_content)):
                continue

            m = meaning_re.search(section_content)
            if not m:
                continue

            semantic_section = m.group(1)
            new_semantic_section = semantic_section

            for line in m.group(2).split('\n'):
                line = line.strip()
                # Only unnumbered, non-empty lines with italic markup can
                # carry a label that still needs converting.
                if not line or line.startswith('#') or "''" not in line:
                    continue

                new_line = label_re.sub(label_repl, line)
                if new_line == line:
                    continue
                # Once no italic markup remains, the line is also turned into
                # a numbered item.
                if "''" not in new_line:
                    new_line = u'# ' + new_line

                # Report the rewrite: old line, new line, blank separator.
                print(line)
                print(new_line)
                print('')

                # str.replace substitutes every occurrence of the stripped
                # line text inside the block.
                new_semantic_section = \
                    new_semantic_section.replace(line, new_line)

            new_content = new_content.replace(semantic_section,
                                              new_semantic_section)

        return new_content
コード例 #10
0
ファイル: 3_check_mining.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        """Check the semantic subsections of every level-2 section of a page.

        The text is split on level-2 headers (``== ... ==``).  A section is
        inspected only when its header list differs from the expected
        template for this language/title kind and yet has no missing semantic
        header, no out-of-order header and no unknown header.  For such a
        section the "Значение ... Гипонимы" run is located and every
        ``{{длина слова|...}}`` / ``{{илл|...}}`` line found inside
        "Значение" is reported on stdout.

        :param page: object exposing the page name as ``page.title``.
        :param lang: language code; ``"ru"`` selects the Russian header
            template, anything else the generic one.
        :param content: wiki text to check.
        :return: ``content``, always unchanged (see the NOTE(review) below).
        :raises Exception: when an inspected section has no recognisable
            "Значение ... Гипонимы" run.
        """
        title = page.title

        if title.startswith("-") or title.endswith("-"):
            return content
        if title.startswith("*"):
            return content

        # re.split with one capturing group yields
        # [preamble, header, body, header, body, ...].
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
        sections = [{"header2": "", "content": parts.pop(0)}]
        sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)]

        # Pages with an unexpected level-2 header are not checked at all.
        header2_re = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE)
        for data in sections:
            header2 = data["header2"]
            if header2 and not header2_re.match(header2):
                return content

        # Expected header sequence: Russian vs. other languages, and
        # multi-word titles (containing a space) vs. single words.
        template = templates["ru" if lang == "ru" else "xx"]["phrase" if " " in title else "word"]

        # Group 1: the whole semantic run; groups 2-6: the bodies of
        # Значение, Синонимы, Антонимы, Гиперонимы and Гипонимы.
        semantic_re = re.compile(
            u"(==== *Значение *==== *\n(.*?)"
            u"==== *Синонимы *==== *\n(.*?)"
            u"==== *Антонимы *==== *\n(.*?)"
            u"==== *Гиперонимы *==== *\n(.*?)"
            u"==== *Гипонимы *==== *(.*?)"
            u"\n)===[^=]",
            re.UNICODE | re.DOTALL
        )
        stray_template_re = re.compile(u"^\{\{(длина слова|илл\.?)\|[^}]+\}\}$", re.UNICODE)
        # One or more wiki links, each optionally followed by a bracketed
        # link, separated by ", " or "; ".  A single bare link also matches.
        link_list_re = re.compile(r"^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$")

        for data in sections:
            section_content = data["content"]
            headers = convert_headers(self.get_headers(section_content))
            # Nothing to check when there are no headers or they already
            # match the template exactly.
            if not headers or headers == template:
                continue

            absent = [header for header in template if header not in headers]

            # Walk template and actual headers in parallel, skipping template
            # entries that are absent; any other mismatch means the headers
            # are out of order.
            out_of_order = False
            t = 0
            s = 0
            while t < len(template) and s < len(headers):
                if template[t] == headers[s]:
                    t += 1
                    s += 1
                elif template[t] in absent:
                    t += 1
                else:
                    out_of_order = True
                    break

            has_unknown = any(header not in template for header in headers)
            lacks_semantic = any(h in absent for h in semantic_headers)
            if lacks_semantic or out_of_order or has_unknown:
                continue

            m = semantic_re.search(section_content)
            if not m:
                print("#" * 200)
                print(u"title={}, lang={}".format(title, lang))
                print("#" * 200)
                raise Exception(u"title={}, lang={}".format(title, lang))

            semantic_section = m.group(1)
            new_semantic_section = semantic_section

            # Report templates that sit inside the "Значение" body.
            for line in m.group(2).split("\n"):
                line = line.strip()
                if stray_template_re.match(line):
                    print(u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line))

            # Build a normalised copy of the run in which entries of the four
            # "-онимы" bodies are numbered items: a leading '*' or ':' is
            # replaced by '#', other lines (except HTML comments) get a '# '
            # prefix.
            # NOTE(review): new_semantic_section is never written back into
            # the returned text, so this method currently only reports and
            # validates.  Confirm whether a
            # ``content.replace(semantic_section, new_semantic_section)`` step
            # is intended before enabling it.
            for body in (m.group(3), m.group(4), m.group(5), m.group(6)):
                for line in body.split("\n"):
                    if not line.strip() or line.startswith("#"):
                        continue

                    bulleted = line.startswith(("*", ":"))
                    if bulleted:
                        new_semantic_section = new_semantic_section.replace(
                            u"\n{}\n".format(line), u"\n#{}\n".format(line[1:])
                        )
                    elif not line.startswith("<!--"):
                        new_semantic_section = new_semantic_section.replace(
                            u"\n{}\n".format(line), u"\n# {}\n".format(line)
                        )

                    if bulleted or line in ["-", "?"] or link_list_re.match(line):
                        continue
                    # First line of any other shape: stop processing this body.
                    break

        return content
コード例 #11
0
ファイル: 1_absent_headers.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith('-') or title.endswith('-'):
            return content
        if title.startswith('*'):
            return content

        new_content = content

        # print page.title, '=' * 100
        # print content
        # print '=' * 100
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content,
                         flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [
            {'header2': '', 'content': parts.pop(0)}
        ]
        sections += [
            {'header2': part[0], 'content': part[1]}
            for part in chunks(parts, 2)
        ]
        for data in sections:
            # print '-' * 80
            header2 = data['header2']
            # print header2
            if not header2:
                continue
            p = re.compile(u'^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$', re.UNICODE)
            m = p.match(header2)
            if not m:
                print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == 'ru':
            if ' ' in title:
                template = templates['ru']['phrase']
            else:
                template = templates['ru']['word']
        else:
            if ' ' in title:
                template = templates['xx']['phrase']
            else:
                template = templates['xx']['word']

        for data in sections:
            section_content = data['content']
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                if absent_semantic_headers or wrong_order_error:
                    # print u'{} #{} {}'.format(title, lang, data['header2'])
                    # print '\n'.join(headers)
                    # if absent:
                    #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                    #                      for header in absent])
                    # if wrong_order_error:
                    #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                    # if unknown_headers:
                    #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                    #                      for header in unknown_headers])
                    # print
                    pass
                else:
                    m = re.search(u'==== *Значение *==== *\n(.*?)'
                                  u'==== *Синонимы *==== *\n(.*?)'
                                  u'==== *Антонимы *==== *\n(.*?)'
                                  u'==== *Гиперонимы *==== *\n(.*?)'
                                  u'==== *Гипонимы *==== *(.*?)'
                                  u'\n===[^=]',
                                  section_content, re.UNICODE | re.DOTALL)
                    if not m:
                        # print title, '|', lang, '=' * 40
                        # print section_content
                        # print '-' * 80
                        if lang == 'ru':
                            if ' ' in title:
                                tail_contains = "\n".join([
                                    template_contents[u'Этимология/phrase'],
                                    template_contents[u'Перевод'],
                                    template_contents[u'Библиография'],
                                ])
                            else:
                                tail_contains = "\n".join([
                                    template_contents[u'Родственные слова'],
                                    template_contents[u'Этимология/ru'],
                                    template_contents[u'Фразеологизмы'],
                                    template_contents[u'Перевод'],
                                    template_contents[u'Библиография'],
                                ])
                        else:
                            if ' ' in title:
                                tail_contains = "\n".join([
                                    template_contents[u'Этимология/phrase'],
                                    template_contents[u'Библиография'],
                                ])
                            else:
                                tail_contains = "\n".join([
                                    template_contents[u'Родственные слова'],
                                    template_contents[u'Этимология/xx'].format(lang),
                                    template_contents[u'Фразеологизмы'],
                                    template_contents[u'Библиография'],
                                ])

                        p = re.compile(u'(==== *Гипонимы *====\n[^[{]*)')

                        m2 = re.search(u'(==== *Гипонимы *====(.*))',
                                       section_content, flags=re.DOTALL | re.UNICODE)
                        if m2:
                            if '# [' in m2.group(1):
                                print title, '%' * 200
                                print m2.group(1)
                                continue
                        else:
                            print title, '!' * 100
                        new_section_content = p.sub('\\1' + '\n' + tail_contains + '\n',
                                                    section_content)
                        new_section_content = new_section_content.replace('\n\n\n', '\n\n')
                        new_content = new_content.replace(section_content,
                                                          new_section_content)

                        # print new_content
                        # print '-' * 120
                        # print '\n'.join(headers)
                        # print '-' * 120
                        # print section_content
                        # print '-' * 120
                        # print

                # if has_unknown_header or wrong_order:
                #     return content
                # if u'=== Морфологические и синтаксические свойства ===' in absent:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content
                # if absent and headers and absent[0] == headers[0]:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content

            # print '-' * 40

        return new_content
コード例 #12
0
ファイル: 2_wikify_onims.py プロジェクト: 2vitalik/words
    def lang_action(self, page, lang, content):
        title = page.title

        if title.startswith("-") or title.endswith("-"):
            return content
        if title.startswith("*"):
            return content

        new_content = content

        # print title, lang
        # if title == u'высокопарность':
        #     print 'ok'

        # print page.title, '=' * 100
        # print content
        # print '=' * 100
        parts = re.split("^((?:==)(?:[^=].*?[^=])(?:==))$", content, flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        sections = [{"header2": "", "content": parts.pop(0)}]
        sections += [{"header2": part[0], "content": part[1]} for part in chunks(parts, 2)]
        for data in sections:
            # print '-' * 80
            header2 = data["header2"]
            # print header2
            if not header2:
                continue
            p = re.compile(u"^==\s*\{\{(з|заголовок)[^}]*\}\}\s*==$", re.UNICODE)
            m = p.match(header2)
            if not m:
                # print header2, '==', '#' * 120
                return content
            # print '-' * 40
            # print data['content']
            # print '-' * 80
        # return content

        # print '=' * 100
        # print content
        # print '=' * 100
        # # parts = re.split("^(?P<before>=+)(?P<header>.+?)(?P<after>=+)$", content,
        # parts = re.split("^((?:=+)(?:.+?)(?:=+))$", content,
        #                  flags=re.MULTILINE)
        # for part in parts:
        #     print '-' * 100
        #     print part
        #     print '-' * 100
        # self.stop()
        # return content
        # sections = [
        #     {'lang': '', 'content': parts.pop(0)}
        # ]
        # sections += [
        #     {'lang': part[0], 'content': part[1]}
        #     for part in chunks(parts, 2)
        # ]

        if lang == "ru":
            if " " in title:
                template = templates["ru"]["phrase"]
            else:
                template = templates["ru"]["word"]
        else:
            if " " in title:
                template = templates["xx"]["phrase"]
            else:
                template = templates["xx"]["word"]

        for data in sections:
            section_content = data["content"]
            headers = convert_headers(self.get_headers(section_content))

            if headers == []:
                # print 'EMPTY'
                pass
            elif headers == template:
                # print 'OK'
                pass
            else:
                wrong_order = False
                has_unknown_header = False
                absent = list()
                for header in template:
                    if header not in headers:
                        # print header, ' -> ABSENT WARNING'
                        # append_dict_list(current_absent, lang, header)
                        # self.all_absent_headers.add(header)
                        absent.append(header)
                        pass
                t = 0
                s = 0

                wrong_order_error = None
                while t < len(template) and s < len(headers):
                    if template[t] == headers[s]:
                        t += 1
                        s += 1
                    else:
                        if template[t] in absent:
                            t += 1
                        else:
                            # print headers[s], ' -> WRONG ORDER ERROR'
                            wrong_order_error = headers[s]
                            wrong_order = True
                            break

                unknown_headers = list()
                for header in headers:
                    if header not in template:
                        # print header, ' -> UNKNOWN ERROR'
                        unknown_headers.append(header)
                        has_unknown_header = True
                        # append_dict_list(current_absent, lang, header)

                absent_semantic_headers = False
                for h in semantic_headers:
                    if h in absent:
                        absent_semantic_headers = True

                if absent_semantic_headers or wrong_order_error:
                    # print u'{} #{} {}'.format(title, lang, data['header2'])
                    # print '\n'.join(headers)
                    # if absent:
                    #     print "\n".join([u"{} -> ABSENT WARNING".format(header)
                    #                      for header in absent])
                    # if wrong_order_error:
                    #     print u"{} -> WRONG ORDER ERROR".format(wrong_order_error)
                    # if unknown_headers:
                    #     print "\n".join([u"{} -> UNKNOWN ERROR".format(header)
                    #                      for header in unknown_headers])
                    # print
                    pass
                elif unknown_headers:
                    pass
                else:
                    m = re.search(
                        u"(==== *Значение *==== *\n(.*?)"
                        u"==== *Синонимы *==== *\n(.*?)"
                        u"==== *Антонимы *==== *\n(.*?)"
                        u"==== *Гиперонимы *==== *\n(.*?)"
                        u"==== *Гипонимы *==== *(.*?)"
                        u"\n)===[^=]",
                        section_content,
                        re.UNICODE | re.DOTALL,
                    )
                    if not m:
                        # print title, '|', lang, '=' * 120
                        # print '\n'.join(headers)
                        # print
                        # raise Exception('!!!')
                        continue

                    # print title, '|', lang
                    semantic_section = m.group(1)
                    new_semantic_section = semantic_section
                    # print '=' * 40
                    # print semantic_section
                    # print '-' * 40
                    bodies = [m.group(3), m.group(4), m.group(5), m.group(6)]
                    for body in bodies:
                        # print body
                        # body = body.strip()
                        lines = body.split("\n")
                        fake = False
                        for line in lines:
                            if not line.strip():
                                continue
                            if re.match("^#", line):
                                continue

                            # print title, '|', lang, u' -> "{}"'.format(line)

                            if re.match("^[*:]", line):
                                new_semantic_section = new_semantic_section.replace(
                                    u"\n{}\n".format(line), u"\n#{}\n".format(line[1:])
                                )
                            elif not line.startswith("<!--"):
                                new_semantic_section = new_semantic_section.replace(
                                    u"\n{}\n".format(line), u"\n# {}\n".format(line)
                                )

                            if re.match("^[*:]", line):
                                continue
                            if line in ["-", "?"]:
                                continue
                            if re.match("^\[\[[^]]+\]\]$", line):
                                continue
                            if re.match("^(\[\[[^]]+\]\]( \(\[\[[^]]+\]\]\))?([,;] )?)+$", line):
                                continue
                            # if re.match(u'^[a-zа-я !]+$', line, re.UNICODE | re.IGNORECASE):
                            #     pass
                            # if re.match(u'^([a-zа-я !]+([,;] )?)+$', line, re.UNICODE | re.IGNORECASE):
                            #     pass
                            fake = True
                            print u'# [[{}]] (секция "{}"): <code><nowiki>{}</nowiki></code>'.format(title, lang, line)
                            # print title, '|', lang
                            # print '->', line
                            # print line
                            break
                        # if not fake: # or True:
                        # if semantic_section != new_semantic_section:
                        #     print title, '|', lang
                        #     print '=' * 100
                        #     print semantic_section
                        #     print '-' * 100
                        #     print new_semantic_section
                        #     print '-' * 100
                        #     print
                        # new_content = \
                        #     new_content.replace(semantic_section,
                        #                         new_semantic_section)

                        # if fake and body:
                        #     print title, '|', lang
                        #     print '=' * 120
                        #     # print '"{}"'.format(tail)
                        #     print body
                        #     print '-' * 120
                        #     print

                # if has_unknown_header or wrong_order:
                #     return content
                # if u'=== Морфологические и синтаксические свойства ===' in absent:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content
                # if absent and headers and absent[0] == headers[0]:
                #     # print u'{} #{}'.format(title, lang)
                #     # print '\n'.join(headers)
                #     # print
                #     return content

            # print '-' * 40

        return new_content