Beispiel #1
0
def get_bg_abbreviation_homonyms():
    """Collect BG word forms whose name equals a lower-cased BG abbreviation.

    Reads the abbreviation lines from 'out/Аббревиатура. БГ.txt'
    (de-duplicated and sorted) and the socket groups from
    'src_dict/БГ 13.03.21.txt'. For every abbreviation, each visible word
    form whose name (with '*' removed) equals the lower-cased abbreviation
    name is reported.

    The output file 'out/Аббревиатура. БГ. Омонимы.txt' consists of blocks:
    the abbreviation line, the matching word forms, and a blank separator
    line (the trailing blank line is dropped).
    """
    abbreviation_bg = sorted(
        set(get_string_list_from_file('out/Аббревиатура. БГ.txt')))

    # Index the visible word forms by their star-less name once, instead of
    # rescanning every socket group for every abbreviation.
    forms_by_name = {}
    for socket_group in read_src_socket_bs('src_dict/БГ 13.03.21.txt'):
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if not word_form.invisible:
                    form_name = word_form.name.replace('*', '')
                    forms_by_name.setdefault(form_name, []).append(word_form)

    bg_abbreviation_homonyms = []

    for abbreviation in abbreviation_bg:
        target_name = get_socket_word_form(abbreviation).name.lower()
        matches = forms_by_name.get(target_name)
        if not matches:
            continue
        bg_abbreviation_homonyms.append(abbreviation)
        for word_form in matches:
            # The matched star-less form name is by construction target_name.
            print(target_name)
            bg_abbreviation_homonyms.append(str(word_form))
        bg_abbreviation_homonyms.append('')

    # [:-1] drops the blank separator after the last block.
    save_list_to_file(bg_abbreviation_homonyms[:-1],
                      'out/Аббревиатура. БГ. Омонимы.txt')
Beispiel #2
0
def get_bs_abbreviation_homonyms():
    """Collect BS title forms whose name equals a lower-cased BS abbreviation.

    Reads the abbreviation lines from 'out/Аббревиатура. БС.txt' and the
    title word forms from 'src_dict/БС 13.03.21.txt'. A title form matches
    an abbreviation when its parsed name (with '*' removed) equals the
    lower-cased abbreviation name.

    The output file 'out/Аббревиатура. БС. Омонимы.txt' consists of blocks:
    the abbreviation line, the matching title forms, and a blank separator
    line (the trailing blank line is dropped).
    """
    abbreviation_bs = get_string_list_from_file('out/Аббревиатура. БС.txt')

    word_forms_bases = read_src_bs('src_dict/БС 13.03.21.txt')

    # Parse every title form exactly once and index it by its star-less
    # name; previously each title form was re-parsed for every abbreviation.
    titles_by_name = {}
    for base in word_forms_bases:
        title_form = str(base.title_word_form)
        form_name = get_bs_title_word_form(title_form).name.replace('*', '')
        titles_by_name.setdefault(form_name, []).append(title_form)

    bs_abbreviation_homonyms = []

    for abbreviation in abbreviation_bs:
        target_name = get_bs_title_word_form(abbreviation).name.lower()
        matches = titles_by_name.get(target_name)
        if not matches:
            continue
        bs_abbreviation_homonyms.append(abbreviation)
        for title_form in matches:
            print(title_form)
            bs_abbreviation_homonyms.append(title_form)
        bs_abbreviation_homonyms.append('')

    # [:-1] drops the blank separator after the last block.
    save_list_to_file(bs_abbreviation_homonyms[:-1],
                      'out/Аббревиатура. БС. Омонимы.txt')
Beispiel #3
0
def get_capital_letter_bs():
    """Collect BS title forms whose name equals a lower-cased capitalised word.

    Reads the capitalised-word lines from 'out/Большая буква. БС.txt' and
    the title word forms from 'src_dict/БС 13.03.21.txt'. A title form
    matches when its parsed name (with '*' removed) equals the lower-cased
    name of the capitalised word.

    The output file 'out/Большая буква. БС. Омонимы.txt' consists of blocks:
    the capitalised word line, the matching title forms, and a blank
    separator line (the trailing blank line is dropped).
    """
    capital_letter_bs = list(
        get_string_list_from_file('out/Большая буква. БС.txt'))

    word_forms_bases = read_src_bs('src_dict/БС 13.03.21.txt')

    # Parse every title form exactly once and index it by its star-less
    # name; previously each title form was re-parsed for every input word.
    titles_by_name = {}
    for base in word_forms_bases:
        title_form = str(base.title_word_form)
        form_name = get_bs_title_word_form(title_form).name.replace('*', '')
        titles_by_name.setdefault(form_name, []).append(title_form)

    capital_letter_bs_homonyms = []

    for capital_word in capital_letter_bs:
        target_name = get_bs_title_word_form(capital_word).name.lower()
        matches = titles_by_name.get(target_name)
        if not matches:
            continue
        capital_letter_bs_homonyms.append(capital_word)
        for title_form in matches:
            print(title_form)
            capital_letter_bs_homonyms.append(title_form)
        capital_letter_bs_homonyms.append('')

    # [:-1] drops the blank separator after the last block.
    save_list_to_file(capital_letter_bs_homonyms[:-1],
                      'out/Большая буква. БС. Омонимы.txt')
Beispiel #4
0
def get_capital_letter_bg():
    """Collect BG word forms whose name equals a lower-cased capitalised word.

    Reads the capitalised-word lines from 'out/Большая буква. БГ.txt' and
    the socket groups from 'src_dict/БГ 13.03.21.txt'. A visible word form
    matches when its name (with '*' removed) equals the lower-cased name of
    the capitalised word.

    The output file 'out/Большая буква. БГ. Омонимы.txt' consists of blocks:
    the capitalised word line, the matching word forms, and a blank
    separator line (the trailing blank line is dropped).
    """
    capital_letter_bg = list(
        get_string_list_from_file('out/Большая буква. БГ.txt'))

    # Index the visible word forms by their star-less name once, instead of
    # rescanning every socket group for every capitalised word.
    forms_by_name = {}
    for socket_group in read_src_socket_bs('src_dict/БГ 13.03.21.txt'):
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if not word_form.invisible:
                    form_name = word_form.name.replace('*', '')
                    forms_by_name.setdefault(form_name, []).append(word_form)

    capital_letter_bg_homonyms = []

    for capital_word in capital_letter_bg:
        target_name = get_socket_word_form(capital_word).name.lower()
        matches = forms_by_name.get(target_name)
        if not matches:
            continue
        capital_letter_bg_homonyms.append(capital_word)
        for word_form in matches:
            print(word_form)
            capital_letter_bg_homonyms.append(str(word_form))
        capital_letter_bg_homonyms.append('')

    # [:-1] drops the blank separator after the last block.
    save_list_to_file(capital_letter_bg_homonyms[:-1],
                      'out/Большая буква. БГ. Омонимы.txt')
Beispiel #5
0
def remove_from_bg():
    """Remove listed words from the BG dictionary, sparing sub-group titles.

    Original task statement (translated):
    9. For every word in the document "Удалить из БГ.txt":
    9.1. find that word in the document "БГ 27.01.21.txt" and
    9.2.1. if it is the title word of a SUB-group, put it into the
    separately created document "ЗС подгруппы.txt";
    9.2.2. if it is NOT the title word of a sub-group, delete it from
    the BG document.

    The pruned dictionary is written to 'out/БГ 28.01.21.txt' and the
    removed forms to 'out/удалённые из БГ.txt'.
    """
    words_to_remove = get_string_list_from_file('src_dict/Удалить из БГ.txt')
    socket_groups = list(read_src_socket_bs('src_dict/БГ 27.01.21.txt'))

    kept_titles = []
    removed_forms = []

    for target in sorted(words_to_remove):
        for socket_group in socket_groups:
            for sub_group in socket_group.sub_groups:
                title = sub_group.title_word_form
                # Walk a snapshot, because the live list is edited below.
                for candidate in list(sub_group.socket_word_forms):
                    if candidate.name != target:
                        continue
                    print(candidate)
                    if title.name == target:
                        kept_titles.append(str(candidate))
                        continue
                    removed_forms.append(str(candidate))
                    sub_group.socket_word_forms.remove(candidate)

    save_list_to_file(sorted(kept_titles, key=str.lower),
                      'out/ЗС подгруппы.txt')
    save_list_to_file(sorted(removed_forms, key=str.lower),
                      'out/удалённые из БГ.txt')
    save_socket_bs_dicts_to_txt(socket_groups, 'out/БГ 28.01.21.txt')
Beispiel #6
0
def get_bg_abbreviation():
    """Split visible BG word forms into abbreviations and capitalised words.

    Reads the socket groups from 'src_dict/БГ 09.03.21.txt'. For each
    visible word form the name is taken with '*' removed, then:

    * if every one of its first two characters is in ``CYRILLIC`` the form
      goes to 'out/Аббревиатура. БГ.txt' (a one-character name qualifies
      when that single character is in ``CYRILLIC``);
    * otherwise, if only the first character is in ``CYRILLIC``, it goes to
      'out/Большая буква. БГ.txt'.

    ``CYRILLIC`` is defined elsewhere; presumably it holds the upper-case
    Cyrillic letters -- TODO confirm.

    Both outputs are sorted by the form string with '*' removed.
    """
    socket_group_list = read_src_socket_bs('src_dict/БГ 09.03.21.txt')

    abbreviation_bg = []
    capital_letter_bg = []

    for socket_group in socket_group_list:
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible:
                    continue
                form_name = word_form.name.replace('*', '')
                if not form_name:
                    # all() over an empty prefix is vacuously true, which
                    # used to file empty names under abbreviations.
                    continue
                if all(char in CYRILLIC for char in form_name[:2]):
                    abbreviation_bg.append(str(word_form))
                elif form_name[0] in CYRILLIC:
                    capital_letter_bg.append(str(word_form))

    save_list_to_file(
        sorted(abbreviation_bg, key=lambda x: x.replace('*', '')),
        'out/Аббревиатура. БГ.txt'
    )

    save_list_to_file(
        sorted(capital_letter_bg, key=lambda x: x.replace('*', '')),
        'out/Большая буква. БГ.txt'
    )
Beispiel #7
0
def fix_adjusted_participles():
    """Restore starred lines of the 28.01.21 BG file from the 23.01.21 file.

    Both inputs are read as cp1251 line lists. For every line of
    'src_dict/БГ 28.01.21 изм.txt' that starts with '*', each starred line
    of 'src_dict/БГ 23.01.21.txt' whose first whitespace-separated token
    ends with the second token of the 28.01 line replaces it. When several
    lines match, the last one stays in the output and all of them are
    logged.

    Writes the sorted list of used replacement lines to
    'out/Адъектированные причастия.txt' and the patched line list to
    'out/БГ 28.01.21.txt' (cp1251).
    """
    socket_group_28_01 = list(get_string_list_from_file(
        'src_dict/БГ 28.01.21 изм.txt', encoding='cp1251'))
    socket_group_23_01 = list(get_string_list_from_file(
        'src_dict/БГ 23.01.21.txt', encoding='cp1251'))

    # Loop-invariant: the starred candidates and their first tokens.
    # A line starting with '*' always has at least one token.
    starred_candidates = [
        (line, line.split()[0])
        for line in socket_group_23_01
        if line.startswith('*')
    ]

    adjusted_participles_list = []

    # Assigning to the current index while enumerating is safe: the current
    # line is already bound and later positions are untouched.
    for count, socket_string in enumerate(socket_group_28_01):
        if not socket_string.startswith('*'):
            continue
        tokens = socket_string.split()
        if len(tokens) < 2:
            # No second token to match against; leave the line unchanged
            # instead of failing with IndexError.
            continue
        wanted_suffix = tokens[1]
        for replace_string, first_token in starred_candidates:
            if first_token.endswith(wanted_suffix):
                print(replace_string)
                socket_group_28_01[count] = replace_string
                adjusted_participles_list.append(replace_string)

    save_list_to_file(sorted(adjusted_participles_list,
                             key=lambda x: x.replace('*', '').lower()),
                      'out/Адъектированные причастия.txt'
                      )
    save_list_to_file(socket_group_28_01, 'out/БГ 28.01.21.txt',
                      encoding='cp1251')
Beispiel #8
0
def get_homonymous_multi_rooted():
    """List BG word forms that share a name with a multi-rooted word.

    The multi-rooted names are parsed from every non-empty cell of
    'src_dict/Многокорневые слова.csv'. Each visible word form of
    'src_dict/БГ 10.04.21.txt' that has no root index and whose name is
    among them is reported: as-is when it is its sub-group's title form,
    otherwise as '<form> < <title form>'.

    The result, sorted case-insensitively with '*' ignored, is written to
    'out/Слова, омонимичные многокорневым словам.txt'.
    """
    multi_root_words = get_dicts_from_csv_file(
        'src_dict/Многокорневые слова.csv')

    # A set gives constant-time membership tests in the scan below.
    multi_root_names = set()
    for multi_root_word in multi_root_words:
        for cell in multi_root_word.values():
            if cell:
                multi_root_names.add(get_socket_word_form(cell).name)

    socket_group_list = list(read_src_socket_bs('src_dict/БГ 10.04.21.txt'))

    homonymous_multi_rooted = []

    for socket_group in socket_group_list:
        for sub_group in socket_group.sub_groups:
            title_word_form = sub_group.title_word_form
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible or word_form.root_index:
                    continue
                if word_form.name not in multi_root_names:
                    continue
                print(word_form)
                if str(word_form) == str(title_word_form):
                    homonymous_multi_rooted.append(str(word_form))
                else:
                    homonymous_multi_rooted.append(' < '.join([
                        str(word_form),
                        str(title_word_form),
                    ]))

    homonymous_multi_rooted = sorted(
        homonymous_multi_rooted,
        key=lambda x: x.replace('*', '').strip().lower())
    save_list_to_file(homonymous_multi_rooted,
                      'out/Слова, омонимичные многокорневым словам.txt')
Beispiel #9
0
def get_two_in_one():
    """Split the remaining repeats into those present in BS and those not.

    Every title word form of 'src_dict/БС 27.02.21.txt' is rendered as the
    space-joined non-empty parts name, idf, info and note. Lines of
    'src_dict/Повторы ост.txt' equal to one of these renderings go to
    'out/Повторы ост. совпадает с БС.txt'; all other lines go to
    'out/Повторы ост. не совпадает с БС.txt'. Input order is preserved.
    """
    remaining_repetitions = get_string_list_from_file('src_dict/Повторы ост.txt')
    word_forms_bases = read_src_bs('src_dict/БС 27.02.21.txt')

    # A set gives constant-time membership tests in the loop below.
    bs_word_names = set()
    for base in word_forms_bases:
        title = base.title_word_form
        bs_word_names.add(' '.join(filter(None, [
            title.name,
            title.idf,
            ' '.join(title.info),
            title.note
        ])))

    relevant = []
    not_relevant = []

    for repeat in remaining_repetitions:
        if repeat in bs_word_names:
            relevant.append(repeat)
        else:
            not_relevant.append(repeat)

    save_list_to_file(relevant, 'out/Повторы ост. совпадает с БС.txt')
    save_list_to_file(not_relevant, 'out/Повторы ост. не совпадает с БС.txt')
Beispiel #10
0
def get_headwords():
    """Find BS title forms that reappear as a word form of another group.

    Reads the groups from 'src_dict/БС 09.03.21.txt'. For every title form
    (printed as progress output) and every other group -- one whose title
    form renders to a different string -- that contains a word form named
    like the title form (with '*' removed from the title name), a block is
    emitted: the title form, the other group's title form, the matching
    word forms, and a blank line.

    The blocks are written to 'out/ЗС-повторы.txt'.
    """
    word_forms_bases = list(read_src_bs('src_dict/БС 09.03.21.txt'))

    # Index the groups by the names of their word forms once, instead of
    # rebuilding each group's name list for every headword. Each per-name
    # list keeps the groups in file order.
    groups_by_form_name = {}
    for group in word_forms_bases:
        word_forms = group.word_forms
        if not word_forms:
            continue
        entry = (str(group.title_word_form), word_forms)
        for name in {x.name for x in word_forms}:
            groups_by_form_name.setdefault(name, []).append(entry)

    headwords_reruns = []

    for base in word_forms_bases:
        headword = base.title_word_form
        print(headword)
        headword_str = str(headword)
        headword_name = headword.name.replace('*', '')
        for title_str, word_forms in groups_by_form_name.get(
                headword_name, ()):
            if title_str == headword_str:
                # Same title form: not a different group.
                continue
            headwords_reruns.append(headword_str)
            headwords_reruns.append(title_str)
            headwords_reruns.extend(
                str(x) for x in word_forms if x.name == headword_name)
            headwords_reruns.append('')

    save_list_to_file(headwords_reruns, 'out/ЗС-повторы.txt')
Beispiel #11
0
def find_extra_lines(in_file, in_dir):
    """Save the lines found in the files of *in_dir* but absent from *in_file*.

    Args:
        in_file: Path of the reference file, read with
            ``get_string_list_from_file``.
        in_dir: Directory whose direct entries (``glob('*')``) are passed to
            ``get_file_lines``.

    The extra lines, in the order ``get_file_lines`` yields them, are written
    to 'out/Лишние <stem of in_file>.txt'. Nothing is written when there are
    none.
    """
    files_path = list(Path(in_dir).glob('*'))
    in_dir_lines = list(get_file_lines(files_path))
    # A set gives constant-time membership tests for the filter below.
    in_lines = set(get_string_list_from_file(in_file))
    extra_lines = [x for x in in_dir_lines if x not in in_lines]
    if extra_lines:
        save_list_to_file(extra_lines, f'out/Лишние {Path(in_file).stem}.txt')
Beispiel #12
0
def ordinary_words_bs():
    """Write the BS title forms that appear in none of the exclusion lists.

    Title forms of 'src_dict/БС 02.03.21.txt' are rendered with ``str``.
    A form is excluded when it equals a line of the multi-rooted words
    file, a line of the BS homonyms file, or the re-rendered
    (``str(get_bs_title_word_form(line))``) form of a line of the
    "remaining repeats matching BS" file.

    The surviving forms, in source order, go to 'out/Обычные слова БС.txt'.
    """
    word_forms_bases = read_src_bs('src_dict/БС 02.03.21.txt')
    bs_word_forms = [str(x.title_word_form) for x in word_forms_bases]

    # A set gives constant-time membership tests for the final filter.
    exclusions = set()

    # Multi-rooted BS words.
    exclusions.update(
        get_string_list_from_file('src_dict/Многокорневые слова БС.txt'))

    # BS homonyms.
    exclusions.update(get_string_list_from_file('src_dict/Омонимы БС.txt'))

    # Remaining repeats that match BS, normalised through the BS parser.
    exclusions.update(
        str(get_bs_title_word_form(x))
        for x in get_string_list_from_file(
            'src_dict/Повторы ост. совпадает с БС.txt'))

    # Ordinary BS words: everything that is not excluded.
    ordinary_words_bs_list = [
        bs_str_form for bs_str_form in bs_word_forms
        if bs_str_form not in exclusions
    ]

    save_list_to_file(ordinary_words_bs_list, 'out/Обычные слова БС.txt')
Beispiel #13
0
def check_bg_islower():
    """Report visible BG word forms that fail the lower-case checks.

    Reads 'src_dict/БГ 16.03.21 изм.txt'. Two independent checks are run on
    every visible word form:

    * the name fails ``str.islower()``: the form is printed and listed in
      'out/БГ ЕСТЬ большие буквы СЛОВО.txt';
    * the space-joined non-empty note, etml_note and spec_note contain any
      character from ``CYRILLIC``: the joined notes are printed and the
      form is listed in 'out/БГ ЕСТЬ большие буквы ПРИМЕЧАНИЯ.txt'.
    """
    flagged_by_name = []
    flagged_by_note = []

    for group in read_src_socket_bs('src_dict/БГ 16.03.21 изм.txt'):
        for sub_group in group.sub_groups:
            for form in sub_group.socket_word_forms:
                if form.invisible:
                    continue

                if not form.name.islower():
                    print(form)
                    flagged_by_name.append(str(form))

                note_parts = (form.note, form.etml_note, form.spec_note)
                joined_notes = ' '.join(part for part in note_parts if part)

                if any(char in CYRILLIC for char in joined_notes):
                    print(joined_notes)
                    flagged_by_note.append(str(form))

    save_list_to_file(flagged_by_name,
                      'out/БГ ЕСТЬ большие буквы СЛОВО.txt')
    save_list_to_file(flagged_by_note,
                      'out/БГ ЕСТЬ большие буквы ПРИМЕЧАНИЯ.txt')
Beispiel #14
0
def find_all_multi_rooted_words_from_bs():
    """Write the BS title forms that correspond to multi-rooted words.

    Original task statement (translated):
    15. Find in the document "БС 06.04.21.txt" all words (group title words
    and singles) from the document "Многокорневые слова.xlsx" and create a
    list of lines with such words -- the document
    "Многокорневые слова БС.txt". Taking items 1 and 2 of the BG/BS
    correspondence rules into account, compare every line of that document
    with every word cell of the multi-rooted words document.

    The code reads the multi-rooted words from
    'out/Многокорневые слова.csv', skipping the first column of each row.
    Both sides are compared as the space-joined non-empty parts name, idf,
    info and note; '* ' is stripped from the BG note, '.* ' from the BS
    note, and a BS note containing '<' is ignored altogether.

    The matches, sorted case-insensitively with '*' ignored, go to
    'out/Многокорневые слова БС.txt'.
    """

    multi_root_words = get_dicts_from_csv_file(
        'out/Многокорневые слова.csv')

    word_forms_bases = list(read_src_bs('src_dict/БС 06.04.21.txt'))

    # A set gives constant-time membership tests in the BS scan below.
    multi_root_bg_forms = set()

    for multi_root_word in multi_root_words:
        # [1:] skips the first column of the row.
        for root_index_key in list(multi_root_word)[1:]:
            cell = multi_root_word[root_index_key]
            if not cell:
                continue
            socket_form = get_socket_word_form(cell)
            multi_root_bg_forms.add(
                ' '.join(filter(
                    None,
                    [
                        socket_form.name,
                        socket_form.idf,
                        ' '.join(socket_form.info),
                        socket_form.note.replace('* ', ''),
                    ])))

    multi_root_bs_forms = []

    for group_word_form in word_forms_bases:
        title_form = group_word_form.title_word_form
        src_title_form = ' '.join(filter(
            None,
            [
                title_form.name,
                title_form.idf,
                ' '.join(title_form.info),
                (title_form.note.replace('.* ', '')
                 if '<' not in title_form.note else None),
            ]))
        if src_title_form in multi_root_bg_forms:
            print(title_form)
            multi_root_bs_forms.append(str(title_form))

    multi_root_bs_forms = sorted(
        multi_root_bs_forms,
        key=lambda x: x.replace('*', '').lower().strip()
    )

    save_list_to_file(multi_root_bs_forms, 'out/Многокорневые слова БС.txt')
Beispiel #15
0
def change_case():
    """Lower-case every non-empty line of each file in 'src_dict/lst'.

    Each entry matched by ``glob('*')`` is read as cp1251 and its non-empty
    lines, lower-cased, are saved as cp1251 to 'out/lst/<stem>.txt'.
    """
    for source_path in Path('src_dict/lst').glob('*'):
        source_lines = get_string_list_from_file(source_path,
                                                 encoding='cp1251')
        lowered = [line.lower() for line in source_lines if line]
        save_list_to_file(lowered, f'out/lst/{source_path.stem}.txt',
                          encoding='cp1251')
Beispiel #16
0
def get_homonyms_bg():
    """Write the BG homonyms and the homonyms that ended up unpaired.

    Reads 'src_dict/БГ 16.02.21.txt'. Names are compared with '*' removed
    and surrounding whitespace stripped. A visible word form without a root
    index is a homonym when its name occurs more than once among all such
    forms and exactly once among the visible forms of its own socket group.
    It is reported as-is when it is its sub-group's title form, otherwise
    as '<form> < <title form>'.

    Writes the sorted homonyms to 'out/Омонимы БГ.txt' and those whose
    first token is unique within that list to 'out/Единичные омонимы.txt'.
    """
    socket_group_list = list(read_src_socket_bs(
        'src_dict/БГ 16.02.21.txt'))

    name_counts = Counter(
        word_form.name.replace('*', '').strip()
        for socket_group in socket_group_list
        for sub_group in socket_group.sub_groups
        for word_form in sub_group.socket_word_forms
        if not word_form.invisible and not word_form.root_index
    )
    # Names occurring more than once; a set gives O(1) membership tests.
    repeated_names = {name for name, count in name_counts.items()
                      if count > 1}

    homonyms = []

    for socket_group in socket_group_list:
        # Per-group counts replace a list.count() call per word form.
        group_name_counts = Counter(
            x.name.replace('*', '').strip()
            for x in socket_group.socket_word_forms if not x.invisible
        )

        for sub_group in socket_group.sub_groups:
            title_word_form = sub_group.title_word_form
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible or word_form.root_index:
                    continue
                raw_name = word_form.name.replace('*', '').strip()
                if (group_name_counts[raw_name] != 1
                        or raw_name not in repeated_names):
                    continue
                if str(word_form) == str(title_word_form):
                    homonyms.append(str(word_form))
                else:
                    homonyms.append(' < '.join([
                        str(word_form),
                        str(title_word_form),
                    ]))

    sort_homonyms = sorted(homonyms,
                           key=lambda x: x.replace('*', '').strip().lower())
    save_list_to_file(sort_homonyms, 'out/Омонимы БГ.txt')

    # Homonyms whose first token appears only once in the sorted list.
    first_token_counts = Counter(x.split()[0] for x in sort_homonyms)
    loner_homonyms = [x for x in sort_homonyms
                      if first_token_counts[x.split()[0]] == 1]
    save_list_to_file(loner_homonyms, 'out/Единичные омонимы.txt')
Beispiel #17
0
def get_remaining_homonyms():
    """Merge the two "does not match repeats" BS homonym lists into one file.

    The lines of both source files are concatenated (special-note list
    first), sorted case-insensitively with '*' removed and surrounding
    whitespace stripped, and saved to 'out/О-мы БС ост.txt'.
    """
    source_paths = (
        'src_dict/О-мы БС спец. прим. не совпадают с Повторами.txt',
        'src_dict/О-мы БС не совпадают с Повторами.txt',
    )

    merged = []
    for source_path in source_paths:
        merged.extend(get_string_list_from_file(source_path))

    def sort_key(line):
        return line.replace('*', '').strip().lower()

    merged.sort(key=sort_key)
    save_list_to_file(merged, 'out/О-мы БС ост.txt')
Beispiel #18
0
def compare_homonyms_spec_note():
    """Split special-note BS homonyms by whether they match a BG homonym.

    Every line of 'src_dict/Омонимы БГ.txt' is parsed as a socket word
    form. Its special note (with '< ' removed) is parsed as a socket word
    form too and re-rendered from its non-empty parts; the homonym is then
    rendered as the space-joined non-empty parts name, idf, info and that
    re-rendered note.

    Every line of the special-note BS homonym file is parsed as a BS title
    form and rendered as name, idf, info and note (with '.* < ' removed).
    Lines whose rendering equals one of the BG renderings are saved to
    'out/О-мы БС спец. прим. совпадают с О-мами БГ.txt', the others to
    'out/О-мы БС спец. прим. не совпадают с О-мами БГ.txt'.
    """
    homonyms_bg = get_string_list_from_file('src_dict/Омонимы БГ.txt')

    # A set gives constant-time membership tests in the comparison below.
    homonyms_bg_str_form = set()

    for homonyms in homonyms_bg:
        socket_form = get_socket_word_form(homonyms)

        spec_note = socket_form.spec_note.replace('< ', '')
        spec_note_socket_form = get_socket_word_form(spec_note)
        spec_note = ' '.join(
            filter(None, [
                spec_note_socket_form.invisible,
                spec_note_socket_form.name,
                spec_note_socket_form.root_index,
                spec_note_socket_form.idf,
                ' '.join(spec_note_socket_form.info),
                spec_note_socket_form.note,
            ]))

        string_form = ' '.join(
            filter(None, [
                socket_form.name,
                socket_form.idf,
                ' '.join(socket_form.info),
                spec_note,
            ]))
        homonyms_bg_str_form.add(string_form)

    homonyms_spec_note = get_string_list_from_file(
        'src_dict/О-мы БС спец. прим. не совпадают с Повторами.txt')

    homonyms_spec_note_relevant = []
    homonyms_spec_note_not_relevant = []

    for homonym in homonyms_spec_note:
        title_form = get_bs_title_word_form(homonym)
        print(title_form.name, title_form.note)
        string_form = ' '.join(
            filter(None, [
                title_form.name,
                title_form.idf,
                ' '.join(title_form.info),
                title_form.note.replace('.* < ', ''),
            ]))
        if string_form in homonyms_bg_str_form:
            homonyms_spec_note_relevant.append(homonym)
        else:
            homonyms_spec_note_not_relevant.append(homonym)

    save_list_to_file(homonyms_spec_note_relevant,
                      'out/О-мы БС спец. прим. совпадают с О-мами БГ.txt')
    save_list_to_file(homonyms_spec_note_not_relevant,
                      'out/О-мы БС спец. прим. не совпадают с О-мами БГ.txt')
Beispiel #19
0
def get_spec_note():
    """Write the BS title forms whose special note holds a single word.

    Reads 'src_dict/БС 23.02.21.txt'. A title form is selected when its
    note starts with '.* <' and has no space after the first five
    characters. The selection goes to 'out/Спец. прим. БС. 1 слово.txt'.
    """
    groups = list(read_src_bs('src_dict/БС 23.02.21.txt'))

    single_word_notes = []

    for group in groups:
        title = group.title_word_form
        note = title.note
        if not note or not note.startswith('.* <'):
            continue
        # Everything after the five-character prefix must be one token.
        if ' ' in note[5:]:
            continue
        single_word_notes.append(str(title))

    save_list_to_file(single_word_notes, 'out/Спец. прим. БС. 1 слово.txt')
Beispiel #20
0
def headwords_test():
    """Drop headword-repeat groups whose first two lines are identical.

    Reads 'out/ЗС-повторы_001.txt' (UTF-8) and splits it into groups on
    three consecutive newlines. A group is kept, with a newline appended,
    when its first and second lines differ. The kept groups are saved to
    'out/ЗС-повторы.txt'.
    """
    with open(Path('out/ЗС-повторы_001.txt'), encoding='utf-8') as f_in:
        headwords_string = f_in.read()

    headwords_reruns = []

    for group in headwords_string.split('\n\n\n'):
        lines = group.split('\n')
        if len(lines) < 2:
            # A one-line fragment (e.g. the empty tail after a trailing
            # separator) has no second line; skip it instead of raising
            # IndexError.
            continue
        if lines[0] != lines[1]:
            headwords_reruns.append(group + '\n')

    save_list_to_file(headwords_reruns, 'out/ЗС-повторы.txt')
Beispiel #21
0
def get_bg_note():
    """Write the BG word forms that carry a note, skipping first forms.

    Reads 'src_dict/БГ 17.02.21.txt'. In every sub-group the first word
    form is skipped (presumably the title form -- confirm); each remaining
    form with a non-empty note is printed and collected. The collection is
    saved, in source order, to 'out/Пояснительные примечания.txt'.
    """
    noted_forms = []

    for group in read_src_socket_bs('src_dict/БГ 17.02.21.txt'):
        for sub_group in group.sub_groups:
            trailing_forms = sub_group.socket_word_forms[1:]
            for form in trailing_forms:
                if not form.note:
                    continue
                print(form)
                noted_forms.append(str(form))

    save_list_to_file(noted_forms, 'out/Пояснительные примечания.txt')
Beispiel #22
0
def get_adjusted_participles_bs():
    """Write the BS title forms whose name starts with '*'.

    Reads 'src_dict/БС 28.01.21.txt'; every matching title form is printed
    and saved, in source order, to 'out/Адъектированные причастия БС.txt'.
    """
    starred_titles = []

    for group in read_src_bs('src_dict/БС 28.01.21.txt'):
        title = group.title_word_form
        if not title.name.startswith('*'):
            continue
        print(title)
        starred_titles.append(str(title))

    save_list_to_file(starred_titles,
                      'out/Адъектированные причастия БС.txt')
Beispiel #23
0
def get_bs_names():
    """Write the distinct word-form names of the BS dictionary.

    Reads 'src_dict/БС 09.03.21.txt', collects the name (with '*' removed)
    of every word form of every group that has word forms, de-duplicates
    them and saves them, sorted case-insensitively, to 'out/bs_names.txt'.
    """
    word_forms_bases = list(read_src_bs('src_dict/БС 09.03.21.txt'))

    bs_names = set()

    for group in word_forms_bases:
        if group.word_forms:
            bs_names.update(
                x.name.replace('*', '') for x in group.word_forms)

    # Set iteration order depends on the hash seed, so names differing only
    # in case need an explicit tie-break to keep the output reproducible.
    bs_names = sorted(bs_names, key=lambda name: (name.lower(), name))

    save_list_to_file(bs_names, 'out/bs_names.txt')
Beispiel #24
0
def check_g58():
    """Write the entries of the G58 list that lack the expected endings.

    Every line of 'src_dict/Г58 ещё.txt' is parsed as a BS title form.
    Forms whose name ends with none of the listed endings are printed and
    saved, sorted, to 'out/Г58 ещё изм.txt'.
    """
    expected_endings = ('греть', 'мять', 'оть', 'ыть', 'пеленать')

    source_lines = get_string_list_from_file('src_dict/Г58 ещё.txt')
    parsed_forms = [get_bs_title_word_form(line) for line in source_lines]

    mismatched = []

    for form in parsed_forms:
        if form.name.endswith(expected_endings):
            continue
        print(form)
        mismatched.append(str(form))

    mismatched.sort()
    save_list_to_file(mismatched, 'out/Г58 ещё изм.txt')
Beispiel #25
0
def get_no_full_form():
    """Write the BS title forms selected by their idf and first info item.

    Reads 'src_dict/БС 24.03.21.txt'. A title form is selected when its
    info list is non-empty, its idf starts with '.П' and its first info
    item starts with 'К', 'С' or 'П'. Selected forms are printed and saved,
    in source order, to 'out/НЕТ полной формы.txt'.
    """
    first_info_prefixes = ('К', 'С', 'П')
    selected = []

    for group in read_src_bs('src_dict/БС 24.03.21.txt'):
        title = group.title_word_form
        info = title.info
        if not info:
            continue
        if not title.idf.startswith('.П'):
            continue
        if not info[0].startswith(first_info_prefixes):
            continue
        print(title)
        selected.append(str(title))

    save_list_to_file(selected, 'out/НЕТ полной формы.txt')
Beispiel #26
0
def get_bg_note():
    """Write the sorted BG word forms that carry a note, skipping first forms.

    Reads 'src_dict/БГ 23.02.21.txt'. In every sub-group the first word
    form is skipped (presumably the title form -- confirm); each remaining
    form with a non-empty note is printed and collected. The collection is
    sorted case-insensitively with '*' removed and whitespace stripped, and
    saved to 'out/Пояснительные примечания БГ, не-ЗС.txt'.
    """
    def sort_key(line):
        return line.replace('*', '').strip().lower()

    collected = []

    for group in read_src_socket_bs('src_dict/БГ 23.02.21.txt'):
        for sub_group in group.sub_groups:
            for position, form in enumerate(sub_group.socket_word_forms):
                if position == 0 or not form.note:
                    continue
                print(form)
                collected.append(str(form))

    collected.sort(key=sort_key)
    save_list_to_file(collected,
                      'out/Пояснительные примечания БГ, не-ЗС.txt')
Beispiel #27
0
def get_replays_in_groups():
    """Write the distinct member lines of the "repeats in groups" file.

    Reads 'src_dict/Повторы в группах.txt' (UTF-8) and splits it into
    groups on blank lines. The first line of every group is skipped, as are
    lines starting with '!'. The remaining lines are de-duplicated, sorted
    case-insensitively with '*' removed and whitespace stripped, and saved
    to 'out/Повторы в группах (без повторов).txt'.
    """
    replays_in_groups = set()

    with open('src_dict/Повторы в группах.txt', encoding='utf-8') as f_in:
        for group in f_in.read().split('\n\n'):
            # [1:] skips the first line of the group.
            for line in group.strip().split('\n')[1:]:
                if not line.startswith('!'):
                    replays_in_groups.add(line)

    # Set iteration order depends on the hash seed, so lines with equal
    # normalised keys need an explicit tie-break for reproducible output.
    replays_in_groups = sorted(
        replays_in_groups,
        key=lambda x: (x.replace('*', '').strip().lower(), x))

    save_list_to_file(replays_in_groups,
                      'out/Повторы в группах (без повторов).txt')