def get_bg_abbreviation_homonyms():
    """Find BG word forms whose name equals a lower-cased abbreviation.

    Reads the abbreviation lines from ``out/Аббревиатура. БГ.txt``
    (de-duplicated and sorted) and the BG dictionary from
    ``src_dict/БГ 13.03.21.txt``.  For every abbreviation that has at least
    one visible word form with the same name (asterisks removed, abbreviation
    lower-cased) the output receives the abbreviation line, the matching word
    forms and an empty separator line.  The result is written to
    ``out/Аббревиатура. БГ. Омонимы.txt`` without the trailing separator.
    """
    abbreviation_bg = sorted(
        set(get_string_list_from_file('out/Аббревиатура. БГ.txt')))

    # Index the visible word forms by their asterisk-free name once, instead
    # of rescanning the whole dictionary for every abbreviation.  Lists keep
    # the dictionary order, so the output order is unchanged.
    forms_by_name = {}
    for socket_group in read_src_socket_bs('src_dict/БГ 13.03.21.txt'):
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible:
                    continue
                form_name = word_form.name.replace('*', '')
                forms_by_name.setdefault(form_name, []).append(word_form)

    bg_abbreviation_homonyms = []
    for abbreviation in abbreviation_bg:
        target_name = get_socket_word_form(abbreviation).name.lower()
        matches = forms_by_name.get(target_name)
        if not matches:
            continue
        for word_form in matches:
            # The matched name equals the target name by construction.
            print(target_name)
        bg_abbreviation_homonyms.append(abbreviation)
        bg_abbreviation_homonyms += [str(x) for x in matches]
        bg_abbreviation_homonyms.append('')

    # Drop the separator that follows the last block.
    save_list_to_file(bg_abbreviation_homonyms[:-1],
                      'out/Аббревиатура. БГ. Омонимы.txt')
def get_bs_abbreviation_homonyms():
    """Find BS title lines whose name equals a lower-cased abbreviation.

    Reads the abbreviation lines from ``out/Аббревиатура. БС.txt`` and the
    title word forms of ``src_dict/БС 13.03.21.txt``.  For every abbreviation
    with at least one title form of the same name (asterisks removed,
    abbreviation lower-cased) the output receives the abbreviation line, the
    matching title lines and an empty separator line.  The result is written
    to ``out/Аббревиатура. БС. Омонимы.txt`` without the trailing separator.
    """
    abbreviation_bs = get_string_list_from_file('out/Аббревиатура. БС.txt')
    word_forms_bases = read_src_bs('src_dict/БС 13.03.21.txt')
    title_forms = [str(x.title_word_form) for x in word_forms_bases]

    # Parse every title line once (the name is taken from the re-parsed
    # string, as before) and index the lines by asterisk-free name.  Lists
    # keep the dictionary order, so the output order is unchanged.
    titles_by_name = {}
    for title_form in title_forms:
        form_name = get_bs_title_word_form(title_form).name.replace('*', '')
        titles_by_name.setdefault(form_name, []).append(title_form)

    bs_abbreviation_homonyms = []
    for abbreviation in abbreviation_bs:
        target_name = get_bs_title_word_form(abbreviation).name.lower()
        matches = titles_by_name.get(target_name)
        if not matches:
            continue
        for title_form in matches:
            print(title_form)
        bs_abbreviation_homonyms.append(abbreviation)
        bs_abbreviation_homonyms += matches
        bs_abbreviation_homonyms.append('')

    # Drop the separator that follows the last block.
    save_list_to_file(bs_abbreviation_homonyms[:-1],
                      'out/Аббревиатура. БС. Омонимы.txt')
def get_capital_letter_bs():
    """Find BS title lines whose name equals a lower-cased capitalised word.

    Reads the lines of ``out/Большая буква. БС.txt`` and the title word forms
    of ``src_dict/БС 13.03.21.txt``.  For every input word with at least one
    title form of the same name (asterisks removed, input name lower-cased)
    the output receives the input line, the matching title lines and an empty
    separator line.  The result is written to
    ``out/Большая буква. БС. Омонимы.txt`` without the trailing separator.
    """
    capital_letter_bs = list(
        get_string_list_from_file('out/Большая буква. БС.txt'))
    word_forms_bases = read_src_bs('src_dict/БС 13.03.21.txt')
    title_forms = [str(x.title_word_form) for x in word_forms_bases]

    # Parse every title line once and index the lines by asterisk-free name
    # instead of re-parsing the whole list for each capitalised word.
    titles_by_name = {}
    for title_form in title_forms:
        form_name = get_bs_title_word_form(title_form).name.replace('*', '')
        titles_by_name.setdefault(form_name, []).append(title_form)

    capital_letter_bs_homonyms = []
    for capital_word in capital_letter_bs:
        target_name = get_bs_title_word_form(capital_word).name.lower()
        matches = titles_by_name.get(target_name)
        if not matches:
            continue
        for title_form in matches:
            print(title_form)
        capital_letter_bs_homonyms.append(capital_word)
        capital_letter_bs_homonyms += matches
        capital_letter_bs_homonyms.append('')

    # Drop the separator that follows the last block.
    save_list_to_file(capital_letter_bs_homonyms[:-1],
                      'out/Большая буква. БС. Омонимы.txt')
def get_capital_letter_bg():
    """Find BG word forms whose name equals a lower-cased capitalised word.

    Reads the lines of ``out/Большая буква. БГ.txt`` and the BG dictionary
    from ``src_dict/БГ 13.03.21.txt``.  For every input word with at least one
    visible word form of the same name (asterisks removed, input name
    lower-cased) the output receives the input line, the matching word forms
    and an empty separator line.  The result is written to
    ``out/Большая буква. БГ. Омонимы.txt`` without the trailing separator.
    """
    capital_letter_bg = list(
        get_string_list_from_file('out/Большая буква. БГ.txt'))

    # Index the visible word forms by asterisk-free name once, instead of
    # rescanning the whole dictionary for every capitalised word.  Lists keep
    # the dictionary order, so the output order is unchanged.
    forms_by_name = {}
    for socket_group in read_src_socket_bs('src_dict/БГ 13.03.21.txt'):
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible:
                    continue
                form_name = word_form.name.replace('*', '')
                forms_by_name.setdefault(form_name, []).append(word_form)

    capital_letter_bg_homonyms = []
    for capital_word in capital_letter_bg:
        target_name = get_socket_word_form(capital_word).name.lower()
        matches = forms_by_name.get(target_name)
        if not matches:
            continue
        for word_form in matches:
            print(word_form)
        capital_letter_bg_homonyms.append(capital_word)
        capital_letter_bg_homonyms += [str(x) for x in matches]
        capital_letter_bg_homonyms.append('')

    # Drop the separator that follows the last block.
    save_list_to_file(capital_letter_bg_homonyms[:-1],
                      'out/Большая буква. БГ. Омонимы.txt')
def remove_from_bg():
    """
    Step 9. For every word listed in "Удалить из БГ.txt":
    9.1. look the word up in "БГ 27.01.21.txt" and
    9.2.1. if its name equals the name of the title word of the SUBgroup,
           record it in the separate document "ЗС подгруппы.txt";
    9.2.2. otherwise delete it from the dictionary.

    The deleted word forms are also listed in "удалённые из БГ.txt" and the
    modified dictionary is saved as "БГ 28.01.21.txt" (all under ``out/``).
    """
    remove_bg_list = get_string_list_from_file('src_dict/Удалить из БГ.txt')
    groups = list(read_src_socket_bs('src_dict/БГ 27.01.21.txt'))

    kept_title_forms = []
    removed_forms = []
    for target in sorted(remove_bg_list):
        for group in groups:
            for sub_group in group.sub_groups:
                title = sub_group.title_word_form
                # Iterate over a copy: the real list may shrink below.
                for word_form in list(sub_group.socket_word_forms):
                    if target != word_form.name:
                        continue
                    print(word_form)
                    if target == title.name:
                        kept_title_forms.append(str(word_form))
                        continue
                    removed_forms.append(str(word_form))
                    sub_group.socket_word_forms.remove(word_form)

    kept_title_forms = sorted(kept_title_forms, key=str.lower)
    save_list_to_file(kept_title_forms, 'out/ЗС подгруппы.txt')
    removed_forms = sorted(removed_forms, key=str.lower)
    save_list_to_file(removed_forms, 'out/удалённые из БГ.txt')
    save_socket_bs_dicts_to_txt(groups, 'out/БГ 28.01.21.txt')
def get_bg_abbreviation():
    """Split visible BG word forms into abbreviations and capitalised words.

    A word form goes to ``out/Аббревиатура. БГ.txt`` when each of the first
    two characters of its asterisk-free name is in ``CYRILLIC``; otherwise it
    goes to ``out/Большая буква. БГ.txt`` when only the first character is.
    Both lists are sorted by the line with asterisks removed.

    NOTE(review): ``CYRILLIC`` is defined outside this block — presumably the
    upper-case Cyrillic letters; confirm.  An empty name passes the first
    test vacuously and is therefore filed as an abbreviation.
    """
    abbreviations = []
    capitalised = []
    for socket_group in read_src_socket_bs('src_dict/БГ 09.03.21.txt'):
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible:
                    continue
                bare_name = word_form.name.replace('*', '')
                if all(char in CYRILLIC for char in bare_name[:2]):
                    abbreviations.append(str(word_form))
                elif bare_name[0] in CYRILLIC:
                    capitalised.append(str(word_form))

    def without_stars(line):
        return line.replace('*', '')

    abbreviations.sort(key=without_stars)
    save_list_to_file(abbreviations, 'out/Аббревиатура. БГ.txt')
    capitalised.sort(key=without_stars)
    save_list_to_file(capitalised, 'out/Большая буква. БГ.txt')
def fix_adjusted_participles():
    """Restore starred lines of the 28.01 BG file from the 23.01 BG file.

    Both files are read as plain lines in cp1251.  For every 28.01 line that
    starts with ``*`` its second whitespace-separated token is compared with
    the first token of every starred 23.01 line; when the 23.01 token ends
    with it, the 28.01 line is replaced by the 23.01 line (the last match
    wins) and every match is recorded.

    Writes the recorded lines, sorted case-insensitively with asterisks
    ignored, to ``out/Адъектированные причастия.txt`` and the patched 28.01
    lines to ``out/БГ 28.01.21.txt`` (cp1251).
    """
    lines_28_01 = list(get_string_list_from_file(
        'src_dict/БГ 28.01.21 изм.txt', encoding='cp1251'))
    lines_23_01 = list(get_string_list_from_file(
        'src_dict/БГ 23.01.21.txt', encoding='cp1251'))

    # First token of every starred 23.01 line, split once up front.  A line
    # that starts with '*' always has at least one token.
    starred_23_01 = [
        (line.split()[0], line)
        for line in lines_23_01 if line.startswith('*')
    ]

    adjusted_participles_list = []
    for index, socket_string in enumerate(lines_28_01):
        if not socket_string.startswith('*'):
            continue
        tokens = socket_string.split()
        if len(tokens) < 2:
            # Nothing follows the starred token, so there is no word to
            # match; previously this raised IndexError.
            continue
        word = tokens[1]
        for first_token, replace_string in starred_23_01:
            if first_token.endswith(word):
                print(replace_string)
                lines_28_01[index] = replace_string
                adjusted_participles_list.append(replace_string)

    save_list_to_file(
        sorted(adjusted_participles_list,
               key=lambda x: x.replace('*', '').lower()),
        'out/Адъектированные причастия.txt'
    )
    save_list_to_file(lines_28_01, 'out/БГ 28.01.21.txt', encoding='cp1251')
def get_homonymous_multi_rooted():
    """List BG word forms that share a name with a multi-root word.

    The names of the multi-root words come from every non-empty cell of
    ``src_dict/Многокорневые слова.csv``.  Each visible BG word form without
    a root index (from ``src_dict/БГ 10.04.21.txt``) whose name is among them
    is reported: alone when it is its sub-group's title form, otherwise as
    ``"<word form> < <title form>"``.  The report, sorted case-insensitively
    with asterisks ignored, is written to
    ``out/Слова, омонимичные многокорневым словам.txt``.
    """
    multi_root_words = get_dicts_from_csv_file(
        'src_dict/Многокорневые слова.csv')
    # A set gives constant-time membership tests in the loop below.
    multi_root_names = set()
    for multi_root_word in multi_root_words:
        for root_index_key in list(multi_root_word):
            cell = multi_root_word[root_index_key]
            if cell:
                multi_root_names.add(get_socket_word_form(cell).name)

    socket_group_list = list(read_src_socket_bs('src_dict/БГ 10.04.21.txt'))
    homonymous_multi_rooted = []
    for socket_group in socket_group_list:
        for sub_group in socket_group.sub_groups:
            title_word_form = sub_group.title_word_form
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible or word_form.root_index:
                    continue
                if word_form.name not in multi_root_names:
                    continue
                print(word_form)
                form_line = str(word_form)
                title_line = str(title_word_form)
                if form_line == title_line:
                    homonymous_multi_rooted.append(form_line)
                else:
                    homonymous_multi_rooted.append(
                        ' < '.join([form_line, title_line]))

    homonymous_multi_rooted = sorted(
        homonymous_multi_rooted,
        key=lambda x: x.replace('*', '').strip().lower())
    save_list_to_file(homonymous_multi_rooted,
                      'out/Слова, омонимичные многокорневым словам.txt')
def get_two_in_one():
    """Split the remaining repetitions by whether they match a BS title.

    Each BS title form of ``src_dict/БС 27.02.21.txt`` is rendered as its
    non-empty ``name``, ``idf``, space-joined ``info`` and ``note`` joined by
    single spaces.  Lines of ``src_dict/Повторы ост.txt`` equal to one of
    these strings go to ``out/Повторы ост. совпадает с БС.txt``; the rest go
    to ``out/Повторы ост. не совпадает с БС.txt``.
    """
    remaining_repetitions = get_string_list_from_file('src_dict/Повторы ост.txt')
    word_forms_bases = read_src_bs('src_dict/БС 27.02.21.txt')
    bs_word_forms = [x.title_word_form for x in word_forms_bases]
    # A set gives constant-time membership tests in the loop below.
    bs_word_names = {
        ' '.join(filter(None, [
            x.name,
            x.idf,
            ' '.join(x.info),
            x.note,
        ]))
        for x in bs_word_forms
    }

    relevant = []
    not_relevant = []
    for repeat in remaining_repetitions:
        if repeat in bs_word_names:
            relevant.append(repeat)
        else:
            not_relevant.append(repeat)

    save_list_to_file(relevant, 'out/Повторы ост. совпадает с БС.txt')
    save_list_to_file(not_relevant, 'out/Повторы ост. не совпадает с БС.txt')
def get_headwords():
    """Report BS headwords that also occur as a word form of another group.

    For every title word form ("headword") of ``src_dict/БС 09.03.21.txt``
    and every other group (one whose title line differs) that has word forms,
    the headword name with asterisks removed is looked up among that group's
    word-form names.  Each hit appends a block to the report: the headword
    line, the other group's title line, the matching word forms and an empty
    separator line.  The report is written to ``out/ЗС-повторы.txt``.
    """
    word_forms_bases = list(read_src_bs('src_dict/БС 09.03.21.txt'))

    # Per-group data is the same for every headword, so it is built once
    # instead of once per (headword, group) pair.
    candidates = []
    for group in word_forms_bases:
        word_forms = group.word_forms
        if not word_forms:
            continue
        candidates.append((
            str(group.title_word_form),
            word_forms,
            {x.name for x in word_forms},
        ))

    headwords_reruns = []
    for headword in [x.title_word_form for x in word_forms_bases]:
        print(headword)
        headword_line = str(headword)
        headword_name = headword.name.replace('*', '')
        for title_line, word_forms, word_form_names in candidates:
            if title_line == headword_line:
                continue
            if headword_name not in word_form_names:
                continue
            headwords_reruns.append(headword_line)
            headwords_reruns.append(title_line)
            headwords_reruns.extend(
                str(word_form) for word_form in word_forms
                if word_form.name == headword_name)
            headwords_reruns.append('')

    save_list_to_file(headwords_reruns, 'out/ЗС-повторы.txt')
def find_extra_lines(in_file, in_dir):
    """Save the lines found in a directory's files but missing from a file.

    Args:
        in_file: Path of the reference file, read with
            ``get_string_list_from_file``.
        in_dir: Directory whose direct entries are passed to
            ``get_file_lines``.

    When at least one line from ``in_dir`` is absent from ``in_file``, those
    lines are written, in the order they were produced, to
    ``out/Лишние <stem of in_file>.txt``.  Nothing is written otherwise.
    """
    files_path = list(Path(in_dir).glob('*'))
    in_dir_lines = list(get_file_lines(files_path))
    # A set gives constant-time membership tests.
    # NOTE(review): assumes the lines are hashable (plain strings) — confirm.
    in_lines = set(get_string_list_from_file(in_file))
    extra_lines = [x for x in in_dir_lines if x not in in_lines]
    if extra_lines:
        save_list_to_file(extra_lines, f'out/Лишние {Path(in_file).stem}.txt')
def ordinary_words_bs():
    """Save the BS title lines that are on none of the exclusion lists.

    The title lines come from ``src_dict/БС 02.03.21.txt``.  A line is
    excluded when it occurs in ``src_dict/Многокорневые слова БС.txt``, in
    ``src_dict/Омонимы БС.txt``, or equals the re-rendered form
    (``str(get_bs_title_word_form(line))``) of a line from
    ``src_dict/Повторы ост. совпадает с БС.txt``.  The remaining lines keep
    their dictionary order and go to ``out/Обычные слова БС.txt``.
    """
    word_forms_bases = read_src_bs('src_dict/БС 02.03.21.txt')
    bs_word_forms = [str(x.title_word_form) for x in word_forms_bases]

    # A set gives constant-time membership tests in the filter below.
    exclusions = set()
    # Multi-root words of BS.
    exclusions.update(
        get_string_list_from_file('src_dict/Многокорневые слова БС.txt'))
    # Homonyms of BS.
    exclusions.update(get_string_list_from_file('src_dict/Омонимы БС.txt'))
    # Remaining repetitions that match BS, normalised through the parser.
    exclusions.update(
        str(get_bs_title_word_form(x))
        for x in get_string_list_from_file(
            'src_dict/Повторы ост. совпадает с БС.txt'))

    # Ordinary words of BS.
    ordinary_words_bs_list = [
        bs_str_form for bs_str_form in bs_word_forms
        if bs_str_form not in exclusions
    ]
    save_list_to_file(ordinary_words_bs_list, 'out/Обычные слова БС.txt')
def check_bg_islower():
    """Report visible BG word forms that are not fully lower-case.

    Reads ``src_dict/БГ 16.03.21 изм.txt`` and writes two lists:

    * ``out/БГ ЕСТЬ большие буквы СЛОВО.txt`` — word forms whose ``name``
      fails ``str.islower()``;
    * ``out/БГ ЕСТЬ большие буквы ПРИМЕЧАНИЯ.txt`` — word forms whose joined
      ``note``, ``etml_note`` and ``spec_note`` contain a character from
      ``CYRILLIC``.

    NOTE(review): ``CYRILLIC`` is defined outside this block — presumably the
    upper-case Cyrillic letters; confirm.
    """
    names_with_capitals = []
    notes_with_capitals = []
    for socket_group in read_src_socket_bs('src_dict/БГ 16.03.21 изм.txt'):
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible:
                    continue

                if not word_form.name.islower():
                    print(word_form)
                    names_with_capitals.append(str(word_form))

                notes = [
                    word_form.note,
                    word_form.etml_note,
                    word_form.spec_note,
                ]
                note_form = ' '.join(note for note in notes if note)
                if any(char in CYRILLIC for char in note_form):
                    print(note_form)
                    notes_with_capitals.append(str(word_form))

    save_list_to_file(names_with_capitals,
                      'out/БГ ЕСТЬ большие буквы СЛОВО.txt')
    save_list_to_file(notes_with_capitals,
                      'out/БГ ЕСТЬ большие буквы ПРИМЕЧАНИЯ.txt')
def find_all_multi_rooted_words_from_bs():
    """
    Step 15. Find in the document БС 06.04.21.txt all words (group title
    words and singletons) from the document Многокорневые слова.xlsx and
    build a list of lines with such words — the document
    Многокорневые слова БС.txt.
    Taking into account items 1 and 2 of the BG/BS correspondence rules,
    compare every line of Многокорневые слова БС.txt with every word cell
    of Многокорневые слова.xlsx.

    Implementation: every non-empty cell of ``out/Многокорневые слова.csv``
    except the first column is parsed and rendered as name, idf, info and
    note (with ``'* '`` removed) joined by spaces.  A BS title form is
    rendered the same way — its note has ``'.* '`` removed and is omitted
    when it contains ``'<'`` — and is kept when the rendering equals one of
    the cell renderings.  The kept title lines, sorted case-insensitively
    with asterisks ignored, go to ``out/Многокорневые слова БС.txt``.
    """
    multi_root_words = get_dicts_from_csv_file(
        'out/Многокорневые слова.csv')
    word_forms_bases = list(read_src_bs('src_dict/БС 06.04.21.txt'))

    # A set gives constant-time membership tests in the second loop.
    multi_root_bg_forms = set()
    for multi_root_word in multi_root_words:
        # The first column is skipped.
        for root_index_key in list(multi_root_word)[1:]:
            cell = multi_root_word[root_index_key]
            if not cell:
                continue
            socket_form = get_socket_word_form(cell)
            multi_root_bg_forms.add(
                ' '.join(filter(
                    None,
                    [
                        socket_form.name,
                        socket_form.idf,
                        ' '.join(socket_form.info),
                        socket_form.note.replace('* ', ''),
                    ])))

    multi_root_bs_forms = []
    for group_word_form in word_forms_bases:
        title_form = group_word_form.title_word_form
        src_title_form = ' '.join(filter(
            None,
            [
                title_form.name,
                title_form.idf,
                ' '.join(title_form.info),
                (title_form.note.replace('.* ', '')
                 if '<' not in title_form.note else None),
            ]))
        if src_title_form in multi_root_bg_forms:
            print(title_form)
            multi_root_bs_forms.append(str(title_form))

    multi_root_bs_forms = sorted(
        multi_root_bs_forms,
        key=lambda x: x.replace('*', '').lower().strip()
    )
    save_list_to_file(multi_root_bs_forms, 'out/Многокорневые слова БС.txt')
def change_case():
    """Write lower-cased copies of every file in ``src_dict/lst``.

    Each entry of ``src_dict/lst`` is read in cp1251, empty lines are
    dropped, the remaining lines are lower-cased and the result is saved in
    cp1251 as ``out/lst/<stem>.txt``.
    """
    for file_path in Path('src_dict/lst').glob('*'):
        source_words = get_string_list_from_file(file_path, encoding='cp1251')
        lower_words = [word.lower() for word in source_words if word]
        save_list_to_file(
            lower_words,
            f'out/lst/{file_path.stem}.txt',
            encoding='cp1251',
        )
def get_homonyms_bg():
    """Collect BG homonyms and the homonyms that occur only once.

    Works on ``src_dict/БГ 16.02.21.txt``.  The "raw name" of a word form is
    its name with asterisks removed and surrounding whitespace stripped.  A
    visible word form without a root index is a homonym when its raw name
    occurs more than once among all such word forms of the dictionary but
    exactly once among the visible word forms of its own group.  It is
    reported alone when it is its sub-group's title form, otherwise as
    ``"<word form> < <title form>"``.

    The sorted report goes to ``out/Омонимы БГ.txt``; the report lines whose
    first token is unique within the report go to
    ``out/Единичные омонимы.txt``.
    """
    socket_group_list = list(read_src_socket_bs(
        'src_dict/БГ 16.02.21.txt'))

    def raw(word_form):
        return word_form.name.replace('*', '').strip()

    # Count raw names over the whole dictionary.
    name_counts = Counter()
    for socket_group in socket_group_list:
        for sub_group in socket_group.sub_groups:
            for word_form in sub_group.socket_word_forms:
                if not word_form.invisible and not word_form.root_index:
                    name_counts[raw(word_form)] += 1
    repeated_names = {
        name for name, count in name_counts.items() if count > 1}

    homonyms = []
    for socket_group in socket_group_list:
        # Counter replaces a list.count() call per word form.
        group_counts = Counter(
            raw(x) for x in socket_group.socket_word_forms
            if not x.invisible)
        for sub_group in socket_group.sub_groups:
            title_word_form = sub_group.title_word_form
            for word_form in sub_group.socket_word_forms:
                if word_form.invisible or word_form.root_index:
                    continue
                raw_name = raw(word_form)
                if group_counts[raw_name] != 1:
                    continue
                if raw_name not in repeated_names:
                    continue
                form_line = str(word_form)
                title_line = str(title_word_form)
                if form_line == title_line:
                    homonyms.append(form_line)
                else:
                    homonyms.append(' < '.join([form_line, title_line]))

    sort_homonyms = sorted(homonyms,
                           key=lambda x: x.replace('*', '').strip().lower())
    save_list_to_file(sort_homonyms, 'out/Омонимы БГ.txt')

    first_token_counts = Counter(x.split()[0] for x in sort_homonyms)
    loner_homonyms = [x for x in sort_homonyms
                      if first_token_counts[x.split()[0]] == 1]
    save_list_to_file(loner_homonyms, 'out/Единичные омонимы.txt')
def get_remaining_homonyms():
    """Merge two BS homonym lists into one sorted file.

    Concatenates the lines of
    ``src_dict/О-мы БС спец. прим. не совпадают с Повторами.txt`` and
    ``src_dict/О-мы БС не совпадают с Повторами.txt``, sorts them
    case-insensitively with asterisks and surrounding whitespace ignored, and
    writes the result to ``out/О-мы БС ост.txt``.
    """
    merged = [
        *get_string_list_from_file(
            'src_dict/О-мы БС спец. прим. не совпадают с Повторами.txt'),
        *get_string_list_from_file(
            'src_dict/О-мы БС не совпадают с Повторами.txt'),
    ]

    def sort_key(line):
        return line.replace('*', '').strip().lower()

    merged.sort(key=sort_key)
    save_list_to_file(merged, 'out/О-мы БС ост.txt')
def compare_homonyms_spec_note():
    """Split BS special-note homonyms by whether they match a BG homonym.

    Every line of ``src_dict/Омонимы БГ.txt`` is parsed; its ``spec_note``
    (with ``'< '`` removed) is parsed again as a word form and re-rendered
    from its non-empty fields, and the line is rendered as name, idf, info
    and that rendered note joined by spaces.  Every line of
    ``src_dict/О-мы БС спец. прим. не совпадают с Повторами.txt`` is
    rendered as name, idf, info and note (with ``'.* < '`` removed).  Lines
    whose rendering equals a BG rendering go to
    ``out/О-мы БС спец. прим. совпадают с О-мами БГ.txt``; the rest go to
    ``out/О-мы БС спец. прим. не совпадают с О-мами БГ.txt``.
    """
    homonyms_bg = get_string_list_from_file('src_dict/Омонимы БГ.txt')
    # A set gives constant-time membership tests in the second loop.
    homonyms_bg_str_form = set()
    for homonyms in homonyms_bg:
        socket_form = get_socket_word_form(homonyms)
        spec_note_socket_form = get_socket_word_form(
            socket_form.spec_note.replace('< ', ''))
        spec_note = ' '.join(
            filter(None, [
                spec_note_socket_form.invisible,
                spec_note_socket_form.name,
                spec_note_socket_form.root_index,
                spec_note_socket_form.idf,
                ' '.join(spec_note_socket_form.info),
                spec_note_socket_form.note,
            ]))
        homonyms_bg_str_form.add(' '.join(
            filter(None, [
                socket_form.name,
                socket_form.idf,
                ' '.join(socket_form.info),
                spec_note,
            ])))

    homonyms_spec_note = get_string_list_from_file(
        'src_dict/О-мы БС спец. прим. не совпадают с Повторами.txt')
    homonyms_spec_note_relevant = []
    homonyms_spec_note_not_relevant = []
    for homonym in homonyms_spec_note:
        title_form = get_bs_title_word_form(homonym)
        print(title_form.name, title_form.note)
        string_form = ' '.join(
            filter(None, [
                title_form.name,
                title_form.idf,
                ' '.join(title_form.info),
                title_form.note.replace('.* < ', ''),
            ]))
        if string_form in homonyms_bg_str_form:
            homonyms_spec_note_relevant.append(homonym)
        else:
            homonyms_spec_note_not_relevant.append(homonym)

    save_list_to_file(homonyms_spec_note_relevant,
                      'out/О-мы БС спец. прим. совпадают с О-мами БГ.txt')
    save_list_to_file(homonyms_spec_note_not_relevant,
                      'out/О-мы БС спец. прим. не совпадают с О-мами БГ.txt')
def get_spec_note():
    """Save BS title forms whose special note refers to a single word.

    A title form of ``src_dict/БС 23.02.21.txt`` is kept when its note starts
    with ``'.* <'`` and has no space from index 5 onwards.  The kept lines
    are written to ``out/Спец. прим. БС. 1 слово.txt``.
    """
    groups = list(read_src_bs('src_dict/БС 23.02.21.txt'))
    titles = [group.title_word_form for group in groups]

    single_word_notes = []
    for title in titles:
        note = title.note
        if not note or not note.startswith('.* <'):
            continue
        if ' ' in note[5:]:
            continue
        single_word_notes.append(str(title))

    save_list_to_file(single_word_notes, 'out/Спец. прим. БС. 1 слово.txt')
def headwords_test():
    """Drop headword groups whose first two lines are identical.

    Reads ``out/ЗС-повторы_001.txt`` (UTF-8), splits it into groups on two
    consecutive empty lines (``'\\n\\n\\n'``) and keeps a group when its
    first and second lines differ.  Every kept group is saved with a
    trailing newline to ``out/ЗС-повторы.txt``.

    A group consisting of a single line has no second line to compare with:
    it is kept unless it is blank.  Previously such a group — including the
    one produced by an empty input file — raised IndexError.
    """
    with open(Path('out/ЗС-повторы_001.txt'), encoding='utf-8') as f_in:
        headwords_string = f_in.read()

    headwords_reruns = []
    for group in headwords_string.split('\n\n\n'):
        lines = group.split('\n')
        if len(lines) < 2:
            if group.strip():
                headwords_reruns.append(group + '\n')
            continue
        if lines[0] != lines[1]:
            headwords_reruns.append(group + '\n')

    save_list_to_file(headwords_reruns, 'out/ЗС-повторы.txt')
def get_bg_note():
    """Save the BG word forms that carry a note, skipping each first form.

    In every sub-group of ``src_dict/БГ 17.02.21.txt`` the first word form is
    skipped; every remaining word form with a non-empty ``note`` is printed
    and collected.  The collected lines are written, in dictionary order, to
    ``out/Пояснительные примечания.txt``.

    NOTE: another function named ``get_bg_note`` is defined later in this
    file; when both live in the same module the later definition replaces
    this one.
    """
    noted_forms = []
    for socket_group in read_src_socket_bs('src_dict/БГ 17.02.21.txt'):
        for sub_group in socket_group.sub_groups:
            following_forms = sub_group.socket_word_forms[1:]
            for word_form in following_forms:
                if not word_form.note:
                    continue
                print(word_form)
                noted_forms.append(str(word_form))
    save_list_to_file(noted_forms, 'out/Пояснительные примечания.txt')
def get_adjusted_participles_bs():
    """Save the BS title forms whose name starts with an asterisk.

    Reads ``src_dict/БС 28.01.21.txt``, prints every matching title form and
    writes the matching lines, in dictionary order, to
    ``out/Адъектированные причастия БС.txt``.
    """
    starred_titles = []
    for group in read_src_bs('src_dict/БС 28.01.21.txt'):
        title = group.title_word_form
        if not title.name.startswith('*'):
            continue
        print(title)
        starred_titles.append(str(title))
    save_list_to_file(starred_titles,
                      'out/Адъектированные причастия БС.txt')
def get_bs_names():
    """Save the distinct word-form names of the BS dictionary.

    Collects the ``name`` (asterisks removed) of every word form of every
    group of ``src_dict/БС 09.03.21.txt`` that has word forms, removes
    duplicates and writes the names, sorted case-insensitively, to
    ``out/bs_names.txt``.
    """
    unique_names = set()
    for group in read_src_bs('src_dict/БС 09.03.21.txt'):
        word_forms = group.word_forms
        if word_forms:
            unique_names.update(x.name.replace('*', '') for x in word_forms)

    # Sorting a set by str.lower alone leaves names that differ only in case
    # in set-iteration order, which varies between runs because of string
    # hash randomisation.  The name itself breaks such ties deterministically.
    bs_names = sorted(unique_names, key=lambda name: (name.lower(), name))
    save_list_to_file(bs_names, 'out/bs_names.txt')
def check_g58():
    """Save the lines of the Г58 list whose name has none of the endings.

    Every line of ``src_dict/Г58 ещё.txt`` is parsed as a BS title word form.
    Forms whose ``name`` does not end with one of the listed endings are
    printed, and their lines are written sorted to ``out/Г58 ещё изм.txt``.
    """
    endings = ('греть', 'мять', 'оть', 'ыть', 'пеленать')
    source_lines = get_string_list_from_file('src_dict/Г58 ещё.txt')
    # Parse everything first, as before, so parsing precedes any printing.
    parsed_forms = [get_bs_title_word_form(line) for line in source_lines]

    rejected = []
    for parsed in parsed_forms:
        if parsed.name.endswith(endings):
            continue
        print(parsed)
        rejected.append(str(parsed))

    rejected.sort()
    save_list_to_file(rejected, 'out/Г58 ещё изм.txt')
def get_no_full_form():
    """Save BS title forms selected by their ``idf`` and first ``info`` item.

    A title form of ``src_dict/БС 24.03.21.txt`` is kept when its ``info`` is
    non-empty, its ``idf`` starts with ``'.П'`` and the first ``info`` item
    starts with ``'К'``, ``'С'`` or ``'П'``.  Kept forms are printed and
    their lines written to ``out/НЕТ полной формы.txt``.
    """
    first_info_prefixes = ('К', 'С', 'П')
    selected = []
    for group in read_src_bs('src_dict/БС 24.03.21.txt'):
        title = group.title_word_form
        info = title.info
        is_match = (
            info
            and title.idf.startswith('.П')
            and info[0].startswith(first_info_prefixes)
        )
        if not is_match:
            continue
        print(title)
        selected.append(str(title))
    save_list_to_file(selected, 'out/НЕТ полной формы.txt')
def get_bg_note():
    """Save the sorted BG word forms that carry a note, skipping first forms.

    In every sub-group of ``src_dict/БГ 23.02.21.txt`` the first word form is
    skipped; every remaining word form with a non-empty ``note`` is printed
    and collected.  The collected lines are sorted case-insensitively with
    asterisks and surrounding whitespace ignored and written to
    ``out/Пояснительные примечания БГ, не-ЗС.txt``.

    NOTE: an earlier function in this file has the same name; when both live
    in the same module this definition replaces it.
    """
    noted_forms = []
    for socket_group in read_src_socket_bs('src_dict/БГ 23.02.21.txt'):
        for sub_group in socket_group.sub_groups:
            following_forms = sub_group.socket_word_forms[1:]
            for word_form in following_forms:
                if not word_form.note:
                    continue
                print(word_form)
                noted_forms.append(str(word_form))

    def sort_key(line):
        return line.replace('*', '').strip().lower()

    noted_forms.sort(key=sort_key)
    save_list_to_file(noted_forms,
                      'out/Пояснительные примечания БГ, не-ЗС.txt')
def get_replays_in_groups():
    """Save the distinct non-first lines of the groups of repetitions.

    Reads ``src_dict/Повторы в группах.txt`` (UTF-8) and splits it into
    groups on empty lines.  From every stripped group all lines except the
    first are taken, lines starting with ``'!'`` are dropped and duplicates
    are removed.  The result, sorted case-insensitively with asterisks and
    surrounding whitespace ignored, is written to
    ``out/Повторы в группах (без повторов).txt``.
    """
    with open('src_dict/Повторы в группах.txt', encoding='utf-8') as f_in:
        text = f_in.read()

    unique_lines = set()
    for group in text.split('\n\n'):
        for line in group.strip().split('\n')[1:]:
            if not line.startswith('!'):
                unique_lines.add(line)

    # Lines that differ only in asterisks, case or outer whitespace share a
    # sort key; without a tie-break their order would follow set iteration
    # order, which varies between runs because of string hash randomisation.
    replays_in_groups = sorted(
        unique_lines,
        key=lambda x: (x.replace('*', '').strip().lower(), x))
    save_list_to_file(replays_in_groups,
                      'out/Повторы в группах (без повторов).txt')