def link_form(form, query_voc, main_civil):
    """Merge the items stored under *main_civil* into the sets for *form*.

    *form* is passed through ``civilrus_convert`` and lower-cased. The
    result, the result without 'ъ', and the result without both 'ъ' and
    'ь' are each used as a key of *query_voc* (spellings that coincide are
    handled once). Every such key receives all items currently stored
    under *main_civil*.

    Args:
        form: word form to be linked.
        query_voc: mapping from spelling to a mutable set of items; it is
            modified in place.
        main_civil: key of *query_voc* whose items are copied.

    Returns:
        None.
    """
    civil = civilrus_convert(form).lower()
    civil_without_er = civil.replace('ъ', '')
    civil_without_erj = civil_without_er.replace('ь', '')

    # ``set.union`` builds a new set and leaves the receiver untouched, so
    # calling it for its side effect silently did nothing. ``update``
    # performs the intended in-place merge.
    if civil:
        query_voc[civil].update(query_voc[main_civil])
    if civil_without_er != civil:
        query_voc[civil_without_er].update(query_voc[main_civil])
    if civil_without_erj != civil and civil_without_erj != civil_without_er:
        query_voc[civil_without_erj].update(query_voc[main_civil])
def add_form(form, entry, data):
    """Append *entry* to ``data`` under every spelling derived from *form*.

    The spellings are: the lower-cased ``civilrus_convert`` result, the
    same string with 'ъ' removed, and the same string with both 'ъ' and
    'ь' removed. Spellings that coincide receive *entry* only once, and
    nothing is added when the converted form is empty.
    """
    base = civilrus_convert(form).lower()
    if not base:
        # An empty spelling yields empty variants as well: nothing to add.
        return

    no_er = base.replace('ъ', '')
    no_er_erj = no_er.replace('ь', '')

    # Collect distinct keys, keeping the order base -> no 'ъ' -> no 'ъ'/'ь'.
    keys = [base]
    for variant in (no_er, no_er_erj):
        if variant not in keys:
            keys.append(variant)

    for key in keys:
        data[key].append(entry)
def get_reference_hint(wordform, lexeme, without_translit=True,
                       with_ref=True):
    """Build the hint dictionary for a reference word form.

    Args:
        wordform: the word form; it is passed through ``resolve_titles``
            before any conversion.
        lexeme: either an ``Entry`` instance or a mapping that has a
            'referenced_lexemes' sequence.
        without_translit: when false, the ``civilrus_convert`` result is
            stored under ``KEY_CIVIL``.
        with_ref: when true, a hint for the referenced lexeme is stored
            under ``KEY_REFEREE``.

    Returns:
        dict with ``KEY_ENTRY`` and, depending on the flags, ``KEY_CIVIL``
        and ``KEY_REFEREE``.
    """
    wordform = resolve_titles(wordform)
    hint = {KEY_ENTRY: ucs_convert(wordform)}

    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(wordform)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        hint[KEY_REFEREE] = get_hint(lexeme)
        return hint

    targets = lexeme['referenced_lexemes']
    referee_hint = get_hint(targets[0])
    has_several = len(targets) > 1
    if has_several and all(t.homonym_order for t in targets):
        # Several referenced lexemes, each with a homonym order: list all
        # the orders, separated by a comma and a no-break space.
        orders = [str(t.homonym_order) for t in targets if t]
        referee_hint[KEY_HOMONYM_ORDER] = u',\u00a0'.join(orders)
    hint[KEY_REFEREE] = referee_hint
    return hint
def get_reference_hint(wordform, lexeme, without_translit=True,
                       with_ref=True, with_rnc=False):
    """Build the hint dictionary for a reference word form.

    Args:
        wordform: the word form; it is passed through ``resolve_titles``
            before any conversion.
        lexeme: either an ``Entry`` instance or a mapping that has a
            'referenced_lexemes' sequence.
        without_translit: when false, the ``civilrus_convert`` result is
            stored under ``KEY_CIVIL``.
        with_ref: when true, a hint for the referenced lexeme is stored
            under ``KEY_REFEREE``.
        with_rnc: forwarded unchanged to ``get_hint``.

    Returns:
        dict with ``KEY_ENTRY`` and, depending on the flags, ``KEY_CIVIL``
        and ``KEY_REFEREE``.
    """
    resolved = resolve_titles(wordform)
    hint = {KEY_ENTRY: ucs_convert(resolved)}

    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(resolved)

    if with_ref:
        if isinstance(lexeme, Entry):
            referee = get_hint(lexeme, with_rnc=with_rnc)
        else:
            lexemes = lexeme['referenced_lexemes']
            # The hint is built from the first referenced lexeme.
            referee = get_hint(lexemes[0], with_rnc=with_rnc)
            if len(lexemes) > 1 and all(x.homonym_order for x in lexemes):
                # All homonym orders, joined by a comma and a no-break space.
                numbers = (str(x.homonym_order) for x in lexemes if x)
                referee[KEY_HOMONYM_ORDER] = ',\u00a0'.join(numbers)
        hint[KEY_REFEREE] = referee

    return hint
def write_collection(collection, filename):
    """Write the items of *collection* to a text file, one item per line.

    The file is created under the directory given by the ``path`` name
    taken from the enclosing scope. Each item is written as ``str(item)``;
    the items are ordered by ``civilrus_convert(str(item))``.
    """
    def sort_key(item):
        return civilrus_convert(str(item))

    target = os.path.join(path, filename)
    # NOTE(review): no explicit encoding is given, so the platform default
    # text encoding is used for the output file.
    with open(target, 'w') as out:
        for item in sorted(collection, key=sort_key):
            out.write(str(item) + '\n')
def __lt__(self, other):
    """Compare two objects by the ``civilrus_convert`` value of ``lex``."""
    own_key = civilrus_convert(self.lex)
    other_key = civilrus_convert(other.lex)
    return own_key < other_key
with open(os.path.join(path, 'test_stat.txt'), 'w') as f: f.write('\n') for line in lines: print(line) f.write(line + '\n') with open(os.path.join(path, 'vocabulary.csv'), 'w') as f: f.write(','.join('"%s"' % title for title in [ 'Заглавное слово', 'Гражданское написание', 'Список словоформ', 'Запрос для АнтКонка', 'Авторы', 'Комментарий к статье', 'Номер омонима', 'Смыслоразличительный ярлык омонима', 'Является ли дубликатом' ])) f.write('\n') last_homonym_number = 0 _lemmas = list(sorted(lemmas, key=lambda L: civilrus_convert(L.lex))) N = len(_lemmas) for i, lex in enumerate(_lemmas): lemma = lex.lex civil = civilrus_convert(lemma) next_civil = '' if i + 1 == N else civilrus_convert(_lemmas[i + 1].lex) wordforms = get_wordforms(lex) query = get_query(lex) author = '' comment = lex.pos if civil == next_civil: last_homonym_number += 1 homonym_number = last_homonym_number else: if last_homonym_number > 0: last_homonym_number += 1
return instance if not entries3: note = 'В словарной базе нету статей, удовлетворящих условиям выгрузки\n' note += 'Ни одной статьи и ни одного типа индекса выгружено не будет.\n' sys.stderr.write(note) print(SHOW_CURSOR, file=sys.stderr) sys.exit(0) # Объединение статей по начальным буквам letter_parts = [] part_entries = [] first_letter = entries3[0][0].lstrip(' =*')[0] civil_letter = civilrus_convert(first_letter.lower()) csl_letter = first_letter.upper() syn_letters = [csl_letter] entries3_n = len(entries3) it = enumerate(itertools.groupby(entries3, lambda x: x[0])) for j, (wordform, group) in it: note = 'Группировка статей по начальным буквам [ %s%% ]%s\r' % ( int(round(j / entries3_n * 100)), ERASE_LINEEND) sys.stderr.write(note) lst = list(group) first_letter = wordform.lstrip(' =*')[0].lower() csl_letter = first_letter.upper() if civilrus_convert(first_letter) != civil_letter: syn_letters.sort(key=sort_key2) letter_parts.append((civil_letter, syn_letters, part_entries)) part_entries = []
file=sys.stderr) print(file=sys.stderr) return entries voc = load_data(filepath) entries = get_entries_index() #write_diff(voc, entries) query_voc = get_query_voc(voc, entries) orphans = [] N = Entry.objects.count() for i, entry in enumerate(Entry.objects.all()): if not entry.antconc_query.strip(): for form in get_forms(entry): civil = re.sub('[ъь]', '', civilrus_convert(form).strip()) if civil in query_voc: orterms = [] for item in query_voc[civil]: orterms.extend(get_query_orterms(item)) value = make_query_from_orterms(orterms) if value: entry.antconc_query = value entry.save() break else: orphans.append(entry) print('Patching articles with no query [ %i%% ]' % ((i + 1) / N * 100), end='\r', file=sys.stderr) print(file=sys.stderr)
def in_output_volumes(wordform):
    """Tell whether *wordform* starts with one of the output-volume letters.

    Surrounding spaces and asterisks are stripped, then the word form is
    passed through ``resolve_titles`` and ``civilrus_convert``. The
    lower-cased first character of the result (or the empty string when
    the result is empty) is looked up in ``OUTPUT_VOLUMES_LETTERS``.
    """
    stripped = wordform.strip(' *')
    civil = civilrus_convert(resolve_titles(stripped))
    # Slicing rather than indexing keeps an empty result from raising.
    initial = civil[:1].lower()
    return initial in OUTPUT_VOLUMES_LETTERS