Ejemplo n.º 1
0
def link_form(form, query_voc, main_civil):
    """Merge the items stored under ``main_civil`` into the keys for ``form``.

    Up to three keys are derived from ``form``:

    * its lower-cased ``civilrus_convert`` spelling (skipped when empty);
    * that spelling with every 'ъ' removed (only when it differs);
    * that spelling with every 'ъ' and 'ь' removed (only when it differs
      from both previous keys).

    For each such key, ``query_voc[key]`` is extended in place with the
    contents of ``query_voc[main_civil]``.

    ``query_voc`` is assumed to map strings to mutable sets (the values are
    used through the set API) -- TODO confirm against the caller that builds
    it.  Nothing is returned.
    """
    civil = civilrus_convert(form).lower()
    civil_without_er = civil.replace('ъ', '')
    civil_without_erj = civil_without_er.replace('ь', '')
    # set.union() builds a *new* set and leaves the receiver untouched, so
    # calling it and dropping the result merged nothing.  update() performs
    # the intended in-place merge.
    if civil:
        query_voc[civil].update(query_voc[main_civil])
    if civil_without_er != civil:
        query_voc[civil_without_er].update(query_voc[main_civil])
    if civil_without_erj != civil and civil_without_erj != civil_without_er:
        query_voc[civil_without_erj].update(query_voc[main_civil])
Ejemplo n.º 2
0
def add_form(form, entry, data):
    """Append ``entry`` to ``data`` under the civil spellings of ``form``.

    The candidate keys are, in order: the lower-cased ``civilrus_convert``
    spelling of ``form`` (used only when non-empty), the same spelling with
    'ъ' removed, and the same spelling with both 'ъ' and 'ь' removed.  A
    reduced spelling is used only when it differs from the spellings before
    it, so ``entry`` is appended at most once per distinct key.

    ``data[key]`` must support ``append`` for every key used.
    """
    base = civilrus_convert(form).lower()
    no_hard_sign = base.replace('ъ', '')
    no_signs = no_hard_sign.replace('ь', '')

    # Collect the keys first, then register the entry under each of them.
    keys = [base] if base else []
    if no_hard_sign != base:
        keys.append(no_hard_sign)
    if no_signs not in (base, no_hard_sign):
        keys.append(no_signs)

    for key in keys:
        data[key].append(entry)
Ejemplo n.º 3
0
def get_reference_hint(wordform, lexeme, without_translit=True, with_ref=True):
    """Build a hint dictionary for a word form that refers to ``lexeme``.

    The result always holds ``KEY_ENTRY`` (``ucs_convert`` of the word form
    after ``resolve_titles``).  ``KEY_CIVIL`` is added when
    ``without_translit`` is false.  When ``with_ref`` is true, ``KEY_REFEREE``
    is added as well:

    * if ``lexeme`` is an ``Entry``, it is ``get_hint(lexeme)``;
    * otherwise ``lexeme`` is indexed with ``'referenced_lexemes'`` and the
      hint of the first referenced lexeme is used (the sequence must not be
      empty).  If there are several referenced lexemes and each has a truthy
      ``homonym_order``, their orders are joined with a comma followed by a
      no-break space and stored under ``KEY_HOMONYM_ORDER`` of that hint.
    """
    resolved = resolve_titles(wordform)
    hint = {KEY_ENTRY: ucs_convert(resolved)}
    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(resolved)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        hint[KEY_REFEREE] = get_hint(lexeme)
        return hint

    targets = lexeme['referenced_lexemes']
    referee = get_hint(targets[0])
    several = len(targets) > 1
    if several and all(t.homonym_order for t in targets):
        orders = [str(t.homonym_order) for t in targets if t]
        referee[KEY_HOMONYM_ORDER] = u',\u00a0'.join(orders)
    hint[KEY_REFEREE] = referee
    return hint
Ejemplo n.º 4
0
def get_reference_hint(wordform, lexeme, without_translit=True,
                       with_ref=True, with_rnc=False):
    """Build a hint dictionary for a word form that refers to ``lexeme``.

    The result always holds ``KEY_ENTRY`` (``ucs_convert`` of the word form
    after ``resolve_titles``).  ``KEY_CIVIL`` is added when
    ``without_translit`` is false.  When ``with_ref`` is true, ``KEY_REFEREE``
    is added as well; ``with_rnc`` is forwarded unchanged to ``get_hint``:

    * if ``lexeme`` is an ``Entry``, the referee is its own hint;
    * otherwise ``lexeme`` is indexed with ``'referenced_lexemes'`` and the
      hint of the first referenced lexeme is used (the sequence must not be
      empty).  If there are several referenced lexemes and each has a truthy
      ``homonym_order``, their orders are joined with a comma followed by a
      no-break space and stored under ``KEY_HOMONYM_ORDER`` of that hint.
    """
    resolved = resolve_titles(wordform)
    hint = {KEY_ENTRY: ucs_convert(resolved)}
    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(resolved)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        hint[KEY_REFEREE] = get_hint(lexeme, with_rnc=with_rnc)
        return hint

    targets = lexeme['referenced_lexemes']
    referee = get_hint(targets[0], with_rnc=with_rnc)
    several = len(targets) > 1
    if several and all(t.homonym_order for t in targets):
        orders = [str(t.homonym_order) for t in targets if t]
        referee[KEY_HOMONYM_ORDER] = ',\u00a0'.join(orders)
    hint[KEY_REFEREE] = referee
    return hint
Ejemplo n.º 5
0
def write_collection(collection, filename):
    """Write the items of ``collection`` to ``filename``, one per line.

    The file is created (or truncated) inside the module-level ``path``
    directory and opened in text mode with the default encoding.  Items are
    written as ``str(item)`` and ordered by ``civilrus_convert(str(item))``.
    """
    def civil_key(item):
        return civilrus_convert(str(item))

    target = os.path.join(path, filename)
    with open(target, 'w') as out:
        ordered = sorted(collection, key=civil_key)
        for item in ordered:
            out.write(str(item))
            out.write('\n')
Ejemplo n.º 6
0
 def __lt__(self, other):
     """Order objects by the ``civilrus_convert`` form of their ``lex``."""
     own_civil = civilrus_convert(self.lex)
     other_civil = civilrus_convert(other.lex)
     return own_civil < other_civil
Ejemplo n.º 7
0
# Save the report lines to test_stat.txt (preceded by one empty line) and
# echo each of them to stdout as it is written.
with open(os.path.join(path, 'test_stat.txt'), 'w') as f:
    f.write('\n')
    for line in lines:
        print(line)
        f.write(line)
        f.write('\n')

# Export the lemmas to vocabulary.csv.  Only the header and the first part of
# the per-lemma loop are visible here; the code that finishes each row
# continues past this excerpt.
with open(os.path.join(path, 'vocabulary.csv'), 'w') as f:
    # Header row: each title is wrapped in double quotes and the titles are
    # joined with commas.  In English the columns are: headword, civil
    # spelling, list of word forms, AntConc query, authors, entry comment,
    # homonym number, homonym disambiguation label, is-duplicate flag.
    f.write(','.join('"%s"' % title for title in [
        'Заглавное слово', 'Гражданское написание', 'Список словоформ',
        'Запрос для АнтКонка', 'Авторы', 'Комментарий к статье',
        'Номер омонима', 'Смыслоразличительный ярлык омонима',
        'Является ли дубликатом'
    ]))
    f.write('\n')
    # Counter used to number lemmas that share a civil spelling with a
    # neighbour; it starts at 0.  How it is reset is not visible here.
    last_homonym_number = 0
    # Lemmas sorted by the civil spelling of their `lex`, so lemmas with the
    # same civil spelling end up adjacent.
    _lemmas = list(sorted(lemmas, key=lambda L: civilrus_convert(L.lex)))
    N = len(_lemmas)
    for i, lex in enumerate(_lemmas):
        lemma = lex.lex
        civil = civilrus_convert(lemma)
        # Civil spelling of the following lemma; '' for the last one.
        next_civil = '' if i + 1 == N else civilrus_convert(_lemmas[i + 1].lex)
        wordforms = get_wordforms(lex)
        query = get_query(lex)
        author = ''
        # presumably the value of the "entry comment" column -- its use is
        # outside this excerpt; verify
        comment = lex.pos
        if civil == next_civil:
            # The next lemma has the same civil spelling: advance the counter
            # and use it as this lemma's homonym number.
            last_homonym_number += 1
            homonym_number = last_homonym_number
        else:
            # The next lemma differs.  A positive counter means the previous
            # lemma matched this one, so the counter is advanced once more.
            if last_homonym_number > 0:
                last_homonym_number += 1
Ejemplo n.º 8
0
        return instance


# Nothing to export: report it on stderr, print SHOW_CURSOR (presumably the
# terminal sequence that makes the cursor visible again -- verify) and leave
# with exit status 0.
if not entries3:
    note = (
        'В словарной базе нету статей, удовлетворящих условиям выгрузки\n'
        'Ни одной статьи и ни одного типа индекса выгружено не будет.\n'
    )
    sys.stderr.write(note)
    print(SHOW_CURSOR, file=sys.stderr)
    sys.exit(0)


# Grouping entries by their initial letters.
# Only the beginning of the loop is visible in this excerpt.
# Finished groups, stored as (civil_letter, syn_letters, part_entries) tuples.
letter_parts = []
# Accumulator for the letter group currently being built.
part_entries = []
# First character of the first entry's word form, ignoring leading spaces,
# '=' and '*' characters.
first_letter = entries3[0][0].lstrip(' =*')[0]
civil_letter = civilrus_convert(first_letter.lower())
csl_letter = first_letter.upper()
# presumably the upper-case letters that map to the same civil letter --
# verify against the rest of the loop
syn_letters = [csl_letter]
entries3_n = len(entries3)
# groupby() yields runs of consecutive entries whose first element (the word
# form) is equal; j is the index of the run.
it = enumerate(itertools.groupby(entries3, lambda x: x[0]))
for j, (wordform, group) in it:
    # Progress indicator on stderr; it ends with '\r', so the next message
    # starts at the beginning of the same line.
    # NOTE(review): j counts groups while entries3_n counts entries, so the
    # percentage may stay below 100 when groups hold several entries.
    note = 'Группировка статей по начальным буквам [ %s%% ]%s\r' % (
            int(round(j / entries3_n * 100)), ERASE_LINEEND)
    sys.stderr.write(note)
    # Materialize the group before the groupby iterator advances.
    lst = list(group)
    # NOTE(review): raises IndexError if the word form is empty after
    # stripping ' ', '=' and '*'.
    first_letter = wordform.lstrip(' =*')[0].lower()
    csl_letter = first_letter.upper()
    if civilrus_convert(first_letter) != civil_letter:
        # The civil letter changed: close the group collected so far and
        # start a new accumulator.
        syn_letters.sort(key=sort_key2)
        letter_parts.append((civil_letter, syn_letters, part_entries))
        part_entries = []
Ejemplo n.º 9
0
              file=sys.stderr)
    print(file=sys.stderr)
    return entries


# Load the vocabulary and the entries index, then build query_voc from them.
# The loop below looks word forms up in query_voc by civil spelling.
voc = load_data(filepath)
entries = get_entries_index()
#write_diff(voc, entries)
query_voc = get_query_voc(voc, entries)

# Entries with a blank antconc_query for which no query could be derived.
orphans = []
N = Entry.objects.count()
for i, entry in enumerate(Entry.objects.all()):
    # Only entries whose antconc_query is empty or whitespace are patched.
    if not entry.antconc_query.strip():
        for form in get_forms(entry):
            # Lookup key: civil spelling of the form, stripped of surrounding
            # whitespace, with every 'ъ' and 'ь' removed.
            # NOTE(review): the key is not lower-cased here -- confirm this
            # matches how the keys of query_voc are built.
            civil = re.sub('[ъь]', '', civilrus_convert(form).strip())
            if civil in query_voc:
                # Collect the OR-terms of every item stored under this key
                # and build a single query from them.
                orterms = []
                for item in query_voc[civil]:
                    orterms.extend(get_query_orterms(item))
                value = make_query_from_orterms(orterms)
                if value:
                    # The first form that yields a truthy query wins; the
                    # entry is saved and the remaining forms are skipped.
                    entry.antconc_query = value
                    entry.save()
                    break
        else:
            # for/else: reached only when the loop over forms ended without
            # `break`, i.e. no form produced a query.
            orphans.append(entry)
    # Progress indicator on stderr, rewritten in place thanks to end='\r'.
    print('Patching articles with no query [ %i%% ]' % ((i + 1) / N * 100),
          end='\r',
          file=sys.stderr)
# Finish the progress line with a newline.
print(file=sys.stderr)
Ejemplo n.º 10
0
def in_output_volumes(wordform):
    """Tell whether ``wordform`` starts with one of ``OUTPUT_VOLUMES_LETTERS``.

    Surrounding spaces and '*' characters are stripped from ``wordform``; the
    result goes through ``resolve_titles`` and ``civilrus_convert``, and the
    lower-cased first character of that civil spelling is tested for
    membership.  For an empty civil spelling the tested value is ``''``.
    """
    bare = wordform.strip(' *')
    civil = civilrus_convert(resolve_titles(bare))
    initial = civil[:1].lower()
    return initial in OUTPUT_VOLUMES_LETTERS