Exemple #1
0
def lemmatize_corpus(corpus):
    """Collect the lexemes matching each word of *corpus*.

    For every word the lookups below are tried in order, stopping at the
    first one that returns something other than ``None``:

    1. ``lem.find_lexemes`` on the word itself;
    2. ``lem.find_lexemes`` on ``resolve_titles(word)``;
    3. ``lem._find_lexemes`` on ``antconc_wordform_query(word)``;
    4. ``lem._find_lexemes`` on
       ``antconc_wordform_query(resolve_titles(word))``.

    Returns a ``(lemmas, orphan_words)`` pair of sets: the lexemes found
    (those with a truthy ``is_graph_num`` are left out) and the words for
    which every lookup returned ``None``.
    """
    found = set()
    unresolved = set()

    for token in corpus:
        # Each attempt is wrapped in a lambda so that later, more expensive
        # lookups are only evaluated when the earlier ones returned None.
        attempts = (
            lambda: lem.find_lexemes(token),
            lambda: lem.find_lexemes(resolve_titles(token)),
            lambda: lem._find_lexemes(antconc_wordform_query(token)),
            lambda: lem._find_lexemes(
                antconc_wordform_query(resolve_titles(token))),
        )

        matches = None
        for attempt in attempts:
            matches = attempt()
            if matches is not None:
                break

        if matches is None:
            unresolved.add(token)
            continue

        # The set itself takes care of lexemes that were already collected.
        found.update(lex for lex in matches if not lex.is_graph_num)

    return found, unresolved
Exemple #2
0
    def add_data(self, data):
        """Index the records of *data* into ``self.words`` and ``self.lexemes``.

        Every item of *data* must unpack into exactly six values:
        ``(lex, pos, flextype, gram, word, freq)``.  For each record:

        * a ``Word`` is created, unless one is already stored under the same
          key, for the raw ``word``, for ``word`` with the characters
          ``+ # ! $`` removed, for ``ac2p(resolve_titles(p2ac(word)))`` and
          for the two ``antconc_wordform_query`` forms (the latter two are
          created with ``is_antconc_query=True``);
        * a ``Lexeme`` is created under the key ``(lex, pos, flextype)``
          unless that key is already present;
        * one ``Gram`` is created per distinct key among the stripped word
          and its three derived forms, and appended to the ``grams`` list of
          both the corresponding ``Word`` and the ``Lexeme``.

        The method returns ``None``; its effect is the mutation of ``self``.
        """
        def ensure_word(key, **word_kwargs):
            # Never replace an existing Word: it may already carry grams
            # attached by an earlier record.
            if key not in self.words:
                self.words[key] = Word(key, **word_kwargs)

        for lex, pos, flextype, gram, word, freq in data:
            # The raw form is registered as given, marker characters included.
            ensure_word(word)

            # From here on only the form without "+ # ! $" is used.  The
            # pattern is a raw string: in a normal literal "\+", "\#", "\!"
            # and "\$" are invalid escape sequences.
            word = re.sub(r'[\+\#\!\$]', '', word)
            ensure_word(word)

            # Both conversions feed two derived forms each, so compute once.
            # NOTE(review): assumes p2ac/resolve_titles are pure - confirm.
            ac_word = p2ac(word)
            ac_resolved = resolve_titles(ac_word)

            word_without_titles = ac2p(ac_resolved)
            ensure_word(word_without_titles)

            wordform_query = antconc_wordform_query(ac_resolved)
            ensure_word(wordform_query, is_antconc_query=True)

            wordform_query2 = antconc_wordform_query(ac_word)
            ensure_word(wordform_query2, is_antconc_query=True)

            lexeme_key = (lex, pos, flextype)
            if lexeme_key not in self.lexemes:
                self.lexemes[lexeme_key] = Lexeme(lex, pos, flextype, gram)
            lexeme = self.lexemes[lexeme_key]

            # De-duplicate the keys while keeping a fixed order, so that the
            # grams are appended to lexeme.grams in a reproducible sequence
            # (iterating a set of strings gives an arbitrary order).
            distinct_keys = []
            for key in (word, word_without_titles,
                        wordform_query, wordform_query2):
                if key not in distinct_keys:
                    distinct_keys.append(key)

            for key in distinct_keys:
                target = self.words[key]
                g = Gram(gram, freq, target, lexeme)
                target.grams.append(g)
                lexeme.grams.append(g)
Exemple #3
0
def get_reference_hint(wordform, lexeme, without_translit=True, with_ref=True):
    """Build the hint dictionary for a reference (cross-reference) wordform.

    The wordform is first passed through ``resolve_titles``.  The result
    always contains ``KEY_ENTRY`` (``ucs_convert`` of that form), plus:

    * ``KEY_CIVIL`` (``civilrus_convert`` of the form) when
      *without_translit* is false;
    * ``KEY_REFEREE`` when *with_ref* is true.

    *lexeme* is either an ``Entry`` instance or a mapping with a non-empty
    ``'referenced_lexemes'`` sequence.  In the latter case the referee hint
    is built from the first referenced lexeme; when there are several of
    them and all have a truthy ``homonym_order``, those orders are joined
    into ``KEY_HOMONYM_ORDER`` of the referee hint.
    """
    resolved = resolve_titles(wordform)
    hint = {KEY_ENTRY: ucs_convert(resolved)}

    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(resolved)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        hint[KEY_REFEREE] = get_hint(lexeme)
        return hint

    targets = lexeme['referenced_lexemes']
    referee = get_hint(targets[0])
    has_several = len(targets) > 1
    if has_several and all(t.homonym_order for t in targets):
        orders = [str(t.homonym_order) for t in targets if t]
        referee[KEY_HOMONYM_ORDER] = u',\u00a0'.join(orders)
    hint[KEY_REFEREE] = referee
    return hint
Exemple #4
0
def get_reference_hint(wordform, lexeme, without_translit=True,
                       with_ref=True, with_rnc=False):
    """Return the hint dictionary describing a reference wordform.

    ``resolve_titles`` is applied to *wordform* before anything else.  The
    returned dict holds:

    * ``KEY_ENTRY`` - ``ucs_convert`` of the resolved form (always);
    * ``KEY_CIVIL`` - ``civilrus_convert`` of the resolved form, only when
      *without_translit* is false;
    * ``KEY_REFEREE`` - the hint of the referenced entry, only when
      *with_ref* is true.

    *lexeme* may be an ``Entry`` or a mapping whose ``'referenced_lexemes'``
    item is a non-empty sequence; in the second case the first element
    supplies the referee hint.  If there is more than one referenced lexeme
    and each has a truthy ``homonym_order``, the orders are joined and
    stored under ``KEY_HOMONYM_ORDER`` in the referee hint.  *with_rnc* is
    forwarded unchanged to ``get_hint``.
    """
    plain_form = resolve_titles(wordform)
    hint = {KEY_ENTRY: ucs_convert(plain_form)}
    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(plain_form)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        referee = get_hint(lexeme, with_rnc=with_rnc)
    else:
        targets = lexeme['referenced_lexemes']
        referee = get_hint(targets[0], with_rnc=with_rnc)
        if len(targets) > 1 and all(t.homonym_order for t in targets):
            orders = [str(t.homonym_order) for t in targets if t]
            referee[KEY_HOMONYM_ORDER] = ',\u00a0'.join(orders)

    hint[KEY_REFEREE] = referee
    return hint
Exemple #5
0
# Report how many lexemes were selected, followed by an empty line.
# NOTE: ``print >> stream`` and the ``ur''`` prefix below are Python 2 syntax.
lexemes_n = len(lexemes)
print >> sys.stderr, 'Number of selected lexemes:', lexemes_n
print >> sys.stderr

# Fill ``entries1`` with (wordform, reference, lexeme) tuples for each lexeme.
for i, lexeme in enumerate(lexemes):

    # Main entry: the form of the first base variant, with no reference.
    wordform = lexeme.base_vars[0].idem
    reference = None
    entries1.append((wordform, reference, lexeme))
    # Sort key of the main form; variants below are compared against it.
    key = sort_key1(wordform)

    # Various reference entries within the selected volumes

    # 1) Variants of the headword
    # The first element of orth_vars_refs is skipped.
    for var in lexeme.orth_vars_refs[1:]:
        wordform = resolve_titles(var.idem)
        key2 = sort_key1(wordform)
        # A variant gets its own entry only when its sort key differs from
        # the sort key of the main form.
        if key2 != key:
            reference = ucs_convert(wordform)
            entries1.append((wordform, reference, lexeme))

    # 2) Names of peoples
    # Separator pattern: a comma followed by one or more whitespace characters.
    COMMA = ur',\s+'
    if lexeme.nom_sg:
        wordform = lexeme.nom_sg
        # presumably index 1 holds the rendered form matching nom_sg -
        # TODO confirm
        reference = lexeme.nom_sg_ucs_wax[1]
        # Both strings are split on COMMA and paired positionally; zip stops
        # at the shorter of the two lists.
        for wordform, reference in zip(
                re.split(COMMA, wordform), re.split(COMMA, reference)):
            entries1.append((wordform, reference, lexeme))

    # 3) Short forms
Exemple #6
0
# Report how many lexemes were selected on stderr, followed by an empty line.
lexemes_n = len(lexemes)
print('Number of selected lexemes:', lexemes_n, file=sys.stderr)
print(file=sys.stderr)

# Fill ``entries1`` with (wordform, reference, lexeme) tuples for each lexeme.
for i, lexeme in enumerate(lexemes):

    # Main entry: the form of the first base variant, with no reference.
    wordform = lexeme.base_vars[0].idem
    reference = None
    entries1.append((wordform, reference, lexeme))
    # Sort key of the main form; variants below are compared against it.
    key = sort_key1(wordform)

    # Various reference entries within the selected volumes

    # 1) Variants of the headword
    # The first element of orth_vars_refs is skipped.
    for var in lexeme.orth_vars_refs[1:]:
        wordform = resolve_titles(var.idem)
        key2 = sort_key1(wordform)
        # A variant gets its own entry only when its sort key differs from
        # the sort key of the main form.
        if key2 != key:
            reference = ucs_convert(wordform)
            entries1.append((wordform, reference, lexeme))

    # 2) Names of peoples
    # Separator pattern: a comma followed by one or more whitespace characters.
    COMMA = r',\s+'
    if lexeme.nom_pl:
        wordform = lexeme.nom_pl
        # presumably index 1 holds the rendered form matching nom_pl -
        # TODO confirm
        reference = lexeme.nom_pl_ucs_wax[1]
        # Both strings are split on COMMA and paired positionally; zip stops
        # at the shorter of the two lists.
        for wordform, reference in zip(
                re.split(COMMA, wordform), re.split(COMMA, reference)):
            entries1.append((wordform, reference, lexeme))

    # 3) Short forms
Exemple #7
0
def in_output_volumes(wordform):
    """Tell whether *wordform* starts with one of the output volumes' letters.

    Spaces and asterisks are stripped from both ends of *wordform*, the
    result goes through ``resolve_titles`` and ``civilrus_convert``, and the
    lower-cased first character (an empty string if nothing is left) is
    looked up in ``OUTPUT_VOLUMES_LETTERS``.
    """
    trimmed = wordform.strip(' *')
    civil_form = civilrus_convert(resolve_titles(trimmed))
    # Slicing rather than indexing keeps an empty form from raising.
    initial = civil_form[:1].lower()
    return initial in OUTPUT_VOLUMES_LETTERS