def lemmatize_corpus(corpus):
    """Look up the lexemes for every word in ``corpus``.

    Each word is tried against a chain of lookups, and the first one that
    returns something other than ``None`` is used:

    1. ``lem.find_lexemes(word)``
    2. ``lem.find_lexemes(resolve_titles(word))``
    3. ``lem._find_lexemes(antconc_wordform_query(word))``
    4. ``lem._find_lexemes(antconc_wordform_query(resolve_titles(word)))``

    Returns:
        A ``(lemmas, orphan_words)`` tuple of sets.  ``lemmas`` holds every
        lexeme found whose ``is_graph_num`` attribute is falsy;
        ``orphan_words`` holds the words for which all four lookups
        returned ``None``.
    """
    found = set()
    unmatched = set()

    for token in corpus:
        hits = lem.find_lexemes(token)
        if hits is None:
            hits = lem.find_lexemes(resolve_titles(token))
        if hits is None:
            hits = lem._find_lexemes(antconc_wordform_query(token))
        if hits is None:
            hits = lem._find_lexemes(
                antconc_wordform_query(resolve_titles(token)))

        # Nothing matched at any stage: remember the word and move on.
        if hits is None:
            unmatched.add(token)
            continue

        # A set ignores repeated additions, so only the is_graph_num
        # filter has to be applied explicitly.
        found.update(hit for hit in hits if not hit.is_graph_num)

    return found, unmatched
def add_data(self, data):
    """Register the words, lexemes and grams described by ``data``.

    Args:
        data: iterable of 6-item sequences
            ``(lex, pos, flextype, gram, word, freq)``.

    For every item the method makes sure ``self.words`` has a ``Word``
    for each of the following keys, creating it when missing:

    * the word exactly as given;
    * the word with the characters ``+ # ! $`` removed (this form is the
      one used for all the derived keys below);
    * ``ac2p(resolve_titles(p2ac(word)))``;
    * ``antconc_wordform_query(resolve_titles(p2ac(word)))`` and
      ``antconc_wordform_query(p2ac(word))``, both created with
      ``is_antconc_query=True``.

    It also makes sure ``self.lexemes`` has a ``Lexeme`` under the key
    ``(lex, pos, flextype)``; a new one is built with the ``gram`` of the
    first item seen for that key.  Finally one ``Gram(gram, freq, W, L)``
    is created per distinct word key and appended to both ``W.grams`` and
    ``L.grams``.
    """
    for lex, pos, flextype, gram, word, freq in data:
        if word not in self.words:
            self.words[word] = Word(word)

        # Raw string: the original non-raw literal relied on invalid escape
        # sequences (\+ \# \! \$), which newer Python versions warn about.
        # The pattern text itself is unchanged.
        stripped = re.sub(r'[\+\#\!\$]', '', word)
        if stripped != word:
            word = stripped
            if word not in self.words:
                self.words[word] = Word(word)

        # Convert once and reuse; the original recomputed these per use.
        ac_word = p2ac(word)
        ac_resolved = resolve_titles(ac_word)

        word_without_titles = ac2p(ac_resolved)
        if word_without_titles not in self.words:
            self.words[word_without_titles] = Word(word_without_titles)

        wordform_query = antconc_wordform_query(ac_resolved)
        if wordform_query not in self.words:
            self.words[wordform_query] = Word(
                wordform_query, is_antconc_query=True)

        wordform_query2 = antconc_wordform_query(ac_word)
        if wordform_query2 not in self.words:
            self.words[wordform_query2] = Word(
                wordform_query2, is_antconc_query=True)

        lexeme_key = (lex, pos, flextype)
        if lexeme_key not in self.lexemes:
            self.lexemes[lexeme_key] = Lexeme(lex, pos, flextype, gram)
        lexeme = self.lexemes[lexeme_key]

        # The set collapses keys that turned out identical, so each Word
        # receives exactly one Gram per data item.
        distinct_keys = {
            word, word_without_titles, wordform_query, wordform_query2,
        }
        for key in distinct_keys:
            word_obj = self.words[key]
            g = Gram(gram, freq, word_obj, lexeme)
            word_obj.grams.append(g)
            lexeme.grams.append(g)
def get_reference_hint(wordform, lexeme, without_translit=True,
                       with_ref=True):
    """Build the hint dictionary for a reference entry.

    Args:
        wordform: the word form of the reference; it is passed through
            ``resolve_titles`` before any conversion.
        lexeme: either an ``Entry`` instance or a mapping that has a
            ``'referenced_lexemes'`` sequence.
        without_translit: when false, the result also gets ``KEY_CIVIL``
            set to ``civilrus_convert`` of the resolved word form.
        with_ref: when true, the result also gets ``KEY_REFEREE``.

    Returns:
        A dict that always has ``KEY_ENTRY`` (``ucs_convert`` of the
        resolved word form).  For an ``Entry`` the referee hint is
        ``get_hint(lexeme)``; otherwise it is the hint of the first
        referenced lexeme, and when there are several referenced lexemes
        that all have a truthy ``homonym_order`` its ``KEY_HOMONYM_ORDER``
        is replaced by their orders joined with a comma and a no-break
        space.
    """
    resolved = resolve_titles(wordform)

    hint = {KEY_ENTRY: ucs_convert(resolved)}
    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(resolved)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        hint[KEY_REFEREE] = get_hint(lexeme)
        return hint

    targets = lexeme['referenced_lexemes']
    referee = get_hint(targets[0])
    if len(targets) > 1 and all(t.homonym_order for t in targets):
        orders = [str(t.homonym_order) for t in targets if t]
        referee[KEY_HOMONYM_ORDER] = u',\u00a0'.join(orders)
    hint[KEY_REFEREE] = referee
    return hint
def get_reference_hint(wordform, lexeme, without_translit=True,
                       with_ref=True, with_rnc=False):
    """Build the hint dictionary for a reference entry.

    Args:
        wordform: the word form of the reference; it is passed through
            ``resolve_titles`` before any conversion.
        lexeme: either an ``Entry`` instance or a mapping that has a
            ``'referenced_lexemes'`` sequence.
        without_translit: when false, the result also gets ``KEY_CIVIL``
            set to ``civilrus_convert`` of the resolved word form.
        with_ref: when true, the result also gets ``KEY_REFEREE``.
        with_rnc: forwarded unchanged to ``get_hint`` as ``with_rnc``.

    Returns:
        A dict that always has ``KEY_ENTRY`` (``ucs_convert`` of the
        resolved word form).  For an ``Entry`` the referee hint is the
        ``get_hint`` of that entry; otherwise it is the hint of the first
        referenced lexeme, and when there are several referenced lexemes
        that all have a truthy ``homonym_order`` its ``KEY_HOMONYM_ORDER``
        is replaced by their orders joined with a comma and a no-break
        space.
    """
    resolved = resolve_titles(wordform)

    hint = {KEY_ENTRY: ucs_convert(resolved)}
    if not without_translit:
        hint[KEY_CIVIL] = civilrus_convert(resolved)

    if not with_ref:
        return hint

    if isinstance(lexeme, Entry):
        hint[KEY_REFEREE] = get_hint(lexeme, with_rnc=with_rnc)
        return hint

    targets = lexeme['referenced_lexemes']
    referee = get_hint(targets[0], with_rnc=with_rnc)
    if len(targets) > 1 and all(t.homonym_order for t in targets):
        orders = [str(t.homonym_order) for t in targets if t]
        referee[KEY_HOMONYM_ORDER] = ',\u00a0'.join(orders)
    hint[KEY_REFEREE] = referee
    return hint
# Report how many lexemes were selected, then fill ``entries1`` with
# ``(wordform, reference, lexeme)`` tuples for each of them.
# NOTE: this fragment uses Python 2 syntax (``print >> stream`` and the
# ``ur''`` string prefix).
lexemes_n = len(lexemes)
print >> sys.stderr, 'Number of selected lexemes:', lexemes_n
print >> sys.stderr
for i, lexeme in enumerate(lexemes):
    # Main entry: the first base variant, with no reference text.
    wordform = lexeme.base_vars[0].idem
    reference = None
    entries1.append((wordform, reference, lexeme))
    key = sort_key1(wordform)

    # Various reference entries within the selected volumes

    # 1) Variants of the headword
    # The first element of orth_vars_refs is skipped; a variant yields an
    # extra entry only when its sort key differs from the headword's.
    for var in lexeme.orth_vars_refs[1:]:
        wordform = resolve_titles(var.idem)
        key2 = sort_key1(wordform)
        if key2 != key:
            reference = ucs_convert(wordform)
            entries1.append((wordform, reference, lexeme))

    # 2) Names of peoples
    COMMA = ur',\s+'
    if lexeme.nom_sg:
        wordform = lexeme.nom_sg
        reference = lexeme.nom_sg_ucs_wax[1]
        # Both strings are split on a comma followed by whitespace and
        # paired positionally; zip stops at the shorter list.  The loop
        # targets rebind ``wordform`` and ``reference``.
        for wordform, reference in zip(
                re.split(COMMA, wordform),
                re.split(COMMA, reference)):
            entries1.append((wordform, reference, lexeme))

    # 3) Short forms
# Report how many lexemes were selected (on stderr), then fill
# ``entries1`` with ``(wordform, reference, lexeme)`` tuples for each of
# them.
lexemes_n = len(lexemes)
print('Number of selected lexemes:', lexemes_n, file=sys.stderr)
print(file=sys.stderr)
for i, lexeme in enumerate(lexemes):
    # Main entry: the first base variant, with no reference text.
    wordform = lexeme.base_vars[0].idem
    reference = None
    entries1.append((wordform, reference, lexeme))
    key = sort_key1(wordform)

    # Various reference entries within the selected volumes

    # 1) Variants of the headword
    # The first element of orth_vars_refs is skipped; a variant yields an
    # extra entry only when its sort key differs from the headword's.
    for var in lexeme.orth_vars_refs[1:]:
        wordform = resolve_titles(var.idem)
        key2 = sort_key1(wordform)
        if key2 != key:
            reference = ucs_convert(wordform)
            entries1.append((wordform, reference, lexeme))

    # 2) Names of peoples
    COMMA = r',\s+'
    if lexeme.nom_pl:
        wordform = lexeme.nom_pl
        reference = lexeme.nom_pl_ucs_wax[1]
        # Both strings are split on a comma followed by whitespace and
        # paired positionally; zip stops at the shorter list.  The loop
        # targets rebind ``wordform`` and ``reference``.
        for wordform, reference in zip(
                re.split(COMMA, wordform),
                re.split(COMMA, reference)):
            entries1.append((wordform, reference, lexeme))

    # 3) Short forms
def in_output_volumes(wordform):
    """Tell whether ``wordform`` starts with one of the output letters.

    Spaces and asterisks are stripped from both ends of ``wordform``; the
    result goes through ``resolve_titles`` and then ``civilrus_convert``.
    The lower-cased first character of that conversion (an empty string
    when the conversion is empty) is tested for membership in
    ``OUTPUT_VOLUMES_LETTERS``.
    """
    trimmed = wordform.strip(' *')
    civil = civilrus_convert(resolve_titles(trimmed))
    initial = civil[:1].lower()
    return initial in OUTPUT_VOLUMES_LETTERS