Example 1
import re

from rnnmorph.predictor import RNNMorphPredictor
from transliterate import translit


class MorphPredictor(PreProcesser):  # PreProcesser: project-level preprocessing base class
    """Tag tokens with lemma, POS and morphological features via rnnmorph."""

    def __init__(self):
        self.rnnmorph = RNNMorphPredictor(language='ru')

    def translit(self, form):
        # Forms starting with Latin letters are transliterated to Cyrillic so
        # rnnmorph can handle them; the flag remembers the transliteration.
        return (True, translit(form, 'ru')) if re.match(r'[a-zA-Z]+', form) else (False, form)
    
    def transform_sent(self, sent):
        # Work on a copy so the input sentence is left untouched.
        sent = sent.copy()

        translit_flags, translit_forms = zip(*[self.translit(token.form) for token in sent.tokens])
        morph_forms = self.rnnmorph.predict(list(translit_forms))

        for token, morph_form, translit_flag in zip(sent.tokens, morph_forms, translit_flags):
            # Transliterated (originally Latin) forms keep their lowercased
            # surface form as the lemma; everything else uses rnnmorph's lemma.
            token.lemma = token.form.lower() if translit_flag else morph_form.normal_form
            token.upos = morph_form.pos
            token.feats = morph_form.tag

        return sent
        
    def transform_item(self, x):
        return [self.transform_sent(sent) for sent in x] 
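
A minimal usage sketch. The Sent/Token classes below are hypothetical stand-ins for the project's real sentence/token types, and PreProcesser must be importable (a trivial `class PreProcesser: pass` suffices here):

class Token:
    def __init__(self, form):
        self.form, self.lemma, self.upos, self.feats = form, None, None, None

class Sent:
    def __init__(self, forms):
        self.tokens = [Token(f) for f in forms]

    def copy(self):
        return Sent([t.form for t in self.tokens])

tagged = MorphPredictor().transform_item([Sent(['мама', 'мыла', 'раму'])])
for t in tagged[0].tokens:
    print(t.form, t.lemma, t.upos, t.feats)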
Example 2
import nltk
from string import punctuation

from rnnmorph.predictor import RNNMorphPredictor


def prepare_text(text):
    """Tokenize Russian text and return "lemma_POS" strings for its words."""
    words = [
        w for w in nltk.word_tokenize(text, language="russian")
        if w not in punctuation  # drop single-character punctuation tokens
    ]

    # Note: building the predictor on every call is expensive; hoist it out
    # if prepare_text is called repeatedly.
    predictor = RNNMorphPredictor(language="ru")
    morphs = predictor.predict(words)

    return ["{}_{}".format(m.normal_form, m.pos) for m in morphs]
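
A quick check of what prepare_text returns; the exact lemmas and tags depend on the rnnmorph model, so the output shown is illustrative:

if __name__ == "__main__":
    print(prepare_text("Мама мыла раму."))
    # illustrative output: ['мама_NOUN', 'мыть_VERB', 'рама_NOUN']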
Example 3
from typing import Dict, List

from nltk import word_tokenize
from rnnmorph.predictor import RNNMorphPredictor


def find_rhyme(src: str, russian_lexemes: dict, rnn_morph: RNNMorphPredictor,
               phonetic_dict: Dict[str, tuple]) -> List[str]:
    russian_letters = set('АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя')
    # Keep only tokens that consist entirely of Russian letters.
    src_words = list(filter(
        lambda it2: set(it2) <= russian_letters,
        map(lambda it1: it1.strip().lower(), word_tokenize(src))
    ))
    if len(src_words) == 0:
        return [src]
    morphotags = [get_morphodata(cur.pos + ' ' + cur.tag) for cur in rnn_morph.predict(src_words)]
    print('morphotags', morphotags)  # debug output
    syllables_of_words = [str(calc_number_of_syllables(cur_word)) for cur_word in src_words]
    print('syllables_of_words', syllables_of_words)  # debug output
    variants = []
    new_variant = []
    for it in select_new_variant(src_words, morphotags, syllables_of_words, russian_lexemes, phonetic_dict, 0,
                                 new_variant):
        variants.append(' '.join(it))
    return variants
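
get_morphodata, calc_number_of_syllables and select_new_variant are helpers defined elsewhere in the project. As a rough illustration of the simplest one, here is a hypothetical sketch of calc_number_of_syllables, assuming the usual rule that a Russian word has one syllable per vowel:

def calc_number_of_syllables(word: str) -> int:
    # Hypothetical sketch: Russian syllable count equals vowel count.
    vowels = set('аеёиоуыэюя')
    return sum(1 for ch in word.lower() if ch in vowels)


assert calc_number_of_syllables('мама') == 2
assert calc_number_of_syllables('раму') == 2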
Example 4
import rupostagger
from rnnmorph.predictor import RNNMorphPredictor


class TaggerEnsemble:
    """Merge rupostagger output with rnnmorph predictions."""

    def __init__(self):
        self.predictor = RNNMorphPredictor(language="ru")

        self.tagger = rupostagger.RuPosTagger()
        self.tagger.load()

        #model_file = '/home/inkoziev/polygon/GramEval2020/tmp/udpipe_syntagrus.model'
        #self.ud_model = Model.load(model_file)
        #self.ud_pipeline = Pipeline(self.ud_model, 'vertical', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        #self.ud_error = ProcessingError()

    def tag(self, words):
        tokens1 = self.tagger.tag(words)
        tokens2 = self.predictor.predict(words)

        #processed = self.ud_pipeline.process('\n'.join(words), self.ud_error)
        #if self.ud_error.occurred():
        #    print("An error occurred when running run_udpipe: ")
        #    print(self.ud_error.message)
        #    return tokens1
        #tokens3 = pyconll.load_from_string(processed)[0]

        new_tokens = []
        for token1, token2 in zip(tokens1, tokens2):
            tags1 = token1[1].split('|')
            # For nouns, swap in the Case feature predicted by rnnmorph.
            if tags1[0] == 'NOUN' and 'Case' in token2.tag:
                tags_rnn = dict(
                    z.split('=') for z in token2.tag.split('|') if '=' in z)
                new_tagset = list(
                    filter(lambda z: not z.startswith('Case'), tags1))
                new_tagset.append('Case=' + tags_rnn['Case'])
                new_tokens.append((token1[0], '|'.join(new_tagset)))
            else:
                new_tokens.append(token1)

        return new_tokens
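
A hypothetical usage sketch; it assumes rupostagger's tag() yields (word, 'POS|Feat=Val|…') pairs, which is what the merging code above relies on:

ensemble = TaggerEnsemble()
for word, tags in ensemble.tag(['мама', 'мыла', 'раму']):
    print(word, tags)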
Example 5
from rnnmorph.predictor import RNNMorphPredictor
from pprint import pprint

if __name__ == '__main__':
    pr = RNNMorphPredictor(language='ru')
    forms = pr.predict(words=['мама', 'мыла', 'раму'])
    for i in forms:
        print('{:<15} {:<10} {}'.format(i.normal_form, i.pos, i.tag))

    forms = pr.predict_sentences(sentences=[['Всем', 'привет']])
    for i in forms[0]:
        print('{:<15} {:<10} {}'.format(i.normal_form, i.pos, i.tag))

    pprint(forms)
Example 6
import re

import pymorphy2
import tensorflow as tf  # the wrapper uses the TF1-style Graph/Session API
from rnnmorph.predictor import RNNMorphPredictor

# Constants such as GRAMMEM_INFO, LEMMA, TEXT, TOKEN_TYPE, ... and the helper
# token_list_to_sentences are defined in the surrounding project.


class RNNMorphWrapper:
    """
    Obtains grammeme information for tokens via RNNMorph and pymorphy2.
    """
    def __init__(self):
        self._graph = tf.Graph()
        self._session = tf.Session(graph=self._graph)
        with self._session.as_default():
            with self._graph.as_default():
                self.rnnmorph = RNNMorphPredictor(language="ru")
        self.pymorphy_analyzer = pymorphy2.MorphAnalyzer()
        self.latin = re.compile("^[0-9]*[A-Za-z]+[0-9]*$")
        self.cyrillic = re.compile("[А-Яа-яЁё]+")  # ё/Ё lie outside the а-я/А-Я ranges

    def _choose_pymorphy_form(self, word, lemma, pos):
        hypotheses = self.pymorphy_analyzer.parse(word)
        hyp = None
        tags_to_add = {}
        other = ""
        # Pick the pymorphy hypothesis whose normal form matches the rnnmorph
        # lemma; if none matches, the last hypothesis is kept.
        for hyp in hypotheses:
            if hyp.normal_form == lemma:
                break
        changed_lemma = lemma.replace("ё", "е")
        if not hyp:
            # pymorphy2 produced no hypotheses at all.
            return other, tags_to_add, changed_lemma
        str_tag = str(hyp.tag)
        if "Surn" in str_tag:
            other = "фам"
            changed_lemma = word.lower().replace("ё", "е")
        elif "Patr" in str_tag:
            other = "отч"
            changed_lemma = word.lower().replace(
                "ё", "е")  # у Петрович лемма внезапно Пётр
        if hyp.tag.transitivity:
            tags_to_add[TRANSITIVITY] = str(hyp.tag.transitivity)
        if hyp.tag.animacy and pos == "NOUN":
            tags_to_add[ANIMACY] = str(hyp.tag.animacy)
        if hyp.tag.aspect:
            tags_to_add[ASPECT] = str(hyp.tag.aspect)
        return other, tags_to_add, changed_lemma

    def _change_pos(self, token, analysis):
        if re.match(self.latin, analysis.word):
            token[GRAMMEM_INFO][PART_OF_SPEECH] = "X"
        elif analysis.pos == "PUNCT" and re.search(self.cyrillic,
                                                   analysis.word):
            token[GRAMMEM_INFO][PART_OF_SPEECH] = "X"
        else:
            token[GRAMMEM_INFO][PART_OF_SPEECH] = analysis.pos
        return token

    def _gram_info_processing(self, tags_to_add, analysis):
        gramme_info = {}
        raw_gram_data = []
        if analysis.tag != "_":
            for tag in analysis.tag.split("|"):
                gramme_info[tag.split("=")[0].lower()] = tag.split(
                    "=")[1].lower()
            gramme_info.update(tags_to_add)
        sorted_gramme_info = {
            key: gramme_info[key]
            for key in sorted(gramme_info.keys())
        }
        for key in sorted_gramme_info:
            raw_gram_data.append(key + "=" + sorted_gramme_info[key])
        raw_gram_info = "|".join(raw_gram_data)
        return sorted_gramme_info, raw_gram_info

    def _rnnmorph_to_token_dict(self, token, analysis):
        additional_info, tags_to_add, changed_lemma = self._choose_pymorphy_form(
            analysis.word, analysis.normal_form, analysis.pos)
        sorted_gramme_info, raw_gram_info = self._gram_info_processing(
            tags_to_add, analysis)
        token[GRAMMEM_INFO] = sorted_gramme_info
        token[GRAMMEM_INFO][RAW_GRAM_INFO] = raw_gram_info
        if additional_info:
            token[GRAMMEM_INFO][OTHER] = additional_info
        token = self._change_pos(token, analysis)
        token[LEMMA] = changed_lemma
        return token

    def token_desc_list_processing(self, token_desc_list):
        """
        Получить список токенов с описанием
        :param: Список из словарей
        :return: Список из словарей, обогащенный морфологической информацией
        """
        raw_token_list = [token[TEXT] for token in token_desc_list]
        with self._session.as_default():
            with self._graph.as_default():
                analyze_result = self.rnnmorph.predict(raw_token_list)

        res = []
        for tokenized_element, analysis in zip(token_desc_list, analyze_result):
            res.append(self._rnnmorph_to_token_dict(tokenized_element, analysis))
        return res

    def __call__(self, token_desc_list):
        """
        Класс предназначен для забора из RNNMorph + pymorphy2 граммемной информации.
        На вход принимается список токенов
        На выходе имеем список токенов с проставленными грамматическими атрибутами

        :param token_desc_list (list of dicts)
        :return: final_result (enriched list of dicts)
        """
        final_result = []
        sentences = token_list_to_sentences(token_desc_list)
        for sentence in sentences:
            final_result.extend(self.token_desc_list_processing(sentence))
            if final_result:
                # Append a synthetic sentence-final "." token after each sentence.
                final_result.append({
                    TEXT: ".",
                    LEMMA: ".",
                    TOKEN_TYPE: SENTENCE_ENDPOINT_TOKEN,
                    TOKEN_VALUE: {VALUE: "."},
                    LIST_OF_TOKEN_TYPES_DATA: [{
                        TOKEN_TYPE: SENTENCE_ENDPOINT_TOKEN,
                        TOKEN_VALUE: {VALUE: "."}
                    }]
                })
        return final_result
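
Finally, a hypothetical usage sketch for the wrapper. Every constant value below and the trivial token_list_to_sentences are assumptions; the real definitions live in the surrounding project:

# Hypothetical stand-ins for the project's constants and helper.
TEXT, LEMMA, GRAMMEM_INFO, RAW_GRAM_INFO = "text", "lemma", "grammem_info", "raw_gram_info"
PART_OF_SPEECH, OTHER = "part_of_speech", "other"
TRANSITIVITY, ANIMACY, ASPECT = "transitivity", "animacy", "aspect"
TOKEN_TYPE, TOKEN_VALUE, VALUE = "token_type", "token_value", "value"
LIST_OF_TOKEN_TYPES_DATA = "list_of_token_types_data"
SENTENCE_ENDPOINT_TOKEN = "SENTENCE_ENDPOINT_TOKEN"


def token_list_to_sentences(token_desc_list):
    # Naive stand-in: treat the whole token list as a single sentence.
    return [token_desc_list]


wrapper = RNNMorphWrapper()
enriched = wrapper([{TEXT: "мама"}, {TEXT: "мыла"}, {TEXT: "раму"}])
for token in enriched:
    print(token[TEXT], token.get(LEMMA), token.get(GRAMMEM_INFO))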