Beispiel #1
0
 def pos_tag(self):
     """Replace every word of every review sentence with a POS-tagged word.

     Nothing happens unless ``self.language`` is ``"ru"`` or ``"en"``.
     Otherwise the word texts of all sentences of all reviews are sent to
     ``RNNMorphPredictor.predict_sentences`` in one call, and every
     ``sentence[word_idx]`` is overwritten in place with a ``PosTaggedWord``
     built from the original word, the predicted ``pos`` and ``tag`` and the
     prediction vector converted to a list of ints.

     ``CUDA_VISIBLE_DEVICES`` is set to ``'-1'`` while the predictor is
     created and run (presumably to keep it off the GPU -- confirm).  The
     variable is then restored to the value it had before the call, or
     removed if it was unset, even when prediction raises.  The previous
     implementation always left it at ``'0'`` and never reset it on error.
     """
     if self.language not in ("ru", "en"):
         return

     previous_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
     os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
     try:
         predictor = RNNMorphPredictor(language=self.language)
         sentences = [[word.text for word in sentence]
                      for review in self.reviews
                      for sentence in review.sentences]
         sentences_forms = predictor.predict_sentences(sentences, 32, False)
         # ``flat_idx`` walks the flat prediction list in the same
         # review/sentence order that was used to build ``sentences``.
         flat_idx = 0
         for review in self.reviews:
             for sentence in review.sentences:
                 forms = sentences_forms[flat_idx]
                 flat_idx += 1
                 for word_idx, form in enumerate(forms):
                     sentence[word_idx] = PosTaggedWord(
                         sentence[word_idx], form.pos, form.tag,
                         [int(j) for j in form.vector])
     finally:
         # Put the caller's device configuration back instead of
         # hard-coding a particular GPU index.
         if previous_devices is None:
             os.environ.pop('CUDA_VISIBLE_DEVICES', None)
         else:
             os.environ['CUDA_VISIBLE_DEVICES'] = previous_devices
Beispiel #2
0
def tag(predictor: RNNMorphPredictor, untagged_filename: str,
        tagged_filename: str):
    """Tag a tab-separated token file and write the analysis to a new file.

    The input is read as UTF-8, one token per line, with the word form in
    the second tab-separated column.  Blank (or whitespace-only) lines end
    a sentence.  A final sentence that is not followed by a blank line is
    kept as well; previously it was silently dropped.

    The output is written as UTF-8 with one line per token in the form
    ``index<TAB>word<TAB>normal_form<TAB>pos<TAB>tag`` (index starting at 1)
    and an empty line after every sentence.

    Args:
        predictor: Object whose ``predict_sentences(sentences)`` returns,
            for each sentence, a sequence of forms exposing ``word``,
            ``normal_form``, ``pos`` and ``tag``.
        untagged_filename: Path of the file to read.
        tagged_filename: Path of the file to write.

    Raises:
        IndexError: If a non-blank input line has fewer than two
            tab-separated columns.
    """
    sentences = []
    words = []
    with open(untagged_filename, "r", encoding='utf-8') as r:
        for line in r:
            stripped = line.strip()
            if stripped:
                words.append(stripped.split("\t")[1])
            else:
                sentences.append(words)
                words = []
    # Input without a trailing blank line still ends with a real sentence.
    if words:
        sentences.append(words)

    # Predict before opening the output so that a failing prediction does
    # not leave a truncated result file behind.
    all_forms = predictor.predict_sentences(sentences)
    with open(tagged_filename, "w", encoding='utf-8') as w:
        for forms in all_forms:
            for i, form in enumerate(forms, start=1):
                w.write("{}\t{}\t{}\t{}\t{}\n".format(
                    i, form.word, form.normal_form, form.pos, form.tag))
            w.write("\n")
Beispiel #3
0
 def handle_new_messages(self):
     """Analyse all messages that are not yet marked as handled.

     Selects ``id`` and ``text`` of every row in ``messages`` whose
     ``handled`` column is NULL.  For each message with a non-empty text:

     * every piece matched by ``\\X`` (an extended grapheme cluster in the
       ``regex`` module) that contains a character found in
       ``emoji.UNICODE_EMOJI`` is inserted into ``emoticons``;
     * the text, with ``ё``/``Ё`` replaced by ``е``, is split on runs of
       ``.``, ``!`` and ``?``; the Cyrillic words of each chunk are passed
       to the predictor and every analysed word is inserted into ``words``.

     Every message, with or without text, is then flagged as handled and
     the transaction is committed.

     Returns:
         A tuple ``(number of messages, number of words inserted,
         number of emoticons inserted)``.
     """
     predictor = RNNMorphPredictor()
     self.mycursor.execute(
         "SELECT id, text FROM messages WHERE handled IS NULL")
     pending = self.mycursor.fetchall()

     total_words = 0
     total_emoticons = 0
     for row in pending:
         msg_id, text = row
         if text:
             for cluster in regex.findall(r'\X', text):
                 if not any(ch in emoji.UNICODE_EMOJI for ch in cluster):
                     continue
                 self.mycursor.execute(
                     "INSERT INTO emoticons (message_id, emoticon) VALUES (%s, %s);",
                     (msg_id, cluster))
                 total_emoticons += 1

             normalized = re.sub(r'[ёЁ]', 'е', text)
             tokenized = []
             for chunk in re.split(r'[.!?]+', normalized):
                 tokens = re.findall(
                     r'[а-яА-ЯёЁ]+-[а-яА-ЯёЁ]+|[а-яА-ЯёЁ]+', chunk)
                 if tokens:
                     tokenized.append(tokens)

             if tokenized:
                 analysed = predictor.predict_sentences(sentences=tokenized)
                 for analysed_sentence in analysed:
                     for form in analysed_sentence:
                         self.mycursor.execute(
                             "INSERT INTO words (message_id, word, normal_form, pos, tag) "
                             "VALUES (%s, %s, %s, %s, %s);",
                             (msg_id, form.word, form.normal_form,
                              form.pos, form.tag))
                         total_words += 1

         # Flag the message and commit per message, so a later failure
         # does not undo messages that were already processed.
         self.mycursor.execute(
             "UPDATE messages SET handled = %s WHERE id = %s",
             (True, msg_id))
         self.mydb.commit()
     return len(pending), total_words, total_emoticons
Beispiel #4
0
class Preprocessor():
    """Split texts into ``<sil>``-delimited phrases and tag every word.

    Morphological analysis is delegated to an ``RNNMorphPredictor`` created
    for Russian (``language="ru"``).  Copies of a ``Preprocessor`` share
    that predictor object instead of duplicating it.
    """

    def __init__(self, batch_size=1):
        """Create the predictor.

        Args:
            batch_size: Value forwarded as ``batch_size`` to
                ``predict_sentences``.
        """
        self.batch_size = batch_size
        self.predictor = RNNMorphPredictor(language="ru")

    def __del__(self):
        if hasattr(self, 'predictor'):
            del self.predictor

    def __copy__(self):
        """Return a copy that shares the predictor and keeps all settings."""
        cls = self.__class__
        result = cls.__new__(cls)
        # Copy every attribute (previously ``batch_size`` was lost, which
        # made ``gettags`` fail on copies); the predictor stays shared.
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memodict=None):
        """Like ``__copy__``: the predictor is shared, never deep-copied."""
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        if memodict is not None:
            memodict[id(self)] = result
        return result

    def gettags(self, texts):
        """Tag the words of already tokenised texts.

        Args:
            texts: List of token lists.  The tokens of one text are joined
                with spaces and split into phrases at every ``<sil>``.

        Returns:
            One list per input text.  Each starts with
            ``['<sil>', 'SIL _']``, followed for every non-empty phrase by
            its ``[word, 'POS tag']`` pairs and another
            ``['<sil>', 'SIL _']``.

        Raises:
            ValueError: If ``texts`` is not a list.
        """
        if not isinstance(texts, list):
            raise ValueError('Expected `{0}`, but got `{1}`.'.format(
                list, type(texts)))
        if not texts:
            return []

        # For every text: the token lists of its non-empty phrases.
        phrases_per_text = []
        sentences = []
        for cur_text in texts:
            phrases = [
                chunk.split() for chunk in ' '.join(cur_text).split('<sil>')
            ]
            phrases = [tokens for tokens in phrases if tokens]
            phrases_per_text.append(phrases)
            if phrases:
                # The predictor sees the whole text as a single sentence.
                sentences.append(
                    [token for tokens in phrases for token in tokens])

        if sentences:
            all_forms = self.predictor.predict_sentences(
                sentences, batch_size=self.batch_size)
        else:
            all_forms = []

        all_words_and_tags = []
        sentence_ind = 0
        for phrases in phrases_per_text:
            words_and_tags = [['<sil>', 'SIL _']]
            if phrases:
                forms = all_forms[sentence_ind]
                sentence_ind += 1
                start = 0
                for tokens in phrases:
                    # Slice by the same whitespace tokenisation that built
                    # the sentence, so doubled spaces cannot shift the
                    # alignment between phrases and predictions.
                    end = start + len(tokens)
                    for form in forms[start:end]:
                        words_and_tags.append(
                            [form.word, form.pos + ' ' + form.tag])
                    words_and_tags.append(['<sil>', 'SIL _'])
                    start = end
            all_words_and_tags.append(words_and_tags)
        return all_words_and_tags

    def preprocessing(self, texts):
        """Normalise raw texts and tag their words.

        Args:
            texts: List of strings.  A single string is treated as a list
                with one text instead of being iterated character by
                character.

        Returns:
            The result of ``gettags`` for the normalised texts.
        """
        def prepare(src):
            # Punctuation becomes a pause mark.
            dst = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', src.lower())
            # A spaced dash or a line break is a pause as well.
            dst = sub(r' [–-] |\n', ' <sil> ', dst)
            dst = sub(r'\s{2,}', ' ', dst)
            # Drop a leading space and special symbols not preceded by a
            # word character.
            dst = sub(r'^\s|(?<!\w)[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '',
                      dst)
            return dst.strip().split(' ')

        if isinstance(texts, str):
            texts = [texts]
        return self.gettags([prepare(cur) for cur in texts])
Beispiel #5
0
class Preprocessor():
    """Split texts into ``<sil>``-delimited phrases and tag every word.

    Morphological analysis is delegated to an ``RNNMorphPredictor`` created
    for Russian (``language="ru"``).  Copies of a ``Preprocessor`` share
    that predictor object instead of duplicating it.
    """
    def __init__(self, batch_size=1):
        """Create the predictor.

        Args:
            batch_size (int, optional): Value forwarded as ``batch_size``
                to ``predict_sentences``. Defaults to 1.
        """

        self.batch_size = batch_size
        self.predictor = RNNMorphPredictor(language="ru")

    def __del__(self):
        if hasattr(self, 'predictor'):
            del self.predictor

    def __copy__(self):
        """Return a copy that shares the predictor and keeps all settings."""
        cls = self.__class__
        result = cls.__new__(cls)
        # Copy every attribute (previously ``batch_size`` was lost, which
        # made ``gettags`` fail on copies); the predictor stays shared.
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memodict=None):
        """Like ``__copy__``: the predictor is shared, never deep-copied."""
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        if memodict is not None:
            memodict[id(self)] = result
        return result

    def gettags(self, texts: list) -> list:
        """Get morpho tags for the `texts`

        Args:
            texts (list): List of lists of tokens. The tokens of one text
                are joined with spaces and split into phrases at every
                ``<sil>``.

        Raises:
            ValueError: If ``texts`` is not a list.

        Returns:
            list: list of lists -- words and morpho tags. Each inner list
            starts with ``['<sil>', 'SIL _']`` and has another one after
            every non-empty phrase.

        Example:
            PreProcess.gettags([['я купил самолёт и ракеты'], ['ух ты']])
            [[['<sil>', 'SIL _'],
              ['я', 'PRON Case=Nom|Number=Sing|Person=1'],
              ['купил', 'VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'],
              ['самолёт', 'NOUN Case=Acc|Gender=Masc|Number=Sing'],
              ['и', 'CONJ _'],
              ['ракеты', 'NOUN Case=Acc|Gender=Fem|Number=Plur'],
              ['<sil>', 'SIL _']],
             [['<sil>', 'SIL _'],
              ['ух', 'INTJ _'],
              ['ты', 'PRON Case=Nom|Number=Sing|Person=2'],
              ['<sil>', 'SIL _']]]
        """

        if not isinstance(texts, list):
            raise ValueError(
                f'Expected `{list}`, but got `{type(texts)}`.')
        if not texts:
            return []

        # For every text: the token lists of its non-empty phrases.
        phrases_per_text = []
        sentences = []
        for cur_text in texts:
            phrases = [
                chunk.split() for chunk in ' '.join(cur_text).split('<sil>')
            ]
            phrases = [tokens for tokens in phrases if tokens]
            phrases_per_text.append(phrases)
            if phrases:
                # The predictor sees the whole text as a single sentence.
                sentences.append(
                    [token for tokens in phrases for token in tokens])

        if sentences:
            all_forms = self.predictor.predict_sentences(
                sentences, batch_size=self.batch_size)
        else:
            all_forms = []

        all_words_and_tags = []
        sentence_ind = 0
        for phrases in phrases_per_text:
            words_and_tags = [['<sil>', 'SIL _']]
            if phrases:
                forms = all_forms[sentence_ind]
                sentence_ind += 1
                start = 0
                for tokens in phrases:
                    # Slice by the same whitespace tokenisation that built
                    # the sentence, so doubled spaces cannot shift the
                    # alignment between phrases and predictions.
                    end = start + len(tokens)
                    for form in forms[start:end]:
                        words_and_tags.append(
                            [form.word, form.pos + ' ' + form.tag])
                    words_and_tags.append(['<sil>', 'SIL _'])
                    start = end
            all_words_and_tags.append(words_and_tags)
        return all_words_and_tags

    def __call__(self, texts: str):
        """Call the instance like function. Use in pipelines, too.

        Args:
            texts (str): A single text. A list of texts is accepted as
                well, in which case only the first text's result is
                returned.

        Returns:
            list: ``[word, tag]`` pairs of the (first) text.
        """
        return self.preprocessing(texts)[0]

    def preprocessing(self, texts) -> list:
        """Normalise raw texts and tag their words.

        Args:
            texts (str or list of str): Text(s) to preprocess. A single
                string is treated as a list with one text instead of being
                iterated character by character.

        Returns:
            list: For every text, a list of processed words and tags as
            produced by ``gettags``.
        """
        def prepare(text: str) -> list:
            """Replace punctuation marks with <sil> tag; remove special symbols."""

            text = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', text.lower())
            # A spaced dash or a line break is a pause as well.
            text = sub(r' [–-] |\n', ' <sil> ', text)
            text = sub(r'\s{2,}', ' ', text)
            text = sub(r'^\s|(?<!\w)[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '',
                       text)
            return text.strip().split(' ')

        if isinstance(texts, str):
            texts = [texts]
        return self.gettags([prepare(cur) for cur in texts])
Beispiel #6
0
from rnnmorph.predictor import RNNMorphPredictor
from pprint import pprint

if __name__ == '__main__':
    # Small demo: analyse a plain word list, then a batch with a single
    # sentence, printing normal form, part of speech and tag per word.
    row_template = '{:<15} {:<10} {}'
    pr = RNNMorphPredictor(language='ru')

    for form in pr.predict(words=['мама', 'мыла', 'раму']):
        print(row_template.format(form.normal_form, form.pos, form.tag))

    forms = pr.predict_sentences(sentences=[['Всем', 'привет']])
    first_sentence = forms[0]
    for form in first_sentence:
        print(row_template.format(form.normal_form, form.pos, form.tag))

    # Dump the raw prediction objects of the sentence batch.
    pprint(forms)
                 for tt1, tt2 in zip(t1, t2) if tt1[1] == tt2[1]))
# Share of correctly predicted tags, in percent.
num_pred = sum(1 for sent in test_sents for _token in sent)
accuracy = true_pred / num_pred * 100
print(f"{accuracy:.1f}")

# In[34]:

get_ipython().run_cell_magic('capture', '', '!pip install -q rnnmorph')

# In[35]:

from rnnmorph.predictor import RNNMorphPredictor
predictor = RNNMorphPredictor(language="en")

# In[36]:

# Only the first element of every item (the token) is given to the tagger.
rnnmorph_result = predictor.predict_sentences(
    [[pair[0] for pair in sent] for sent in test_sents])

# In[37]:

# Count positions where the predicted ``pos`` equals the second element
# of the reference item.
true_pred = 0
for predicted_sent, gold_sent in zip(rnnmorph_result, test_sents):
    for form, gold_pair in zip(predicted_sent, gold_sent):
        if form.pos == gold_pair[1]:
            true_pred += 1
num_pred = sum(1 for sent in test_sents for _token in sent)
accuracy = true_pred / num_pred * 100
print(f"{accuracy:.1f}")

# ### Вопрос 7:
# * Какое качество вы получили, используя каждую из двух библиотек? Сравните их результаты.
#
# * Качество с библиотекой rnnmorph должно быть хуже, так как там используется немного другая система тегов. Какие здесь отличия?

# >nltk_model = 89.2, rnnmorph_model = 63.2
#