def pos_tag(self):
    """Replace every word of every review sentence with a ``PosTaggedWord``.

    Only runs when ``self.language`` is ``"ru"`` or ``"en"``; any other
    language leaves the reviews untouched. All sentences of all reviews are
    sent to ``RNNMorphPredictor.predict_sentences`` in a single call and the
    predictions are written back into the sentences in place.

    ``CUDA_VISIBLE_DEVICES`` is set to ``'-1'`` while the predictor is
    created and run. Afterwards the variable is put back to whatever it was
    before the call (or removed if it was unset), even when prediction
    raises, instead of being overwritten with a hard-coded ``'0'``.
    """
    if self.language not in ("ru", "en"):
        return

    sentences = [
        [word.text for word in sentence]
        for review in self.reviews
        for sentence in review.sentences
    ]
    if not sentences:
        # Nothing to tag: do not pay for loading the model.
        return

    previous_devices = os.environ.get('CUDA_VISIBLE_DEVICES')
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    try:
        predictor = RNNMorphPredictor(language=self.language)
        # NOTE(review): positional 32 / False are presumably the batch size
        # and a lemma-building flag -- confirm against the rnnmorph API.
        sentences_forms = predictor.predict_sentences(sentences, 32, False)

        # Predictions come back in the same flat order the sentences were
        # collected in, so a single running index maps them back.
        flat_index = 0
        for review in self.reviews:
            for sentence in review.sentences:
                forms = sentences_forms[flat_index]
                flat_index += 1
                for word_idx, form in enumerate(forms):
                    sentence[word_idx] = PosTaggedWord(
                        sentence[word_idx],
                        form.pos,
                        form.tag,
                        [int(j) for j in form.vector])
    finally:
        if previous_devices is None:
            os.environ.pop('CUDA_VISIBLE_DEVICES', None)
        else:
            os.environ['CUDA_VISIBLE_DEVICES'] = previous_devices
def tag(predictor: "RNNMorphPredictor", untagged_filename: str, tagged_filename: str):
    """Tag a tab-separated token file and write the predictions to a new file.

    The input file has one token per line with the word form in the second
    tab-separated column; a line consisting of just a newline ends a
    sentence. The output file has one line per token,
    ``index<TAB>word<TAB>normal_form<TAB>pos<TAB>tag``, with a blank line
    after every sentence.

    Args:
        predictor: Object exposing ``predict_sentences(sentences)``.
        untagged_filename: Path of the UTF-8 input file.
        tagged_filename: Path of the UTF-8 output file (overwritten).
    """
    sentences = []
    words = []
    with open(untagged_filename, "r", encoding='utf-8') as r:
        for line in r:
            if line != "\n":
                words.append(line.strip().split("\t")[1])
            else:
                sentences.append(words)
                words = []
    if words:
        # The file did not end with a blank line: keep the last sentence
        # instead of silently dropping it.
        sentences.append(words)

    # Predict before opening the output so a prediction failure does not
    # leave a truncated output file behind.
    all_forms = predictor.predict_sentences(sentences)

    with open(tagged_filename, "w", encoding='utf-8') as w:
        for forms in all_forms:
            w.writelines(
                "{}\t{}\t{}\t{}\t{}\n".format(
                    str(i + 1), form.word, form.normal_form, form.pos, form.tag)
                for i, form in enumerate(forms))
            w.write("\n")
def handle_new_messages(self):
    """Extract emoticons and tagged words from every unhandled message.

    Reads all rows of ``messages`` whose ``handled`` column is NULL. For each
    message with text it stores every grapheme cluster containing an emoji
    in ``emoticons`` and every Cyrillic word, tagged by
    ``RNNMorphPredictor``, in ``words``. The message is then marked as
    handled and the transaction is committed.

    Returns:
        tuple: ``(messages processed, words inserted, emoticons inserted)``.
    """
    self.mycursor.execute(
        "SELECT id, text FROM messages WHERE handled IS NULL")
    message_records = self.mycursor.fetchall()
    if not message_records:
        # Nothing to do: skip loading the morphology model altogether.
        return 0, 0, 0

    pr = RNNMorphPredictor()
    sentence_splitter = re.compile(r'[.!?]+')
    word_finder = re.compile(r'[а-яА-ЯёЁ]+-[а-яА-ЯёЁ]+|[а-яА-ЯёЁ]+')

    emoticons_count = 0
    words_count = 0
    for msg_id, text in message_records:
        if text:
            # \X matches one extended grapheme cluster, so multi-codepoint
            # emoji are stored as a single emoticon.
            # NOTE(review): emoji.UNICODE_EMOJI changed shape / was removed
            # in newer releases of the emoji package -- confirm the pinned
            # version.
            for emoticon in regex.findall(r'\X', text):
                if any(char in emoji.UNICODE_EMOJI for char in emoticon):
                    self.mycursor.execute(
                        "INSERT INTO emoticons (message_id, emoticon) VALUES (%s, %s);",
                        (msg_id, emoticon))
                    emoticons_count += 1

            normalized = re.sub(r'[ёЁ]', 'е', text)
            sentences = [
                word_list
                for word_list in map(word_finder.findall,
                                     sentence_splitter.split(normalized))
                if word_list
            ]
            if sentences:
                for pr_sentence in pr.predict_sentences(sentences=sentences):
                    for pr_word in pr_sentence:
                        self.mycursor.execute(
                            "INSERT INTO words (message_id, word, normal_form, pos, tag) "
                            "VALUES (%s, %s, %s, %s, %s);",
                            (msg_id, pr_word.word, pr_word.normal_form,
                             pr_word.pos, pr_word.tag))
                        words_count += 1

        self.mycursor.execute(
            "UPDATE messages SET handled = %s WHERE id = %s", (True, msg_id))
        # Commit per message so the inserted rows and the handled flag of a
        # message are persisted together.
        self.mydb.commit()

    return len(message_records), words_count, emoticons_count
class Preprocessor():
    """Split texts into pause-delimited phrases and tag every word.

    Punctuation is turned into ``<sil>`` markers, the remaining words are
    tagged with ``RNNMorphPredictor`` and returned as ``[word, 'POS tag']``
    pairs interleaved with ``['<sil>', 'SIL _']`` pairs.
    """

    def __init__(self, batch_size=1):
        """Create the Russian predictor; ``batch_size`` is used for tagging."""
        self.batch_size = batch_size
        self.predictor = RNNMorphPredictor(language="ru")

    def __del__(self):
        if hasattr(self, 'predictor'):
            del self.predictor

    def __copy__(self):
        """Shallow copy that keeps every attribute, sharing the predictor."""
        cls = self.__class__
        result = cls.__new__(cls)
        # Copy all attributes (previously batch_size was lost, which made
        # the copy unusable for tagging).
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memodict=None):
        """Deep copy everything except the predictor, which stays shared."""
        from copy import deepcopy

        if memodict is None:
            memodict = {}
        cls = self.__class__
        result = cls.__new__(cls)
        memodict[id(self)] = result
        for name, value in self.__dict__.items():
            if name == 'predictor':
                result.__dict__[name] = value
            else:
                result.__dict__[name] = deepcopy(value, memodict)
        return result

    def gettags(self, texts):
        """Tag tokenised texts.

        Args:
            texts (list): One entry per text; each entry is a sequence of
                tokens in which ``<sil>`` marks a phrase boundary.

        Raises:
            ValueError: If ``texts`` is not a list.

        Returns:
            list: For every text a list of ``[word, 'POS tag']`` pairs that
            starts with ``['<sil>', 'SIL _']`` and has another such pair
            after every non-empty phrase.
        """
        if not isinstance(texts, list):
            raise ValueError('Expected `{0}`, but got `{1}`.'.format(
                type([1, 2]), type(texts)))
        if not texts:
            return []

        # For every text: the non-empty phrases, each already split into
        # words. The same word lists feed the tagger and the slicing below,
        # so the two can never disagree about how many words a phrase has.
        phrases_per_text = []
        sentences = []
        for cur_text in texts:
            phrases = [
                chunk.split() for chunk in ' '.join(cur_text).split('<sil>')
            ]
            phrases = [phrase for phrase in phrases if phrase]
            phrases_per_text.append(phrases)
            if phrases:
                sentences.append(
                    [word for phrase in phrases for word in phrase])

        if sentences:
            all_forms = self.predictor.predict_sentences(
                sentences, batch_size=self.batch_size)
        else:
            all_forms = []

        all_words_and_tags = []
        sentence_ind = 0
        for phrases in phrases_per_text:
            words_and_tags = [['<sil>', 'SIL _']]
            if phrases:
                forms = all_forms[sentence_ind]
                sentence_ind += 1
                start = 0
                for phrase in phrases:
                    stop = start + len(phrase)
                    for form in forms[start:stop]:
                        words_and_tags.append(
                            [form.word, form.pos + ' ' + form.tag])
                    words_and_tags.append(['<sil>', 'SIL _'])
                    start = stop
            all_words_and_tags.append(words_and_tags)
        return all_words_and_tags

    def preprocessing(self, texts):
        """Normalise raw texts and tag them.

        Args:
            texts: A list of raw strings. A single string is also accepted
                and is treated as a one-element list.

        Returns:
            list: The result of :meth:`gettags` for the normalised texts.
        """
        def prepare(src):
            """Replace punctuation with <sil>; remove special symbols."""
            dst = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', src.lower())
            dst = sub(r' [–-] |\n', ' <sil> ', dst)
            dst = sub(r'\s{2,}', ' ', dst)
            dst = sub(r'^\s|(?<!\w)[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '', dst)
            return dst.strip().split(' ')

        if isinstance(texts, str):
            # Iterating a bare string would process it character by
            # character, which is never what the caller wants.
            texts = [texts]
        words_and_tags = self.gettags([prepare(cur) for cur in texts])
        return words_and_tags
class Preprocessor():
    """Turn raw text into words with morphological tags and pause markers.

    Punctuation becomes ``<sil>``; all remaining words are tagged with
    ``RNNMorphPredictor`` for Russian.
    """

    def __init__(self, batch_size=1):
        """Create the preprocessor.

        Args:
            batch_size (int, optional): Batch size passed to the predictor
                when tagging. Defaults to 1.
        """
        self.batch_size = batch_size
        self.predictor = RNNMorphPredictor(language="ru")

    def __del__(self):
        if hasattr(self, 'predictor'):
            del self.predictor

    def __copy__(self):
        """Return a shallow copy that shares the predictor."""
        cls = self.__class__
        result = cls.__new__(cls)
        # Carry over every attribute; dropping batch_size used to break
        # tagging on the copy.
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memodict=None):
        """Return a deep copy whose predictor is still the shared instance."""
        from copy import deepcopy

        memodict = {} if memodict is None else memodict
        cls = self.__class__
        result = cls.__new__(cls)
        memodict[id(self)] = result
        for name, value in self.__dict__.items():
            result.__dict__[name] = (
                value if name == 'predictor' else deepcopy(value, memodict))
        return result

    def gettags(self, texts: list) -> list:
        """Get morpho tags for the `texts`

        Args:
            texts (list): List of lists; every inner list holds the tokens
                of one text, with ``<sil>`` marking phrase boundaries.

        Raises:
            ValueError: If ``texts`` is not a list.

        Returns:
            list: list of lists -- words and morpho tags

        Example:
            PreProcess.gettags([['я купил самолёт и ракеты'], ['ух ты']])

            [[['<sil>', 'SIL _'],
              ['я', 'PRON Case=Nom|Number=Sing|Person=1'],
              ['купил', 'VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act'],
              ['самолёт', 'NOUN Case=Acc|Gender=Masc|Number=Sing'],
              ['и', 'CONJ _'],
              ['ракеты', 'NOUN Case=Acc|Gender=Fem|Number=Plur'],
              ['<sil>', 'SIL _']],
             [['<sil>', 'SIL _'],
              ['ух', 'INTJ _'],
              ['ты', 'PRON Case=Nom|Number=Sing|Person=2'],
              ['<sil>', 'SIL _']]]
        """
        if not isinstance(texts, list):
            raise ValueError(
                f'Expected `{type([1, 2])}`, but got `{type(texts)}`.')
        if not texts:
            return []

        # Word lists of the non-empty phrases of every text. They are used
        # both as tagger input and for slicing its output, which keeps the
        # word counts consistent even when a phrase contains double spaces.
        phrases_per_text = []
        sentences = []
        for cur_text in texts:
            phrases = []
            for chunk in ' '.join(cur_text).split('<sil>'):
                phrase_words = chunk.split()
                if phrase_words:
                    phrases.append(phrase_words)
            phrases_per_text.append(phrases)
            if phrases:
                sentences.append(
                    [word for phrase in phrases for word in phrase])

        all_forms = []
        if sentences:
            all_forms = self.predictor.predict_sentences(
                sentences, batch_size=self.batch_size)

        all_words_and_tags = []
        sentence_ind = 0
        for phrases in phrases_per_text:
            words_and_tags = [['<sil>', 'SIL _']]
            if phrases:
                forms = all_forms[sentence_ind]
                sentence_ind += 1
                token_ind = 0
                for phrase in phrases:
                    analysis = forms[token_ind:token_ind + len(phrase)]
                    words_and_tags.extend(
                        [word.word, word.pos + ' ' + word.tag]
                        for word in analysis)
                    words_and_tags.append(['<sil>', 'SIL _'])
                    token_ind += len(phrase)
            all_words_and_tags.append(words_and_tags)
        return all_words_and_tags

    def __call__(self, texts):
        """Call the instance like function. Use in pipelines, too.

        Returns the tagged result for the first (or only) text.
        """
        return self.preprocessing(texts)[0]

    def preprocessing(self, texts) -> list:
        """Normalise raw texts and tag them.

        Args:
            texts: Texts to preprocess -- a list of strings, or a single
                string, which is treated as a one-element list.

        Returns:
            list: A list of processed words and tags, one entry per text.
        """
        def prepare(text: str) -> list:
            """Replace punctuation marks with <sil> tag; remove special symbols."""
            text = sub(r'[\.\,\?\!\(\);:]+', ' <sil>', text.lower())
            text = sub(r' [–-] |\n', ' <sil> ', text)
            text = sub(r'\s{2,}', ' ', text)
            text = sub(r'^\s|(?<!\w)[\\\/@#~¬`£€\$%\^\&\*–_=+\'\"\|«»–-]+', '', text)
            return text.strip().split(' ')

        if isinstance(texts, str):
            # A bare string would otherwise be iterated character by
            # character and only its first character would be tagged.
            texts = [texts]
        return self.gettags([prepare(cur) for cur in texts])
from rnnmorph.predictor import RNNMorphPredictor
from pprint import pprint


if __name__ == '__main__':
    # Small demo: tag one flat word list, then one batch of sentences.
    predictor = RNNMorphPredictor(language='ru')

    for form in predictor.predict(words=['мама', 'мыла', 'раму']):
        print(f'{form.normal_form:<15} {form.pos:<10} {form.tag}')

    forms = predictor.predict_sentences(sentences=[['Всем', 'привет']])
    first_sentence = forms[0]
    for form in first_sentence:
        print(f'{form.normal_form:<15} {form.pos:<10} {form.tag}')

    # Dump the raw prediction objects for the whole batch.
    pprint(forms)
for tt1, tt2 in zip(t1, t2) if tt1[1] == tt2[1])) num_pred = sum(1 for s in test_sents for _ in s) print(f"{true_pred / num_pred * 100:.1f}") # In[34]: get_ipython().run_cell_magic('capture', '', '!pip install -q rnnmorph') # In[35]: from rnnmorph.predictor import RNNMorphPredictor predictor = RNNMorphPredictor(language="en") # In[36]: rnnmorph_result = predictor.predict_sentences( [list(map(lambda t: t[0], s)) for s in test_sents]) # In[37]: true_pred = sum((1 for t1, t2 in zip(rnnmorph_result, test_sents) for tt1, tt2 in zip(t1, t2) if tt1.pos == tt2[1])) num_pred = sum(1 for s in test_sents for _ in s) print(f"{true_pred / num_pred * 100:.1f}") # ### Вопрос 7: # * Какое качество вы получили, используя каждую из двух библиотек? Сравните их результаты. # # * Качество с библиотекой rnnmorph должно быть хуже, так как там используется немного другая система тэгов. Какие здесь отличия? # >nltk_model = 89.2, rnnmorph_model = 63.2 #