def __tag_text(text):
    """Build a ``Doc`` from *text*, segment it and tag its named entities.

    A new ``Segmenter`` and a new ``NewsNERTagger`` (backed by a new
    ``NewsEmbedding``) are created on every call.

    Returns the processed ``Doc``.
    """
    document = Doc(text)

    # Segmentation has to run before NER tagging.
    splitter = Segmenter()
    document.segment(splitter)

    embedding = NewsEmbedding()
    tagger = NewsNERTagger(embedding)
    document.tag_ner(tagger)

    return document
def __init__(self):
    """Create the analyzers this object keeps as attributes.

    Sets ``morph``, ``segmenter``, ``morph_vocab``, ``emb``,
    ``morph_tagger`` and ``ner_tagger``.
    """
    self.morph = pymorphy2.MorphAnalyzer()
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()

    # A single embedding instance backs both taggers.
    embedding = NewsEmbedding()
    self.emb = embedding
    self.morph_tagger = NewsMorphTagger(embedding)
    self.ner_tagger = NewsNERTagger(embedding)
def __init__(self):
    """Create the segmentation, morphology, NER and syntax components.

    Sets ``segmenter``, ``morph_vocab``, ``emb``, ``morph_tagger``,
    ``ner_tagger`` and ``syntax_parser``.
    """
    self.segmenter = Segmenter()
    self.morph_vocab = MorphVocab()

    # All three models are constructed from the same embedding instance.
    embedding = NewsEmbedding()
    self.emb = embedding
    self.morph_tagger = NewsMorphTagger(embedding)
    self.ner_tagger = NewsNERTagger(embedding)
    self.syntax_parser = NewsSyntaxParser(embedding)
def get_ner_tagger(cls):
    """Return the NER tagger cached on *cls*, creating it on first use.

    The tagger is stored in ``cls._ner_tagger``. When that attribute is
    missing or falsy, a ``NewsNERTagger`` is built from
    ``cls.get_embedding()`` and stored before being returned.
    """
    cached = getattr(cls, "_ner_tagger", None)
    if cached:
        return cached

    tagger = NewsNERTagger(cls.get_embedding())
    cls._ner_tagger = tagger
    return tagger
def __init__(self):
    """Create the text-processing components and an empty ``doc`` list.

    Sets ``segmenter``, ``morph_vocab``, ``emb``, ``morph_tagger``,
    ``syntax_parser``, ``ner_tagger``, ``names_extractor``, ``doc`` and
    ``term_extractor``.
    """
    self.segmenter = Segmenter()

    vocab = MorphVocab()
    self.morph_vocab = vocab

    # The morph tagger, syntax parser and NER tagger share one embedding.
    embedding = NewsEmbedding()
    self.emb = embedding
    self.morph_tagger = NewsMorphTagger(embedding)
    self.syntax_parser = NewsSyntaxParser(embedding)
    self.ner_tagger = NewsNERTagger(embedding)

    self.names_extractor = NamesExtractor(vocab)
    self.doc = []
    self.term_extractor = TermExtractor()
def __init__(self, text):
    """Run the full processing pipeline over *text* and cache token views.

    The pipeline segments the text, tags morphology, lemmatizes every
    token, parses syntax, tags named entities and normalizes every span.

    Attributes set:
        doc: the processed ``Doc``.
        words: tuple of tokens whose ``pos`` is neither ``'X'`` nor
            ``'PUNCT'``.
        tokens_nouns: tuple of tokens whose ``pos`` is ``'NOUN'`` or
            ``'PROPN'``.
        tokens_adjs: tuple of tokens whose ``pos`` is ``'ADJ'``.
        tokens_verbs: tuple of tokens whose ``pos`` is ``'VERB'``.
    """
    # Build the embedding once and share it between the three models
    # instead of constructing a separate NewsEmbedding for each of them.
    embedding = NewsEmbedding()
    morph_vocab = MorphVocab()

    self.doc = Doc(text)
    self.doc.segment(Segmenter())

    self.doc.tag_morph(NewsMorphTagger(embedding))
    for token in self.doc.tokens:
        token.lemmatize(morph_vocab)

    self.doc.parse_syntax(NewsSyntaxParser(embedding))

    self.doc.tag_ner(NewsNERTagger(embedding))
    for span in self.doc.spans:
        span.normalize(morph_vocab)

    tokens = self.doc.tokens
    self.words = tuple(t for t in tokens if t.pos not in ('X', 'PUNCT'))
    self.tokens_nouns = tuple(t for t in tokens if t.pos in ('NOUN', 'PROPN'))
    self.tokens_adjs = tuple(t for t in tokens if t.pos == 'ADJ')
    self.tokens_verbs = tuple(t for t in tokens if t.pos == 'VERB')
from natasha import (Segmenter, NewsEmbedding, NewsMorphTagger, NewsSyntaxParser, NewsNERTagger, MorphVocab, PER, ORG, NamesExtractor, MoneyExtractor, Doc) import myextractors status = 1 res = {} segmenter = Segmenter() emb = NewsEmbedding() morph_tagger = NewsMorphTagger(emb) syntax_parser = NewsSyntaxParser(emb) ner_tagger = NewsNERTagger(emb) morph_vocab = MorphVocab() names_extractor = NamesExtractor(morph_vocab) money_extractor = MoneyExtractor(morph_vocab) text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...' docType = 'coast' doc = Doc(text) doc.segment(segmenter) doc.tag_morph(morph_tagger) doc.parse_syntax(syntax_parser) doc.tag_ner(ner_tagger) for span in doc.spans:
def ner_tagger(embedding):
    """Return a new ``NewsNERTagger`` constructed from *embedding*."""
    tagger = NewsNERTagger(embedding)
    return tagger
NEWS_MORPH = os.path.join(sys._MEIPASS, "slovnet_morph_news_v1.tar") NEWS_SYNTAX = os.path.join(sys._MEIPASS, "slovnet_syntax_news_v1.tar") NEWS_NER = os.path.join(sys._MEIPASS, "slovnet_ner_news_v1.tar") DICTS = os.path.join(sys._MEIPASS, "dicts") else: NEWS_EMBEDDING = os.path.join("navec_news_v1_1B_250K_300d_100q.tar") NEWS_MORPH = os.path.join("slovnet_morph_news_v1.tar") NEWS_SYNTAX = os.path.join("slovnet_syntax_news_v1.tar") NEWS_NER = os.path.join("slovnet_ner_news_v1.tar") DICTS = "dicts" emb = NewsEmbedding(path=NEWS_EMBEDDING) morph_tagger = NewsMorphTagger(emb, path=NEWS_MORPH) segmenter = Segmenter() syntax_parser = NewsSyntaxParser(emb, path=NEWS_SYNTAX) ner_tagger = NewsNERTagger(emb, path=NEWS_NER) NARRATOR = -1 DETPRON = { "Fem": { '3': ["ее", "её"], '1': [ 'мой', 'моя', 'моё', 'мое', 'мои', 'моего', 'моей', 'моих', 'моему', 'моим', 'мою', 'моим', 'моею', 'моими', 'моем', 'моём' ] }, "Masc": { '3': ['его'], '1': [ 'мой', 'моя', 'моё', 'мое', 'мои', 'моего', 'моей', 'моих', 'моему', 'моим', 'мою', 'моим', 'моею', 'моими', 'моем', 'моём'
def _collect_person_facts(doc, names_extractor):
    """Return the parsed name facts (as dicts) of the PER spans in *doc*."""
    facts = []
    for span in doc.spans:
        if span.type != PER:
            continue
        span.extract_fact(names_extractor)
        # NOTE(review): assumes extract_fact leaves span.fact empty when the
        # name grammar finds no match; such spans are skipped rather than
        # dereferenced. Confirm against the natasha version in use.
        if span.fact:
            facts.append(span.fact.as_dict)
    return facts


def _collect_org_names(doc):
    """Return ``{'name': <span text>}`` for every ORG span in *doc*."""
    return [{'name': span.text} for span in doc.spans if span.type == ORG]


def Main(docType, text):
    """Extract the entities expected for a document of type *docType*.

    Args:
        docType: ``'coast'`` (court order), ``'mail'`` (letter) or
            ``'order'`` (payment order). Any other value extracts nothing.
        text: the raw document text.

    Returns:
        A dict with two keys: ``'status'`` is ``'успех'`` when every
        field expected for the document type was found and ``'не успех'``
        otherwise; ``'entities'`` maps field names to the extracted values.
    """
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()
    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    status = 1
    res = {}

    def store(key, value):
        """Record *value* under *key*; an empty value marks the run failed."""
        nonlocal status
        if value:
            res[key] = value
        else:
            status = 0

    if docType == 'coast':
        # Court order: names, INN, order number, order date, organisations.
        store('ФИО', _collect_person_facts(doc, names_extractor))
        store('ИНН', myextractors.findINN(text))
        store('номер судебного приказа', myextractors.findNCOASTCASE(text))
        store('дата судебного приказа', myextractors.findDATECOAST(text))
        store('организации', _collect_org_names(doc))
    elif docType == 'mail':
        # Letter: names, INN, contract number, contract date.
        store('ФИО', _collect_person_facts(doc, names_extractor))
        store('ИНН', myextractors.findINN(text))
        store('номер договора', myextractors.findNCONTRACT(text))
        store('дата договора', myextractors.findDATECONT(text))
    elif docType == 'order':
        # Payment order: names, INN, organisations, contract number,
        # contract date, monetary amounts.
        store('ФИО', _collect_person_facts(doc, names_extractor))
        store('ИНН', myextractors.findINN(text))
        store('организации', _collect_org_names(doc))
        store('номер договора', myextractors.findNCONTRACT(text))
        # The contract date uses the date extractor and its own key, as in
        # the 'mail' branch; previously this step repeated the
        # contract-number lookup and never produced 'дата договора'.
        store('дата договора', myextractors.findDATECONT(text))
        store(
            'сумма',
            [
                {'amount': match.fact.amount, 'currency': match.fact.currency}
                for match in money_extractor(text)
            ],
        )

    return {
        'status': 'успех' if status == 1 else 'не успех',
        'entities': res,
    }