Esempio n. 1
0
def __tag_text(text):
    """Wrap *text* in a ``Doc``, segment it and run NER tagging over it.

    A new segmenter, embedding and NER tagger are created on every call.
    Returns the annotated ``Doc``.
    """
    tagged = Doc(text)
    tagged.segment(Segmenter())
    tagged.tag_ner(NewsNERTagger(NewsEmbedding()))
    return tagged
Esempio n. 2
0
 def __init__(self):
     """Create the analysers this object keeps as attributes.

     One ``NewsEmbedding`` is built and handed to both the morphology
     tagger and the NER tagger; it is also kept on ``self.emb``.
     """
     self.morph = pymorphy2.MorphAnalyzer()
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()

     # Single embedding instance shared by the two taggers below.
     shared_embedding = NewsEmbedding()
     self.emb = shared_embedding
     self.morph_tagger = NewsMorphTagger(shared_embedding)
     self.ner_tagger = NewsNERTagger(shared_embedding)
Esempio n. 3
0
 def __init__(self):
     """Set up the segmenter, morph vocabulary and embedding-based models.

     The morphology tagger, NER tagger and syntax parser all receive the
     same ``NewsEmbedding`` instance, which is also stored on ``self.emb``.
     """
     self.segmenter = Segmenter()
     self.morph_vocab = MorphVocab()

     # Build the embedding once, then feed it to every model that needs it.
     news_emb = NewsEmbedding()
     self.emb = news_emb
     self.morph_tagger = NewsMorphTagger(news_emb)
     self.ner_tagger = NewsNERTagger(news_emb)
     self.syntax_parser = NewsSyntaxParser(news_emb)
Esempio n. 4
0
 def get_ner_tagger(cls):
     """Return the NER tagger stored on *cls*, building it on first use.

     The tagger is looked up as ``cls._ner_tagger``. When that attribute is
     missing or falsy, a ``NewsNERTagger`` is created from
     ``cls.get_embedding()``, stored on the class and returned.
     """
     cached = getattr(cls, "_ner_tagger", None)
     if cached:
         return cached

     # Nothing usable cached yet: build the tagger and remember it.
     created = NewsNERTagger(cls.get_embedding())
     cls._ner_tagger = created
     return created
Esempio n. 5
0
 def __init__(self):
     """Create the NLP components and an empty ``doc`` list.

     The morphology tagger, syntax parser and NER tagger share one
     ``NewsEmbedding``; the names extractor is built on the morph
     vocabulary.
     """
     self.segmenter = Segmenter()

     vocab = MorphVocab()
     self.morph_vocab = vocab

     # One embedding instance backs all three embedding-based models.
     embedding = NewsEmbedding()
     self.emb = embedding
     self.morph_tagger = NewsMorphTagger(embedding)
     self.syntax_parser = NewsSyntaxParser(embedding)
     self.ner_tagger = NewsNERTagger(embedding)

     self.names_extractor = NamesExtractor(vocab)
     self.doc = []
     self.term_extractor = TermExtractor()
Esempio n. 6
0
 def __init__(self, text):
     """Analyse *text* and cache its tokens grouped by part of speech.

     The text is wrapped in a ``Doc`` and then segmented, morph-tagged,
     lemmatized, syntax-parsed and NER-tagged; every NER span is
     normalized afterwards.

     Attributes set:
         doc: the annotated ``Doc``.
         words: tokens whose ``pos`` is neither ``'X'`` nor ``'PUNCT'``.
         tokens_nouns: tokens whose ``pos`` is ``'NOUN'`` or ``'PROPN'``.
         tokens_adjs: tokens whose ``pos`` is ``'ADJ'``.
         tokens_verbs: tokens whose ``pos`` is ``'VERB'``.
     """
     # Build the embedding once and share it: the morph tagger, syntax
     # parser and NER tagger all take the same kind of embedding, so there
     # is no need to construct it three separate times.
     embedding = NewsEmbedding()
     morph_vocab = MorphVocab()

     self.doc = Doc(text)
     self.doc.segment(Segmenter())
     self.doc.tag_morph(NewsMorphTagger(embedding))
     for token in self.doc.tokens:
         token.lemmatize(morph_vocab)
     self.doc.parse_syntax(NewsSyntaxParser(embedding))
     self.doc.tag_ner(NewsNERTagger(embedding))
     for span in self.doc.spans:
         span.normalize(morph_vocab)

     tokens = self.doc.tokens
     self.words = tuple(t for t in tokens if t.pos not in ('X', 'PUNCT'))
     self.tokens_nouns = tuple(t for t in tokens if t.pos in ('NOUN', 'PROPN'))
     self.tokens_adjs = tuple(t for t in tokens if t.pos == 'ADJ')
     self.tokens_verbs = tuple(t for t in tokens if t.pos == 'VERB')
Esempio n. 7
0
from natasha import (Segmenter, NewsEmbedding, NewsMorphTagger,
                     NewsSyntaxParser, NewsNERTagger, MorphVocab, PER, ORG,
                     NamesExtractor, MoneyExtractor, Doc)

import myextractors

# Module-level state; neither name is read in the lines visible here.
# NOTE(review): presumably a success flag and a result accumulator that the
# truncated loop further down fills in — confirm against the rest of the file.
status = 1
res = {}

# Splitter handed to doc.segment() below.
segmenter = Segmenter()

# A single embedding instance is shared by the morphology tagger, the syntax
# parser and the NER tagger.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()

# Both extractors are built on the morph vocabulary.
names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)

# Hard-coded sample input for this script.
text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...'

# NOTE(review): document-type selector; not read in the visible lines —
# presumably consumed by code past the end of this chunk.
docType = 'coast'

# Annotate the sample text: segmentation, then morphology, syntax and NER.
doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)

for span in doc.spans:
Esempio n. 8
0
def ner_tagger(embedding):
    """Build and return a ``NewsNERTagger`` backed by *embedding*."""
    tagger = NewsNERTagger(embedding)
    return tagger
Esempio n. 9
0
    NEWS_MORPH = os.path.join(sys._MEIPASS, "slovnet_morph_news_v1.tar")
    NEWS_SYNTAX = os.path.join(sys._MEIPASS, "slovnet_syntax_news_v1.tar")
    NEWS_NER = os.path.join(sys._MEIPASS, "slovnet_ner_news_v1.tar")
    DICTS = os.path.join(sys._MEIPASS, "dicts")
else:
    NEWS_EMBEDDING = os.path.join("navec_news_v1_1B_250K_300d_100q.tar")
    NEWS_MORPH = os.path.join("slovnet_morph_news_v1.tar")
    NEWS_SYNTAX = os.path.join("slovnet_syntax_news_v1.tar")
    NEWS_NER = os.path.join("slovnet_ner_news_v1.tar")
    DICTS = "dicts"

# Build the models from the archive paths selected by the branch above.
# The one embedding instance is shared by the morphology tagger, the syntax
# parser and the NER tagger.
emb = NewsEmbedding(path=NEWS_EMBEDDING)
morph_tagger = NewsMorphTagger(emb, path=NEWS_MORPH)
segmenter = Segmenter()
syntax_parser = NewsSyntaxParser(emb, path=NEWS_SYNTAX)
ner_tagger = NewsNERTagger(emb, path=NEWS_NER)
# NOTE(review): the meaning of this sentinel is not visible in this chunk —
# presumably an id standing for the narrator; confirm against its uses.
NARRATOR = -1

DETPRON = {
    "Fem": {
        '3': ["ее", "её"],
        '1': [
            'мой', 'моя', 'моё', 'мое', 'мои', 'моего', 'моей', 'моих',
            'моему', 'моим', 'мою', 'моим', 'моею', 'моими', 'моем', 'моём'
        ]
    },
    "Masc": {
        '3': ['его'],
        '1': [
            'мой', 'моя', 'моё', 'мое', 'мои', 'моего', 'моей', 'моих',
            'моему', 'моим', 'мою', 'моим', 'моею', 'моими', 'моем', 'моём'
Esempio n. 10
0
def _person_facts(doc, names_extractor):
    """Run the names extractor on every PER span and return the facts as dicts."""
    facts = []
    for span in doc.spans:
        if span.type != PER:
            continue
        span.extract_fact(names_extractor)
        # A span in which the extractor finds no name keeps ``fact`` as None;
        # skip it instead of failing on ``None.as_dict``.
        if span.fact is not None:
            facts.append(span.fact.as_dict)
    return facts


def _organisation_names(doc):
    """Return ``{'name': <span text>}`` for every ORG span of *doc*."""
    return [{'name': span.text} for span in doc.spans if span.type == ORG]


def _money_amounts(money_extractor, text):
    """Return ``{'amount', 'currency'}`` dicts for every money match in *text*."""
    return [
        {'amount': match.fact.amount, 'currency': match.fact.currency}
        for match in money_extractor(text)
    ]


def Main(docType, text):
    """Extract the entities expected for a document of type *docType*.

    The text is segmented, morph-tagged, syntax-parsed and NER-tagged, and
    every NER span is normalized. Depending on *docType* a fixed list of
    fields is then extracted:

    * ``'coast'`` (court order): persons, INN, court order number, court
      order date, organisations.
    * ``'mail'`` (letter): persons, INN, contract number, contract date.
    * ``'order'`` (payment order): persons, INN, organisations, contract
      number, contract date, money amounts.

    Any other *docType* extracts nothing and is still reported as a success.

    Args:
        docType: document type selector, see above.
        text: raw document text.

    Returns:
        A dict with two keys: ``'status'`` is ``'успех'`` when every expected
        field was found and ``'не успех'`` otherwise; ``'entities'`` maps each
        found field label to its extracted value.
    """
    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()

    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    # Field producers are wrapped in lambdas so that only the extractors of
    # the selected document type are looked up and run, in output order.
    persons = ('ФИО', lambda: _person_facts(doc, names_extractor))
    inn = ('ИНН', lambda: myextractors.findINN(text))
    organisations = ('организации', lambda: _organisation_names(doc))
    contract_number = ('номер договора',
                       lambda: myextractors.findNCONTRACT(text))
    # The payment-order branch used to call findNCONTRACT a second time here
    # and overwrite the contract number; the contract date is what is meant.
    contract_date = ('дата договора',
                     lambda: myextractors.findDATECONT(text))

    if docType == 'coast':
        # court order
        fields = [
            persons,
            inn,
            ('номер судебного приказа',
             lambda: myextractors.findNCOASTCASE(text)),
            ('дата судебного приказа',
             lambda: myextractors.findDATECOAST(text)),
            organisations,
        ]
    elif docType == 'mail':
        # letter
        fields = [persons, inn, contract_number, contract_date]
    elif docType == 'order':
        # payment order
        fields = [
            persons,
            inn,
            organisations,
            contract_number,
            contract_date,
            ('сумма', lambda: _money_amounts(money_extractor, text)),
        ]
    else:
        fields = []

    res = {}
    complete = True
    for label, produce in fields:
        value = produce()
        if value:
            res[label] = value
        else:
            # A single missing field marks the whole extraction as failed,
            # but the remaining fields are still collected.
            complete = False

    return {
        'status': 'успех' if complete else 'не успех',
        'entities': res,
    }