Beispiel #1
0
 def get_extractors(cls):
     """Return the extractor list cached on ``cls``, building it on first use.

     A truthy ``cls._extractors`` is returned as is. Otherwise a dates
     extractor and a money extractor are created from
     ``cls.get_morph_vocab()``, stored in ``cls._extractors`` and returned.
     """
     cached = getattr(cls, "_extractors", None)
     if cached:
         return cached

     vocab = cls.get_morph_vocab()
     built = [DatesExtractor(vocab), MoneyExtractor(vocab)]
     cls._extractors = built
     return built
 def parse_cash(self):
     """Extract monetary amounts from ``self._text``.

     :return: list with one dict per match; each dict holds the keys
         ``integer``, ``coins``, ``fraction``, ``multiplier`` and
         ``currency`` looked up in the match's ``fact.as_json``
         (``None`` for keys that are absent there).
     """
     fields = ('integer', 'coins', 'fraction', 'multiplier', 'currency')
     amounts = []
     for match in MoneyExtractor()(self._text):
         amounts.append(
             {field: match.fact.as_json.get(field) for field in fields})
     return amounts
Beispiel #3
0
def get_extractor(extract_type):
    """Create a new extractor for the requested kind of entity.

    :param extract_type: one of ``"name"``, ``"location"``, ``"date"``
        or ``"money"``
    :return: a freshly constructed extractor, or ``None`` when
        ``extract_type`` matches none of the known kinds
    """
    # Factories are wrapped in lambdas so that only the requested extractor
    # class is looked up and instantiated.
    factories = (
        ("name", lambda: NamesExtractor()),         # person names
        ("location", lambda: LocationExtractor()),  # places
        ("date", lambda: DatesExtractor()),         # dates
        ("money", lambda: MoneyExtractor()),        # monetary amounts
    )
    for kind, build in factories:
        if extract_type == kind:
            return build()
    return None
Beispiel #4
0
def natasha_res(text):
    """Run five natasha extractors over ``text`` and group the results.

    :param text: text to analyse
    :return: dict with the keys ``'names'``, ``'locations'``,
        ``'organisations'``, ``'dates'`` and ``'money'``; every value is a
        set. All keys except ``'dates'`` hold the substrings of ``text``
        covered by each match's span; ``'dates'`` holds whatever
        ``dates_mapper`` returns for each match.
    """
    names = NamesExtractor()
    locations = LocationExtractor()
    organisations = OrganisationExtractor()
    dates = DatesExtractor()
    money = MoneyExtractor()

    def span_text(match):
        # Slice of the source text covered by the match.
        return text[match.span[0]:match.span[1]]

    def collect(mapper, extractor):
        # Apply the extractor to the text and map every match into a set.
        return set(map(mapper, extractor(text)))

    # NOTE(review): `dates_mapper` is not defined inside this function; it has
    # to come from an enclosing scope, otherwise this raises NameError.
    return {
        'names': collect(span_text, names),
        'locations': collect(span_text, locations),
        'organisations': collect(span_text, organisations),
        'dates': collect(dates_mapper, dates),
        'money': collect(span_text, money),
    }
    def __init__(self):
        """Create the natasha extractors used by this object.

        ``self.extractors`` receives one instance each of the address, dates,
        location, money, money-range, money-rate, organisation and person
        extractors, in that order.
        """
        # Imported lazily so natasha is only loaded when an instance is made.
        from natasha import (NamesExtractor, SimpleNamesExtractor,
                             DatesExtractor, MoneyExtractor,
                             MoneyRateExtractor, MoneyRangeExtractor,
                             LocationExtractor, AddressExtractor,
                             OrganisationExtractor, PersonExtractor)

        # A SimpleNamesExtractor used to be built here as well, but it was
        # never added to the list, so the wasted construction was removed.
        self.extractors = [
            AddressExtractor(),
            DatesExtractor(),
            LocationExtractor(),
            MoneyExtractor(),
            MoneyRangeExtractor(),
            MoneyRateExtractor(),
            OrganisationExtractor(),
            PersonExtractor(),
        ]
Beispiel #6
0
def getSalary(text):
    """Try several salary detectors in turn and return the first hit.

    Returns ``{}`` when ``text`` is ``None``. Otherwise the detectors are
    tried in this order: ``searchSalary4``, ``MoneyRateExtractor``,
    ``searchSalary2``, ``searchSalary5``, ``searchSalary3``,
    ``MoneyExtractor``, ``searchSalary1``. The ``searchSalary*`` helpers are
    defined elsewhere; a non-``None`` result from them is returned unchanged.

    The extractor branches build the result themselves: the money-rate branch
    returns ``value``, ``currency`` and ``period``; the plain money branch
    returns ``value`` and ``currency``. Only the first match is used.

    NOTE(review): the visible code ends right after the ``searchSalary1``
    call, so what is done with that last result is not shown here.
    """
    if text == None:
        return {}

    salary = searchSalary4(text)
    if salary != None:
        return salary

    # Money-per-period extractor; only its first match is considered.
    extractorRate = MoneyRateExtractor()
    matchesRate = extractorRate(
        text)  #todo check why matchesRate[0] can be None
    # Guard against an empty result, a None first match and a missing fact
    # before reading the nested money fields.
    if len(matchesRate
           ) != 0 and matchesRate[0] != None and 'fact' in matchesRate[
               0] and matchesRate[0].fact != None:
        return {
            "value": matchesRate[0].fact.money.integer,
            "currency": matchesRate[0].fact.money.currency,
            "period": matchesRate[0].fact.period
        }
    salary = searchSalary2(text)
    if salary != None:
        return salary
    salary = searchSalary5(text)
    if salary != None:
        return salary
    salary = searchSalary3(text)
    if salary != None:
        return salary

    # Plain money extractor as a later fallback; again only the first match.
    extractor = MoneyExtractor()
    matches = extractor(text)

    if len(matches) != 0:
        return {
            "value": matches[0].fact.integer,
            "currency": matches[0].fact.currency
        }

    # Last detector in the visible chain (the snippet is cut after this line).
    salary = searchSalary1(text)
Beispiel #7
0
def send_db_pool_map(doc):
    """Run every natasha extractor over ``doc`` and collect the matches.

    For each extractor the matches are passed through ``get_ne2`` (together
    with ``doc``), ``get_span_ne`` and ``get_type_ne`` on a local pool of
    10 threads. Those helpers are defined elsewhere in the file.

    Side effect: the module-level ``text_paper`` is set to ``doc``.

    :param doc: text of one document
    :return: ``[entities, spans, types]`` where each item is the
        concatenation of the per-extractor result lists, or ``[0, 0, 0]``
        when no extractor produced a result list (for example because the
        first extractor failed)
    """
    global text_paper

    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]

    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            text_paper = doc
            matches = extr(text_paper)
            ne = pool_local.starmap(get_ne2, zip(matches, [doc] * len(matches)))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)

            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception as exc:
        # Best effort: keep what was collected so far, but report the cause.
        # KeyboardInterrupt/SystemExit are deliberately no longer swallowed.
        print('Ошибка! Примерный номер =', '?', repr(exc))
    finally:
        # Always release the worker threads, even when an error propagates.
        pool_local.close()
        pool_local.join()

    if not ne_full:
        return [0, 0, 0]

    # Flatten the per-extractor lists into one list per kind of result.
    ne_for_db = reduce(lambda x, y: x + y, ne_full)
    span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
    type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
    return [ne_for_db, span_ne_for_db, type_ne_for_db]
Beispiel #8
0
import myextractors

# Result accumulators: `status` starts at 1 and `res` starts empty; the code
# that updates them is beyond the end of this snippet.
status = 1
res = {}

# natasha pipeline components; the tagger, parser and NER models all share
# the same news embedding.
segmenter = Segmenter()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()

# Rule-based fact extractors built on the morphological vocabulary.
names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)

# Hard-coded sample input.
text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...'

docType = 'coast'

# Run the pipeline in order: segmentation, morphology, syntax, NER.
doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)

# Normalise every NER span.
for span in doc.spans:
    span.normalize(morph_vocab)

# for a court order
Beispiel #9
0
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

# Flask application object for this module.
app = Flask(__name__)

# TODO: LocationExtractor performs poorly and needs tuning,
# TODO: but it can find cities, countries and regions.
# TODO: AddressExtractor finds and represents cities better,
# TODO: but it does not find countries or regions. It also finds streets
# TODO: and houses; these must be excluded from the output and the results
# TODO: merged with those of LocationExtractor.

# Module-level extractor instances, created once and shared.
names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Find named entities in a text.

    :param ner_extractor: extractor object (e.g. NamesExtractor,
        AddressExtractor, DatesExtractor or MoneyExtractor); it is called
        with the text and must yield matches that expose ``fact.as_json``
    :param text: str
    :return: list with the ``fact.as_json`` value of every match, in the
        order the extractor produced them
    """
    facts = []
    for match in ner_extractor(text):
        facts.append(match.fact.as_json)
    return facts

Beispiel #10
0
        '''if len(ne_for_db) != 0:
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;', (ne_for_db, span_ne_for_db,
                                         type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]



if __name__ == '__main__':
    time_begin = time()
    # экстракторы
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]
    send_to_db_news_rbc(extractors) # 292.92 секунды
    # ошибки: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1",
                           port="5432", )
    cur = con.cursor()
    pool = ThreadPool(10) # было ошибок 8  - 2459? 2779 = []
    for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал
        # обработало 5839 строк, из них 120 строк не обработаных
        cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
        data = cur.fetchall()
        docs = [x[0] for x in data]
        #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)]))
        new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
Beispiel #11
0
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json

from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# One shared instance of each natasha extractor, created at module level.
# NOTE(review): the code that iterates this list is not part of the visible
# snippet.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

# Flask application object; the route below is registered on it.
app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    """Handle ``POST /getFacts``.

    Reads the JSON request body and takes the text to analyse from its
    ``'text'`` key.

    NOTE(review): the visible code ends after ``facts`` is initialised; the
    extraction itself and the response are not shown here.
    """
    # Debug output: whether the request declared a JSON body.
    print(request.is_json)
    content = request.get_json()

    text = content['text']

    # Accumulator for the extracted facts (filled beyond this snippet).
    facts = {}
Beispiel #12
0
def extractor():
    """Return a new ``MoneyExtractor`` constructed without arguments."""
    money = MoneyExtractor()
    return money
def money_extractor(morph_vocab):
    """Return a new ``MoneyExtractor`` built on the given ``morph_vocab``."""
    money = MoneyExtractor(morph_vocab)
    return money
Beispiel #14
0
    'КАС',
    re.compile(
        r'кодекс[а-я]*\s+(об\s+)?административн[а-я]*\s+правонарушени[а-я]*', re.IGNORECASE | re.MULTILINE):
    'КоАП',
}

# Pairs of (pattern built by make_regex, abbreviation) for the long names of
# legal-entity forms. The specific "open / closed / public joint-stock
# company" patterns come before the generic "joint-stock company" one.
# NOTE(review): make_regex and the code consuming this list are outside this
# snippet; presumably the order matters because the generic pattern also
# matches inside the specific ones. Confirm against the caller.
abbrs = [
    (make_regex(r'обществ\w+ с ограниченной ответственностью'), 'ООО'),
    (make_regex(r'открыт\w+ акционерн\w+ обществ\w+'), 'ОАО'),
    (make_regex(r'закрыт\w+ акционерн\w+ обществ\w+'), 'ЗАО'),
    (make_regex(r'публичн\w+ акционерн\w+ обществ\w+'), 'ПАО'),
    (make_regex(r'акционерн\w+ обществ\w+'), 'АО'),
    (make_regex(r'федеральн\w+ казенн\w+ учрежден\w+'), 'ФКУ'),
]

# Module-level natasha extractor instances.
money = MoneyExtractor()
dates = DatesExtractor()
org = OrganisationExtractor()

# A whitespace character followed by three or more single Cyrillic letters
# separated by whitespace and ended by one non-word character. Group 1 holds
# everything after the leading whitespace. Matching is case-insensitive.
CAP_SPACES = re.compile(r'\s((?:[А-Я]\s+){2,}[А-Я][^\w])', re.IGNORECASE)


def fix_cap_spaces(text: str):
    """Glue letter-spaced words (e.g. ``П Р И К А З.``) back together.

    Every match of ``CAP_SPACES`` is replaced by a space, the matched run
    with its space characters removed, and another space. Only ``' '`` is
    removed from the run; other whitespace inside it is kept.
    """

    def _squeeze(match):
        spaced = match.group(1)
        return ' ' + ''.join(spaced.split(' ')) + ' '

    return CAP_SPACES.sub(_squeeze, text)


def remove_newlines(text: str):
    """Join lines that were broken after ordinary text.

    Wherever a Cyrillic letter, digit, comma, double quote, guillemet or
    parenthesis is followed by optional whitespace and one or more newlines,
    that whitespace and the newlines are replaced by a single space.
    """
    line_break = r'([а-яА-Я,"«»()0-9])\s*\n+'
    return re.sub(line_break, r'\1 ', text, flags=re.MULTILINE)
Beispiel #15
0
def Main(docType, text):
    """Extract the entities expected for a document of type ``docType``.

    The text is run through the natasha pipeline (segmentation, morphology,
    syntax, NER, span normalisation). Depending on ``docType`` a fixed list
    of fields is then extracted, in this order:

    * ``'coast'`` (court order): person names, INN, court order number,
      court order date, organisations;
    * ``'mail'`` (letter): person names, INN, contract number, contract date;
    * ``'order'`` (payment order): person names, INN, organisations,
      contract number, contract date, money amounts.

    Any other ``docType`` extracts nothing and is reported as success.

    :param docType: document type, see above
    :param text: document text
    :return: dict with ``'status'`` (``'успех'`` when every expected field
        was found, otherwise ``'не успех'``) and ``'entities'`` (the fields
        that were found, keyed by their Russian labels)
    """
    # NOTE: all models are constructed on every call, as in the original.
    segmenter = Segmenter()

    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()

    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)

    for span in doc.spans:
        span.normalize(morph_vocab)

    def find_persons():
        # Parse every PER span into a name fact. Spans for which no fact was
        # produced are skipped instead of failing on `None.as_dict`.
        persons = [span for span in doc.spans if span.type == PER]
        for span in persons:
            span.extract_fact(names_extractor)
        return [span.fact.as_dict for span in persons if span.fact]

    def find_organisations():
        return [{'name': span.text} for span in doc.spans if span.type == ORG]

    def find_money():
        return [
            {'amount': match.fact.amount, 'currency': match.fact.currency}
            for match in money_extractor(text)
        ]

    def find_inn():
        return myextractors.findINN(text)

    def find_contract_number():
        return myextractors.findNCONTRACT(text)

    def find_contract_date():
        return myextractors.findDATECONT(text)

    # (label, extractor) pairs in output order for the given document type.
    if docType == 'coast':
        # court order
        fields = [
            ('ФИО', find_persons),
            ('ИНН', find_inn),
            ('номер судебного приказа',
             lambda: myextractors.findNCOASTCASE(text)),
            ('дата судебного приказа',
             lambda: myextractors.findDATECOAST(text)),
            ('организации', find_organisations),
        ]
    elif docType == 'mail':
        # letter
        fields = [
            ('ФИО', find_persons),
            ('ИНН', find_inn),
            ('номер договора', find_contract_number),
            ('дата договора', find_contract_date),
        ]
    elif docType == 'order':
        # payment order
        fields = [
            ('ФИО', find_persons),
            ('ИНН', find_inn),
            ('организации', find_organisations),
            ('номер договора', find_contract_number),
            # Fixed: this step used to repeat the contract-number lookup and
            # overwrite 'номер договора', so the date was never extracted.
            ('дата договора', find_contract_date),
            ('сумма', find_money),
        ]
    else:
        fields = []

    res = {}
    status = 1
    for label, extract in fields:
        value = extract()
        if value:
            res[label] = value
        else:
            # A missing expected field marks the whole result as failed.
            status = 0

    returning = {}
    returning['status'] = 'успех' if status == 1 else 'не успех'
    returning['entities'] = res
    return returning
Beispiel #16
0
# Extend the stop-word list (defined above this snippet) with a "не_"-prefixed
# variant of every existing stop word, and then with "не" itself.
STOP_WORDS += ["не_" + x for x in STOP_WORDS]
STOP_WORDS += ["не"]

# pronoun-noun, preposition, conjunction, particle, interjection
STOP_POS = ["NPRO", "PREP", "CONJ", "PRCL", "INTJ"]

# Translation table mapping every ASCII punctuation character except "<" and
# ">", plus the extra symbols listed below, to a space.
# NOTE(review): "<" and ">" are presumably kept because the placeholder tags
# in NERS use them; confirm against the code that applies this table.
CHAR_TABLE = str.maketrans({
    key: " "
    for key in string.punctuation.replace("<", "").replace(">", "") +
    "…?‘«»‘♂️”“’[]'™"
})
# Pairs of (extractor instance, placeholder tag). Two different date
# extractors share the "<date>" tag.
# NOTE(review): how the pairs are applied is not visible in this snippet.
NERS = [
    (DatesExtractor(), "<date>"),
    (TimeExtractor(), "<time>"),
    (ExtraDatesExtractor(), "<date>"),
    (MoneyExtractor(), "<money>"),
    (PhoneExtractor(), "<phone>"),
    (PhotoExtractor(), "<photo>"),
    (StickerExtractor(), "<sticker>"),
    (LinkExtractor(), "<url>"),
    (AddressExtractor(), "<address>"),
    (NamesExtractor(), "<name>"),
    (NumberExtractor(), "<number>"),
    (CensorExtractor(), "<censored>"),
]


class EntityExtractor(Component):
    """Распознает именованные сущности"""
    def __init__(self, cache=None, verbose=False, **kwargs):
        super(EntityExtractor, self).__init__()