def get_extractors(cls):
    """Return the extractors cached on ``cls``, building them on first use.

    The list lives in ``cls._extractors``. When that attribute is missing or
    empty, a ``DatesExtractor`` and a ``MoneyExtractor`` are created from
    ``cls.get_morph_vocab()``, stored on the class and returned.
    """
    cached = getattr(cls, "_extractors", None)
    if cached:
        return cached

    vocab = cls.get_morph_vocab()
    built = [DatesExtractor(vocab), MoneyExtractor(vocab)]
    cls._extractors = built
    return built
def parse_cash(self):
    """Extract monetary amounts from ``self._text``.

    :return: list with one dict per match produced by ``MoneyExtractor``.
        Each dict carries the ``integer``, ``coins``, ``fraction``,
        ``multiplier`` and ``currency`` entries of the match's
        ``fact.as_json`` (``None`` for entries that are absent).
    """
    fields = ('integer', 'coins', 'fraction', 'multiplier', 'currency')
    found = []
    for match in MoneyExtractor()(self._text):
        # Serialise the fact once per match instead of once per field.
        payload = match.fact.as_json
        found.append({field: payload.get(field) for field in fields})
    return found
def get_extractor(extract_type):
    """Create the extractor that corresponds to ``extract_type``.

    Recognised values are ``"name"``, ``"location"``, ``"date"`` and
    ``"money"``; any other value yields ``None``.
    """
    # Names extractor
    if extract_type == "name":
        return NamesExtractor()
    # Locations extractor
    if extract_type == "location":
        return LocationExtractor()
    # Dates extractor
    if extract_type == "date":
        return DatesExtractor()
    # Money amounts extractor
    if extract_type == "money":
        return MoneyExtractor()
    return None
def natasha_res(text):
    """Run the Natasha extractors over ``text`` and collect the results as sets.

    Returns a dict with the keys ``names``, ``locations``, ``organisations``,
    ``dates`` and ``money``. Every value except ``dates`` is the set of
    substrings of ``text`` covered by the match spans; ``dates`` is whatever
    ``dates_mapper`` produces for each date match.
    """
    def matched_text(match):
        # Slice of the source text covered by the match span.
        return text[match.span[0]:match.span[1]]

    find_names = NamesExtractor()
    find_locations = LocationExtractor()
    find_organisations = OrganisationExtractor()
    find_dates = DatesExtractor()
    find_money = MoneyExtractor()

    res = {}
    res['names'] = set(map(matched_text, find_names(text)))
    res['locations'] = set(map(matched_text, find_locations(text)))
    res['organisations'] = set(map(matched_text, find_organisations(text)))
    # NOTE(review): dates_mapper is not defined inside this function;
    # presumably it is a module-level helper - confirm it exists.
    res['dates'] = set(map(dates_mapper, find_dates(text)))
    res['money'] = set(map(matched_text, find_money(text)))
    return res
def __init__(self):
    """Build the Natasha extractors and store them in ``self.extractors``.

    The list holds, in order: address, dates, location, money, money range,
    money rate, organisation and person extractors.
    """
    # natasha is imported here, i.e. when an instance is constructed.
    from natasha import (AddressExtractor, DatesExtractor, LocationExtractor,
                         MoneyExtractor, MoneyRangeExtractor,
                         MoneyRateExtractor, OrganisationExtractor,
                         PersonExtractor)

    # A SimpleNamesExtractor used to be instantiated here as well, but it was
    # never added to the list, so it is no longer built.
    self.extractors = [
        AddressExtractor(),
        DatesExtractor(),
        LocationExtractor(),
        MoneyExtractor(),
        MoneyRangeExtractor(),
        MoneyRateExtractor(),
        OrganisationExtractor(),
        PersonExtractor(),
    ]
def getSalary(text):
    """Extract salary information from ``text``.

    The strategies are tried in a fixed order and the first one that yields a
    result wins: ``searchSalary4``, ``MoneyRateExtractor``, ``searchSalary2``,
    ``searchSalary5``, ``searchSalary3`` and then ``MoneyExtractor``.

    :param text: text to analyse; ``None`` yields an empty dict.
    :return: ``{}`` for ``None`` input; the non-``None`` result of a
        ``searchSalary*`` helper; a dict with ``value``, ``currency`` and
        ``period`` for a money-rate match; or a dict with ``value`` and
        ``currency`` for a plain money match.
    """
    if text is None:
        return {}

    salary = searchSalary4(text)
    if salary is not None:
        return salary

    rate_matches = MoneyRateExtractor()(text)
    # TODO: check why the first rate match can be None
    if len(rate_matches) != 0:
        first = rate_matches[0]
        # NOTE(review): the `'fact' in first` membership test is kept exactly
        # as written; confirm it does what was intended for a match object.
        if first is not None and 'fact' in first and first.fact is not None:
            return {
                "value": first.fact.money.integer,
                "currency": first.fact.money.currency,
                "period": first.fact.period,
            }

    for search in (searchSalary2, searchSalary5, searchSalary3):
        salary = search(text)
        if salary is not None:
            return salary

    money_matches = MoneyExtractor()(text)
    if len(money_matches) != 0:
        fact = money_matches[0].fact
        return {"value": fact.integer, "currency": fact.currency}

    # NOTE(review): the definition as visible ends with this assignment; its
    # result is not returned here - confirm against the full file.
    salary = searchSalary1(text)
def send_db_pool_map(doc):
    """Extract named entities from ``doc`` with every Natasha extractor.

    For each extractor the matches are mapped through ``get_ne2``,
    ``get_span_ne`` and ``get_type_ne`` on a local thread pool, and the
    per-extractor results are concatenated.

    :param doc: text of the document to process.
    :return: ``[entities, spans, types]`` (three flat lists), or ``[0, 0, 0]``
        when no extractor completed. If an extractor fails, the error is
        printed and the results gathered so far are returned.
    """
    global text_paper

    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    pool_local = ThreadPool(10)

    # The module-level name is still updated in case other code reads it, but
    # the extraction below uses the local ``doc`` so that concurrent calls
    # cannot process each other's text.
    text_paper = doc

    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            matches = extr(doc)
            ne = pool_local.starmap(get_ne2, [(match, doc) for match in matches])
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)
            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception as exc:
        # Best effort: report the failure and keep what was collected so far.
        print('Ошибка! Примерный номер =', '?', exc)
    finally:
        pool_local.close()
        pool_local.join()

    if not ne_full:
        return [0, 0, 0]

    # Flatten the per-extractor result lists.
    ne_for_db = [item for chunk in ne_full for item in chunk]
    span_ne_for_db = [item for chunk in span_ne_full for item in chunk]
    type_ne_for_db = [item for chunk in type_ne_full for item in chunk]
    return [ne_for_db, span_ne_for_db, type_ne_for_db]
import myextractors

# Module-level state; presumably consumed by code that follows this snippet.
status = 1
res = {}

# Natasha pipeline components; the three taggers/parser share one embedding.
segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()
names_extractor = NamesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)

# Sample input text and document type.
text = 'Посол Израиля на Украине Йоэль Лион признался, что пришел в шок, узнав о решении властей Львовской области объявить 2019 год годом лидера запрещенной в России Организации украинских националистов (ОУН) Степана Бандеры...'
docType = 'coast'

# Run the pipeline: segmentation, morphology, syntax, then NER.
doc = Doc(text)
doc.segment(segmenter)
doc.tag_morph(morph_tagger)
doc.parse_syntax(syntax_parser)
doc.tag_ner(ner_tagger)

# Normalise every recognised span.
for span in doc.spans:
    span.normalize(morph_vocab)

# for a court order
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

app = Flask(__name__)

# TODO: LocationExtractor performs poorly and needs tuning,
# TODO: but it is able to find cities, countries and regions.
# TODO: AddressExtractor finds and represents cities better,
# TODO: but it does not find countries or regions. It finds streets and
# TODO: buildings; those must be excluded from the output and the results
# TODO: merged with the LocationExtractor results.
names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Find named entities in the text.

    :param ner_extractor: extractor instance (NamesExtractor,
        AddressExtractor, DatesExtractor or MoneyExtractor)
    :param text: str
    :return: list holding the ``fact.as_json`` value of every match
    """
    return [match.fact.as_json for match in ner_extractor(text)]
'''if len(ne_for_db) != 0: cur.execute('UPDATE public.news_rbc ' 'SET ne=%s, span_ne=%s, type_ne=%s' 'WHERE id=%s;', (ne_for_db, span_ne_for_db, type_ne_for_db, num)) con.commit()''' return [ne_for_db, span_ne_for_db, type_ne_for_db] else: return [0, 0, 0] if __name__ == '__main__': time_begin = time() # экстракторы extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(), OrganisationExtractor(), PersonExtractor()] send_to_db_news_rbc(extractors) # 292.92 секунды # ошибки: 6911;7168;7561;8246;8539;8691;9211 exit() '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1", port="5432", ) cur = con.cursor() pool = ThreadPool(10) # было ошибок 8 - 2459? 2779 = [] for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал # обработало 5839 строк, из них 120 строк не обработаных cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,)) data = cur.fetchall() docs = [x[0] for x in data] #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)])) new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# One instance of every extractor, built once at import time.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    # Handler for POST /getFacts; reads the "text" field of the JSON body.
    # NOTE(review): only the beginning of this handler is visible here.
    print(request.is_json)
    content = request.get_json()
    text = content['text']
    facts = {}
def extractor():
    """Create and return a new ``MoneyExtractor``."""
    return MoneyExtractor()
def money_extractor(morph_vocab):
    """Create and return a ``MoneyExtractor`` built on ``morph_vocab``."""
    return MoneyExtractor(morph_vocab)
'КАС',
re.compile(
    r'кодекс[а-я]*\s+(об\s+)?административн[а-я]*\s+правонарушени[а-я]*',
    re.IGNORECASE | re.MULTILINE): 'КоАП',
}

# (pattern, abbreviation) pairs for legal forms of organisations.
# The generic "АО" pattern is listed after the more specific ОАО/ЗАО/ПАО ones.
abbrs = [
    (make_regex(r'обществ\w+ с ограниченной ответственностью'), 'ООО'),
    (make_regex(r'открыт\w+ акционерн\w+ обществ\w+'), 'ОАО'),
    (make_regex(r'закрыт\w+ акционерн\w+ обществ\w+'), 'ЗАО'),
    (make_regex(r'публичн\w+ акционерн\w+ обществ\w+'), 'ПАО'),
    (make_regex(r'акционерн\w+ обществ\w+'), 'АО'),
    (make_regex(r'федеральн\w+ казенн\w+ учрежден\w+'), 'ФКУ'),
]

# Extractor instances created once at import time.
money = MoneyExtractor()
dates = DatesExtractor()
org = OrganisationExtractor()

# Whitespace, then three or more single letters separated by whitespace,
# followed by one non-word character. IGNORECASE makes the class match
# lower-case letters too.
CAP_SPACES = re.compile(r'\s((?:[А-Я]\s+){2,}[А-Я][^\w])', re.IGNORECASE)


def fix_cap_spaces(text: str) -> str:
    """Join letter-by-letter spaced sequences matched by ``CAP_SPACES``.

    Each match is replaced by its captured group with the space characters
    removed, surrounded by a single space on both sides.
    """
    return CAP_SPACES.sub(lambda m: ' ' + m.group(1).replace(' ', '') + ' ', text)


def remove_newlines(text: str) -> str:
    """Replace line breaks that follow certain characters with one space.

    A run of optional whitespace plus one or more newlines is collapsed when
    it comes right after a Cyrillic letter, a digit or one of ``, " « » ( )``.
    """
    regex = re.compile(r'([а-яА-Я,"«»()0-9])\s*\n+', re.MULTILINE)
    return regex.sub(r'\1 ', text)
def _store_if_found(entities, key, value):
    """Put ``value`` into ``entities[key]`` when it is truthy; report success."""
    if not value:
        return False
    entities[key] = value
    return True


def _person_facts(doc, names_extractor):
    """Parse every PER span of ``doc`` and return the parsed names as dicts."""
    parsed = []
    for span in doc.spans:
        if span.type != PER:
            continue
        span.extract_fact(names_extractor)
        # A span may have no fact after extraction (the Natasha README filters
        # spans with ``if _.fact``); skip those instead of failing on them.
        if span.fact:
            parsed.append(span.fact.as_dict)
    return parsed


def _organisations(doc):
    """Return ``{'name': text}`` for every ORG span of ``doc``."""
    return [{'name': span.text} for span in doc.spans if span.type == ORG]


def _money_amounts(text, money_extractor):
    """Return amount/currency dicts for every money match found in ``text``."""
    return [
        {'amount': match.fact.amount, 'currency': match.fact.currency}
        for match in money_extractor(text)
    ]


def Main(docType, text):
    """Extract the entities expected for a document of type ``docType``.

    Entities collected per document type:

    * ``'coast'`` (court order): ФИО, ИНН, номер судебного приказа,
      дата судебного приказа, организации
    * ``'mail'`` (letter): ФИО, ИНН, номер договора, дата договора
    * ``'order'`` (payment order): ФИО, ИНН, организации, номер договора,
      дата договора, сумма

    :param docType: one of the document types above; for any other value
        nothing is extracted.
    :param text: document text.
    :return: dict with ``'status'`` (``'успех'`` when every expected entity
        was found, otherwise ``'не успех'``) and ``'entities'`` (the entities
        that were found).
    """
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    syntax_parser = NewsSyntaxParser(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()
    names_extractor = NamesExtractor(morph_vocab)
    money_extractor = MoneyExtractor(morph_vocab)

    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    doc.parse_syntax(syntax_parser)
    doc.tag_ner(ner_tagger)
    for span in doc.spans:
        span.normalize(morph_vocab)

    res = {}
    # One flag per expected entity; list items are evaluated in order, so the
    # keys are inserted into ``res`` in the order listed.
    found = []

    if docType == 'coast':
        # court order
        found = [
            _store_if_found(res, 'ФИО', _person_facts(doc, names_extractor)),
            _store_if_found(res, 'ИНН', myextractors.findINN(text)),
            _store_if_found(res, 'номер судебного приказа',
                            myextractors.findNCOASTCASE(text)),
            _store_if_found(res, 'дата судебного приказа',
                            myextractors.findDATECOAST(text)),
            _store_if_found(res, 'организации', _organisations(doc)),
        ]
    elif docType == 'mail':
        # letter
        found = [
            _store_if_found(res, 'ФИО', _person_facts(doc, names_extractor)),
            _store_if_found(res, 'ИНН', myextractors.findINN(text)),
            _store_if_found(res, 'номер договора',
                            myextractors.findNCONTRACT(text)),
            _store_if_found(res, 'дата договора',
                            myextractors.findDATECONT(text)),
        ]
    elif docType == 'order':
        # payment order
        found = [
            _store_if_found(res, 'ФИО', _person_facts(doc, names_extractor)),
            _store_if_found(res, 'ИНН', myextractors.findINN(text)),
            _store_if_found(res, 'организации', _organisations(doc)),
            _store_if_found(res, 'номер договора',
                            myextractors.findNCONTRACT(text)),
            # The contract date uses findDATECONT, as in the 'mail' branch;
            # this step previously repeated the contract-number lookup.
            _store_if_found(res, 'дата договора',
                            myextractors.findDATECONT(text)),
            _store_if_found(res, 'сумма',
                            _money_amounts(text, money_extractor)),
        ]

    returning = {'status': 'успех' if all(found) else 'не успех'}
    returning['entities'] = res
    return returning
# Extend the stop-word list with a "не_"-prefixed variant of every existing
# entry, plus the bare particle "не".
STOP_WORDS += ["не_" + x for x in STOP_WORDS]
STOP_WORDS += ["не"]
# pronoun-noun, preposition, conjunction, particle, interjection
STOP_POS = ["NPRO", "PREP", "CONJ", "PRCL", "INTJ"]
# Translation table mapping punctuation and a few extra symbols to a space.
# "<" and ">" are left out of the table - presumably so that placeholder tags
# such as "<date>" survive; confirm against the code that applies it.
CHAR_TABLE = str.maketrans({
    key: " "
    for key in string.punctuation.replace("<", "").replace(">", "") +
    "…?‘«»‘♂️”“’[]'™"
})
# (extractor instance, placeholder tag) pairs.
NERS = [
    (DatesExtractor(), "<date>"),
    (TimeExtractor(), "<time>"),
    (ExtraDatesExtractor(), "<date>"),
    (MoneyExtractor(), "<money>"),
    (PhoneExtractor(), "<phone>"),
    (PhotoExtractor(), "<photo>"),
    (StickerExtractor(), "<sticker>"),
    (LinkExtractor(), "<url>"),
    (AddressExtractor(), "<address>"),
    (NamesExtractor(), "<name>"),
    (NumberExtractor(), "<number>"),
    (CensorExtractor(), "<censored>"),
]


class EntityExtractor(Component):
    """Recognises named entities."""

    def __init__(self, cache=None, verbose=False, **kwargs):
        # NOTE(review): only the beginning of this initialiser is visible here.
        super(EntityExtractor, self).__init__()