def get_locations(text):
    """Return the distinct words that make up the location names in *text*.

    Every match produced by ``LocationExtractor`` contributes its
    ``fact.name``; matches whose name is exactly "россия" are ignored.
    Multi-word names are split on whitespace, so each word becomes a
    separate entry. Duplicates are removed through a set, which means the
    order of the returned list is unspecified.
    """
    words = set()
    for match in LocationExtractor()(text):
        name = match.fact.name
        if name != "россия":
            words.update(name.split())
    return list(words)
def get_location(query):
    """Extract the city from an utterance.

    Runs ``LocationExtractor`` over *query*, takes the first match and
    passes the JSON form of its fact through ``format_json`` (presumably a
    JSON pretty-printer; confirm against its definition).

    Returns:
        The list of word runs that directly follow a ``: "`` sequence in
        the formatted text, i.e. the leading word of each string value.

    Raises:
        LocationExtractorException: if reading the first match raises
            ``IndexError`` (no location was found).
    """
    found = LocationExtractor()(query)
    try:
        first_fact = found[0].fact.as_json
    except IndexError:
        raise LocationExtractorException("Location Extractor Error")
    formatted = format_json(first_fact)
    return re.findall(r"\: \"(\w+)", formatted)
def get_extractor(extract_type):
    """Create the extractor that corresponds to *extract_type*.

    Recognised values are "name", "location", "date" and "money".
    Any other value yields ``None``.
    """
    # The lambdas defer the constructor lookup, so only the extractor that
    # was actually requested is resolved and instantiated.
    factories = (
        ("name", lambda: NamesExtractor()),         # names extractor
        ("location", lambda: LocationExtractor()),  # locations extractor
        ("date", lambda: DatesExtractor()),         # dates extractor
        ("money", lambda: MoneyExtractor()),        # money-amount extractor
    )
    for kind, make in factories:
        if extract_type == kind:
            return make()
    return None
def add_place(all_news):
    """Add a 'place' entry to every news item and return the same list.

    For each item, the 'title' and 'article' values are joined with a
    space and scanned with a single shared ``LocationExtractor``. The
    'place' entry is the comma-joined, title-cased fact names of all
    matches, or an empty string when nothing was found. Items are
    modified in place.
    """
    extractor = LocationExtractor()
    for item in all_news:
        matches = extractor(' '.join([item['title'], item['article']]))
        # Set the default first so the key exists even without matches.
        item.update({'place': ''})
        found = matches.as_json
        if not found:
            continue
        names = [entry['fact']['name'] for entry in found]
        item.update({'place': ','.join([name.title() for name in names])})
    return all_news
def natasha_res(text):
    """Collect the entities found in *text*, grouped by kind.

    Returns a dict with the keys 'names', 'locations', 'organisations',
    'dates' and 'money'. Each value is a set; for every kind except
    'dates' the members are the substrings of *text* covered by the
    match spans.
    """
    names_extractor = NamesExtractor()
    location_extractor = LocationExtractor()
    organisation_extractor = OrganisationExtractor()
    dates_extractor = DatesExtractor()
    money_extractor = MoneyExtractor()

    def matched_text(match):
        # Slice the original text by the match's (start, stop) span.
        start, stop = match.span[0], match.span[1]
        return text[start:stop]

    return {
        'names': {matched_text(m) for m in names_extractor(text)},
        'locations': {matched_text(m) for m in location_extractor(text)},
        'organisations': {
            matched_text(m) for m in organisation_extractor(text)
        },
        # NOTE(review): dates_mapper is not defined in this function, so it
        # must be provided by the enclosing module. Confirm it exists;
        # otherwise this entry raises NameError.
        'dates': set(map(dates_mapper, dates_extractor(text))),
        'money': {matched_text(m) for m in money_extractor(text)},
    }
def __init__(self):
    """Create the natasha extractors and store them in ``self.extractors``.

    The list holds, in order: address, dates, location, money, money
    range, money rate, organisation and person extractors. No names
    extractor is included in the list.
    """
    # natasha is imported inside the constructor, as in the original code.
    from natasha import (
        AddressExtractor,
        DatesExtractor,
        LocationExtractor,
        MoneyExtractor,
        MoneyRangeExtractor,
        MoneyRateExtractor,
        OrganisationExtractor,
        PersonExtractor,
    )

    # Only extractors that end up in the list are constructed; building an
    # extractor that is never used would be wasted work.
    self.extractors = [
        AddressExtractor(),
        DatesExtractor(),
        LocationExtractor(),
        MoneyExtractor(),
        MoneyRangeExtractor(),
        MoneyRateExtractor(),
        OrganisationExtractor(),
        PersonExtractor(),
    ]
def send_db_pool_map(doc):
    """Run every extractor over *doc* and gather the per-match results.

    For each extractor the matches are handed to a local thread pool,
    which calls ``get_ne2(match, doc)``, ``get_span_ne(match)`` and
    ``get_type_ne(match)`` for every match.

    Returns:
        ``[ne, span_ne, type_ne]`` - three flat lists holding the results
        of the three helpers across all extractors that completed - or
        ``[0, 0, 0]`` when no extractor completed.

    Any ``Exception`` raised while extracting is reported with a printed
    message and stops further extraction; results gathered before the
    failure are still returned.
    """
    global text_paper

    extractors = [
        AddressExtractor(),
        DatesExtractor(),
        LocationExtractor(),
        MoneyExtractor(),
        NamesExtractor(),
        OrganisationExtractor(),
        PersonExtractor(),
    ]
    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            # Still published for backward compatibility: code elsewhere in
            # the module may read this global (presumably the helpers).
            text_paper = doc
            # Read the argument, not the global, so a concurrent call that
            # overwrites text_paper cannot change which text is analysed.
            matches = extr(doc)
            ne = pool_local.starmap(
                get_ne2, [(match, doc) for match in matches]
            )
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)
            # Appended together so the three lists always stay aligned.
            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception:
        # Best effort: report and keep whatever was gathered so far.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        print('Ошибка! Примерный номер =', '?')
    finally:
        # Always release the worker threads, even on interruption.
        pool_local.close()
        pool_local.join()

    if not ne_full:
        return [0, 0, 0]

    # Flatten the per-extractor result lists into one list each.
    ne_for_db = [item for chunk in ne_full for item in chunk]
    span_ne_for_db = [item for chunk in span_ne_full for item in chunk]
    type_ne_for_db = [item for chunk in type_ne_full for item in chunk]
    return [ne_for_db, span_ne_for_db, type_ne_for_db]
MoneyExtractor, LocationExtractor) app = Flask(__name__) # todo: LocationExtractor работает плохо, его надо настравиать, # todo: но он умеет находить города, страны и регионы. # todo: AddressExtractor лучше находит и представляет города, # todo: но не находит страны и регионы. Он находит улицы и дома, # todo: нужно исключить их из выдачи и объединить результаты с # todo: результатами LocationExtractor-а. names_extractor = NamesExtractor() address_extractor = AddressExtractor() dates_extractor = DatesExtractor() money_extractor = MoneyExtractor() location_extractor = LocationExtractor() def find_named_entities(ner_extractor, text): """Находит именованные сущности в тексте. :param ner_extractor: объект класса NamesExtractor, AddressExtractor, DatesExtractor или MoneyExtractor :param text: str :return: list of namedtuples """ matches = ner_extractor(text) return [_.fact.as_json for _ in matches] def get_response(text):
'''if len(ne_for_db) != 0: cur.execute('UPDATE public.news_rbc ' 'SET ne=%s, span_ne=%s, type_ne=%s' 'WHERE id=%s;', (ne_for_db, span_ne_for_db, type_ne_for_db, num)) con.commit()''' return [ne_for_db, span_ne_for_db, type_ne_for_db] else: return [0, 0, 0] if __name__ == '__main__': time_begin = time() # экстракторы extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(), OrganisationExtractor(), PersonExtractor()] send_to_db_news_rbc(extractors) # 292.92 секунды # ошибки: 6911;7168;7561;8246;8539;8691;9211 exit() '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1", port="5432", ) cur = con.cursor() pool = ThreadPool(10) # было ошибок 8 - 2459? 2779 = [] for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал # обработало 5839 строк, из них 120 строк не обработаных cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,)) data = cur.fetchall() docs = [x[0] for x in data] #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)])) new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# One instance of each natasha extractor, created once at import time.
extractors = [
    NamesExtractor(), AddressExtractor(), DatesExtractor(),
    MoneyExtractor(), OrganisationExtractor(), LocationExtractor()
]

from flask import Flask
from flask import request

app = Flask(__name__)


# Registered as the handler for POST requests to /getFacts.
@app.route('/getFacts', methods=['POST'])
def getFacts():
    # Print Flask's is_json flag for the incoming request.
    print(request.is_json)
    content = request.get_json()
    # The text to process is read from the 'text' key of the JSON body.
    text = content['text']
    facts = {}
''' If nothing else works, this will save us '''
from natasha import NamesExtractor
from natasha import LocationExtractor
from natasha import OrganisationExtractor
from natasha import DatesExtractor
from natasha import AddressExtractor

# Module-level extractor instances, created once at import time.
names_extr = NamesExtractor()
locs_extr = LocationExtractor()
org_extr = OrganisationExtractor()
dates_extr = DatesExtractor()
address_extr = AddressExtractor()


def recognize_names(text):
    """Return *text* with the names found by ``names_extr`` masked.

    For every match, the substring of the original text covered by the
    match span is replaced by "[NAME]" in the working copy. ``str.replace``
    substitutes every occurrence of that substring, not only the one at
    the matched position.
    """
    tmp = text
    matches = names_extr(text)
    for match in matches:
        # Offsets refer to the original text, so slice `text`, not `tmp`.
        start, finish = match.span
        tmp = tmp.replace(text[start:finish], "[NAME]")
    return tmp


def recognize_locs(text):
    tmp = text
    matches = locs_extr(text)
    for match in matches:
        # Start and end offsets of the match.
        start, finish = match.span
def get_Locations(text):
    """Return a ``Location`` for every location match found in *text*.

    Each ``Location`` is built from the ``fact.name`` of one match produced
    by ``LocationExtractor``; the order of the matches is kept.
    """
    found = LocationExtractor()(text)
    return [Location(match.fact.name) for match in found]
def extractor():
    """Create and return a new ``LocationExtractor`` instance."""
    location_extractor = LocationExtractor()
    return location_extractor