def main(): news_sites = { 'm24.ru': M24_accidents, 'mosday.ru': Mosday_accidents, 'vm.ru': VM_accidents } # Инициализируем Наташу extractor = AddressExtractor() # Ищем новости, проверяем на наличие адресов, загружаем во временное хранилище news_list = [] for key in news_sites.keys(): news_list += get_news(news_sites, key, extractor) for item in news_list: if find_address_in_news(item, extractor): #print(item) #print(item['location']['coordinates'][0][0], item['location']['coordinates'][0][1]) record = News(title=item['title'], link=item['link'], date_and_time=datetime.strptime( item['time'] + ' ' + item['date'], '%H:%M %d.%m.%Y'), text=item['text'], address=item['location']['address'], street=item['location']['street'], lat=item['location']['coordinates'][0][0], lon=item['location']['coordinates'][0][1]) db.session.add(record) db.session.commit()
def test_nlp(self, text):
    """Run the natasha address extractor over each line of *text*, printing
    ASCII markup of the matched spans followed by the extracted facts.
    """
    extractor = AddressExtractor()
    for sentence in text.splitlines():
        found = extractor(sentence)
        highlight = [match.span for match in found]
        payload = [match.fact.as_json for match in found]
        show_markup(sentence, highlight)
        print(format_json(payload))
def parse_address(self):
    """Extract address strings from ``self._text``.

    :return: list with one comma-joined string of part names per match
    """
    extractor = AddressExtractor()
    collected = []
    for match in extractor(self._text):
        parts = match.fact.as_json.get('parts')
        if parts:
            names = [part.get('name') for part in parts if part.get('name')]
            collected.append(', '.join(names))
    return collected
def main():
    """Scrape every configured news site and print each item that mentions
    an address.
    """
    # Scraper callables keyed by site name.
    news_sites = {
        'm24.ru': M24_accidents,
        'mosday.ru': Mosday_accidents,
        'vm.ru': VM_accidents
    }
    # One natasha address extractor is shared by all scrapers.
    extractor = AddressExtractor()

    news_list = []
    for site in news_sites:
        news_list.extend(get_news(news_sites, site, extractor))

    for item in news_list:
        if find_address_in_news(item, extractor):
            print(item)
def __init__(self):
    """Build the list of natasha extractors used for fact extraction.

    The import is kept function-local (as in the original) so importing the
    enclosing module stays cheap when no instance is ever created.
    """
    from natasha import (DatesExtractor, MoneyExtractor, MoneyRateExtractor,
                         MoneyRangeExtractor, LocationExtractor,
                         AddressExtractor, OrganisationExtractor,
                         PersonExtractor)

    # NOTE: a SimpleNamesExtractor used to be instantiated here but was never
    # added to self.extractors (names extraction deliberately disabled), so
    # the unused instantiation and its imports were removed.
    self.extractors = [
        AddressExtractor(),
        DatesExtractor(),
        LocationExtractor(),
        MoneyExtractor(),
        MoneyRangeExtractor(),
        MoneyRateExtractor(),
        OrganisationExtractor(),
        PersonExtractor()
    ]
def send_db_pool_map(doc):
    """Run every natasha extractor over *doc* and collect the named entities.

    :param doc: text of one document
    :return: ``[entities, spans, types]`` — three lists, one chunk per
        extractor — or ``[0, 0, 0]`` when nothing was collected
    """
    global text_paper
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            # NOTE(review): get_ne2 may read this module-level global in
            # addition to its explicit argument — kept for compatibility;
            # confirm and drop if unused.
            text_paper = doc
            matches = extr(text_paper)
            # Pair every match with the document it came from.
            ne = pool_local.starmap(get_ne2, zip(matches, [doc] * len(matches)))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)
            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception as exc:  # was a bare ``except:`` that hid the real error
        print('Ошибка! Примерный номер =', '?', exc)
    pool_local.close()
    pool_local.join()
    if len(ne_full) != 0:
        # Flatten the per-extractor chunks into single flat lists.
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]
from flask import Flask, request, jsonify
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

app = Flask(__name__)

# todo: LocationExtractor performs poorly and needs tuning, but it can find
# todo: cities, countries and regions.  AddressExtractor finds and represents
# todo: cities better, but misses countries and regions; it also finds streets
# todo: and house numbers, which should be excluded from the output, and its
# todo: results should be merged with those of LocationExtractor.
names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Find the named entities present in *text*.

    :param ner_extractor: a NamesExtractor, AddressExtractor, DatesExtractor
        or MoneyExtractor instance
    :param text: str
    :return: list of the matched facts serialised via ``as_json``
    """
    return [match.fact.as_json for match in ner_extractor(text)]
# NOTE(review): this chunk begins mid-function — the triple-quoted dead SQL
# block and the `return`/`else` below belong to a definition whose header is
# outside this view.  The final triple-quoted block is left unterminated here
# (it continues past this chunk).  Code kept as-is; only reformatted/commented.
    '''if len(ne_for_db) != 0:
        cur.execute('UPDATE public.news_rbc '
                    'SET ne=%s, span_ne=%s, type_ne=%s'
                    'WHERE id=%s;',
                    (ne_for_db, span_ne_for_db, type_ne_for_db, num))
        con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]


if __name__ == '__main__':
    time_begin = time()
    # extractors
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    send_to_db_news_rbc(extractors)  # 292.92 seconds
    # errors: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics", user="******",
                              password="******", host="127.0.0.1", port="5432", )
    cur = con.cursor()
    pool = ThreadPool(10)
    # было ошибок 8 - 2459? 2779 = []
    for i in tqdm(range(5059,9540,10)): # 296.92 секунды
    # с3347по3357 не делал
    # обработало 5839 строк, из них 120 строк не обработаных
    cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
    data = cur.fetchall()
    docs = [x[0] for x in data]
    #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)]))
    new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
def evaluateInput(input_sentence='', encoder=encoder, decoder=decoder,
                  searcher=searcher, voc=voc):
    """Answer *input_sentence* either with a maps/VK/Google link (when it
    starts with the keyword "найти" = "find") or with the seq2seq model reply.

    NOTE(review): the defaults bind module-level encoder/decoder/searcher/voc
    objects at definition time — confirm they exist before this def runs.
    """
    # Probe a known-good address once to learn which index of fact.parts
    # corresponds to which part class (mapping t: part-type -> position).
    ex = AddressExtractor()
    line = "найти Санкт-Петербург, улица Федора Абрамова, 9"
    t = {}
    mathes = ex(line)
    for i in range(3):
        t[type(mathes[0].fact.parts[i])] = i
    try:
        if "найти" in input_sentence.lower().lstrip():
            # Branch 1: exactly one address in the query -> Google Maps URL.
            ex = AddressExtractor()
            if ex(input_sentence) and len(ex(input_sentence)) == 1:
                path = 'https://www.google.ru/maps/place/'
                for part in ex(input_sentence)[0].fact.parts:
                    flag = t[type(part)]
                    if flag == 2:
                        # House-number-like part: "<type>+<number>+".
                        if part.number != None:
                            if part.type != None:
                                path += part.type + '+'
                            path += part.number + '+'
                    else:
                        # Named part (city/street): "<type>+<name words>+".
                        if part.name != None:
                            if part.type != None:
                                path += part.type + '+'
                            if len(part.name.split(' ')) > 1:
                                for word in part.name.split(' '):
                                    path += word + '+'
                            else:
                                path += part.name + '+'
                # Drop the trailing '+' and close the URL.
                return path[:-1] + '/'
            else:
                # Branch 2: exactly one person name -> VK people-search links.
                ex = NamesExtractor()
                if ex(input_sentence) and len(ex(input_sentence)) == 1:
                    if ex(input_sentence)[0].fact.first != None and ex(
                            input_sentence)[0].fact.last != None:
                        path = f'https://vk.com/search?c%5Bper_page%5D=40&c%5Bphoto%5D=1&c%5Bq%5D={ex(input_sentence)[0].fact.first}%20{ex(input_sentence)[0].fact.last}&c%5Bsection%5D=people'
                        rec = requests.get(path)
                        vk_mask = 'https://vk.com'
                        tree = lxml.html.fromstring(rec.text)
                        links = tree.xpath(
                            '//a[@class="simple_fit_item search_item"]/@href')
                        if links != []:
                            st = '--list'
                            for i in range(len(links)):
                                st += (vk_mask + links[i] + '\n')
                            return st
                        else:
                            return 'По вашему запросу ничего не найдено'
                else:
                    # Branch 3: plain query after the keyword -> Google links.
                    query = ''
                    for i in range(len(input_sentence) - 5):
                        if input_sentence[i:i + 5].lower(
                        ) == 'найти' and i != len(input_sentence) - 6:
                            query = input_sentence[i + 6:]
                    if query != '':
                        links = list(
                            search(query, tld="co.in", num=10, stop=3, pause=1))
                        if links != []:
                            st = '--list'
                            for i in range(len(links)):
                                st += (links[i] + '\n')
                            return st
                        else:
                            return 'По вашему запросу ничего не найдено'
                    else:
                        return 'По вашему запросу ничего не найдено'
        else:
            # No keyword: normalise the sentence, run the seq2seq model, and
            # strip EOS/PAD tokens from its output.
            input_sentence = proc.normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc,
                                    input_sentence)
            output_words[:] = [
                x for x in output_words if not (x == 'EOS' or x == 'PAD')
            ]
            # Trim the tail when the same token repeats more than twice.
            pos = 0
            k = 1
            for i in range(len(output_words) - 1):
                if output_words[i] == output_words[i + 1]:
                    k += 1
                    pos = i + 1
            if k > 2:
                output_words = output_words[:pos]
            return ' '.join(output_words)
    except KeyError:
        # Unknown token / part type: canned fallback reply.
        return "Мая твая нипанимать :с"
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# All natasha extractors this service can apply to an incoming text.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    """Handle POST /getFacts: read ``{'text': ...}`` from the JSON body.

    NOTE(review): the body appears truncated in this chunk — it only builds
    an empty ``facts`` dict; the rest of the handler is outside this view.
    """
    print(request.is_json)
    content = request.get_json()
    text = content['text']
    facts = {}
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
from natasha import (
    NamesExtractor,
    AddressExtractor,
    DatesExtractor,
    MoneyExtractor,
    OrganisationExtractor,
    LocationExtractor
)
from natasha.markup import show_markup, show_json

# All natasha extractors available to this module.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

addressExtractor = AddressExtractor()


class TextInfo(object):
    """Pairs a cleaned text with the addresses extracted from it."""

    def __init__(self, clear_text, addresses):
        self.clear_text = clear_text
        # BUG FIX: the original line was the bare expression
        # ``self.addresses`` — the argument was never stored (and, with no
        # such attribute defined, the lookup itself would fail).
        self.addresses = addresses
def extractor_address_str(text: str) -> List[str]:
    """Return the raw substrings of *text* that natasha recognises as addresses.

    :param text: text to scan
    :return: one substring per address match, sliced by the match's span
    """
    # ``text[slice(*match.span)]`` replaces the unidiomatic explicit
    # ``text.__getitem__(slice(*match.span))`` dunder call.
    return [text[slice(*match.span)] for match in AddressExtractor()(text)]
def __init__(self):
    """Create the natasha extractors this object relies on."""
    self.dates = DatesExtractor()
    self.org = OrganisationExtractor()
    self.address = AddressExtractor()
def textClassificator(text):
    """Parse a free-form contact card in *text* into a ``Person`` and return
    it serialised as JSON (or ``[]`` when no classifier is loaded).

    Order matters throughout: each extraction step also scrubs its matches
    out of ``text`` before the next step runs.
    """
    person = Person()
    # Phones and e-mails first.
    phones = re.findall(r'([+(]?[1-9][0-9 .\-()]{8,}[0-9])|([0-9.\-()]{7,})',
                        text)
    person.phone = [value for value in np.array(phones).flatten() if value]
    person.email = re.findall(
        r'([-._a-z0-9]+@(?:[a-z0-9][-a-z0-9]+\.)+[a-z]{2,6})', text)
    # Marker words ("tel", "fax", "site", ... in Russian and English).
    symb = re.compile(r'(моб|mob|mail|fax|факс|tel|тел\.|телефон|сайт|site)',
                      re.IGNORECASE)
    symbols = symb.findall(text)
    t = text.split()
    site = re.compile(r'(http:\s*//www.\w+.\w+)|(www.\w+.\w+)', re.IGNORECASE)
    site_list = site.findall(text)
    person.site = [
        re.sub(" ", "", value) for value in np.array(site_list).flatten()
        if value
    ]
    # Scrub marker words, sites, phones and e-mails out of the text.
    for word in t:
        for substr in symbols:
            if substr in word:
                text = text.replace(word, "")
    for i in range(len(site_list)):
        text = text.replace(np.array(site_list).flatten()[i], "")
    for i in range(len(person.phone)):
        text = text.replace(person.phone[i], "")
    for i in range(len(person.email)):
        text = text.replace(person.email[i], "")
    text = text.replace('\n', " ")
    # Address extraction via natasha; only the first match is used.
    extractor = AddressExtractor()
    matches = extractor(text)
    facts = [_.fact.as_json for _ in matches]
    if facts:
        address = facts[0]['parts']
        address_list = ''
        for i in range(len(address)):
            keys = reversed(address[i].keys())
            for key in keys:
                address_list += '' + address[i][key] + ' '
                # NOTE(review): the address value is interpolated into the
                # pattern unescaped (no re.escape) and '\s' is a non-raw
                # string — regex metacharacters in the value would break this.
                text = re.sub(r'\s' + address[i][key] + '\s|,', " ", text)
        person.address = address_list
    # Scrub address marker words ("city", "street", "building", ...).
    symb = re.compile(
        r'(г\.|город|ул\.|улица|стр\.|строение|д\.|дом|офис|помещение|к\.|корпус)'
    )
    symbols = symb.findall(text)
    t = text.split()
    for word in t:
        for substr in symbols:
            if substr in word:
                text = text.replace(word, "")
    words = text.split()
    nlf_cl = []
    if len(Classifier.classifier[0]):
        # Classify each remaining word and keep the best-scoring candidate
        # position for first name / surname / patronymic.
        pos_n = 0
        pos_s = 0
        pos_f = 0
        for word in words:
            if len(word) > 1:
                nlf_cl.append(
                    classify(Classifier.classifier, get_features(word)))
        max_name = -np.inf
        max_surname = -np.inf
        max_fathername = -np.inf
        for idx in range(len(nlf_cl)):
            key, value = nlf_cl[idx]
            if key == 'Имя':
                if max_name < value[0]:
                    max_name = value[0]
                    pos_n = idx
            if key == 'Фамилия':
                if max_surname < value[0]:
                    max_surname = value[0]
                    pos_s = idx
            if key == 'Отчество':
                if max_fathername < value[0]:
                    max_fathername = value[0]
                    pos_f = idx
        person.name = words[pos_n]
        person.surname = words[pos_s]
        person.fathername = words[pos_f]
        # NOTE(review): same unescaped-interpolation concern as above.
        text = re.sub(r'\s|' + person.name + '\s|,', " ", text)
        text = re.sub(r'\s|' + person.surname + '\s|,', " ", text)
        text = re.sub(r'\s|' + person.fathername + '\s|,', " ", text)
        person.notes = text
    else:
        return []
    return json.dumps(person, default=lambda o: o.__dict__)
# Translation table mapping punctuation (except '<' and '>', which delimit
# the replacement tags below) plus assorted unicode symbols to spaces.
CHAR_TABLE = str.maketrans({
    key: " "
    for key in string.punctuation.replace("<", "").replace(">", "") +
    "…?‘«»‘♂️”“’[]'™"
})

# (extractor, replacement-tag) pairs, applied in this order.
NERS = [
    (DatesExtractor(), "<date>"),
    (TimeExtractor(), "<time>"),
    (ExtraDatesExtractor(), "<date>"),
    (MoneyExtractor(), "<money>"),
    (PhoneExtractor(), "<phone>"),
    (PhotoExtractor(), "<photo>"),
    (StickerExtractor(), "<sticker>"),
    (LinkExtractor(), "<url>"),
    (AddressExtractor(), "<address>"),
    (NamesExtractor(), "<name>"),
    (NumberExtractor(), "<number>"),
    (CensorExtractor(), "<censored>"),
]


class EntityExtractor(Component):
    """Recognizes named entities."""

    def __init__(self, cache=None, verbose=False, **kwargs):
        super(EntityExtractor, self).__init__()
        self.kwargs = kwargs
        self.verbose = verbose
        # NOTE(review): ``cache or {}`` replaces an explicitly passed empty
        # mapping with a fresh dict — confirm that is intended.
        self.cache = cache or {}
        # The module-level NERS list is shared by all instances.
        self.extractors = NERS
def extractor():
    """Build and return a fresh natasha ``AddressExtractor`` instance."""
    address_extractor = AddressExtractor()
    return address_extractor
def extractor_address(text: str) -> List[Any]:
    """Return every address match natasha finds in *text*.

    :param text: text to scan
    :return: list of natasha match objects
    """
    # The identity comprehension was just a copy; ``list`` says so directly.
    return list(AddressExtractor()(text))