def promocode_expiration_date(self) -> 'datetime.datetime | None':
    """Extract a promocode expiration date from ``self.text``.

    Runs natasha's ``DatesExtractor`` over the text and uses the LAST date
    mentioned. A date without a month or a day is rejected; a date without
    a year defaults to the current year.

    :return: the expiration date, or ``None`` if no usable date was found.
    """
    morph_vocab = MorphVocab()
    dates_extractor = DatesExtractor(morph_vocab)

    matches = list(dates_extractor(self.text))
    if not matches:
        return None

    # the last mentioned date is assumed to be the expiration date
    fact = matches[-1].fact
    if not fact.month or not fact.day:
        return None

    # the original duplicated `del morph_vocab` / `del dates_extractor`
    # before every return; locals are released on return anyway
    year = fact.year or datetime.datetime.now().year
    return datetime.datetime(year, fact.month, fact.day)
def __init__(self, debug=False, off_sleeps=False):
    """Asynchronous parser for Avito ads, "coin collecting" section.

    :param debug: enable debug log messages and asyncio debug mode
    :param off_sleeps: use sleeps between requests (to work around captchas)
    """
    self._off_sleeps = off_sleeps
    # dedicated uvloop event loop, registered as the current loop so that
    # asyncio primitives created below bind to it
    self._loop = uvloop.new_event_loop()
    asyncio.set_event_loop(self._loop)
    self._loop.set_debug(debug)
    # FIX: the explicit ``loop=`` argument was deprecated in 3.8 and removed
    # in Python 3.10; the queue picks up the loop installed just above
    self._queue = asyncio.Queue()
    self._base_url = 'https://www.avito.ru/moskva/kollektsionirovanie/monety?view=list&p={}'
    self._run_loop = True
    self._log = logging.getLogger(self.__class__.__name__)
    coloredlogs.install(
        fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.DEBUG if debug else logging.INFO)
    self._sess = None
    self._user_agent = UserAgent()
    # persistent store of already-seen ads — presumably keyed by ad id;
    # TODO confirm against the code that writes to it
    self._db = shelve.open('./parser.db')
    # run statistics counters
    self._n_ads = 0
    self._n_ads_have_year = 0
    self._n_rsfsr_ads = 0
    self._n_rus_empire_ads = 0
    self._dates_extractor = DatesExtractor()
def guess_date(doc):
    """Return the latest date found in *doc* as 'YYYY-MM-DD', or ''. """
    extractor = DatesExtractor()
    matches = extractor(doc)
    if not matches:
        return ''
    found = [date_fact_to_date(m.fact) for m in matches]
    return max(found).strftime('%Y-%m-%d')
def get_extractors(cls):
    """Return the class-level cached [DatesExtractor, MoneyExtractor] pair,
    building (and caching) it on first use."""
    cached = getattr(cls, "_extractors", None)
    if cached:
        return cached
    vocab = cls.get_morph_vocab()
    cls._extractors = [DatesExtractor(vocab), MoneyExtractor(vocab)]
    return cls._extractors
def NEL_extraction():
    """Rewrite every topic file through NEL_extraction_for_str().

    Reads each input file line by line, replaces named entities with
    placeholders and writes the result to the matching output file.
    Both file handles are closed when a topic is done (the original leaked
    the output handles, shadowed the builtin ``str`` as a loop variable and
    created extractors / name variables it never used).
    """
    files = open_initial_file_r()
    files_w = open_natasha_file_w()
    for topic in range(NUM_OF_TOPICS):
        for line in files[topic]:
            files_w[topic].write(NEL_extraction_for_str(line))
        files[topic].close()
        files_w[topic].close()
def parse_date(self):
    """Date parser.

    :return: list of {'day', 'month', 'year'} dicts, one per date found
        in ``self._text`` (missing components are ``None``).
    """
    extractor = DatesExtractor()
    results = []
    for match in extractor(self._text):
        fact = match.fact.as_json
        results.append({
            'day': fact.get('day'),
            'month': fact.get('month'),
            'year': fact.get('year')
        })
    return results
def get_extractor(extract_type):
    """Return a fresh natasha extractor for the given entity type.

    Known types: "name", "location", "date", "money"; any other value
    yields ``None`` (same as the original elif chain falling through).
    """
    factories = {
        "name": NamesExtractor,        # person names
        "location": LocationExtractor, # places
        "date": DatesExtractor,        # dates
        "money": MoneyExtractor,       # monetary amounts
    }
    factory = factories.get(extract_type)
    if factory is not None:
        return factory()
def guess_date(doc):
    """Best-effort guess of the latest plausible date in *doc*.

    Only dates after 2010 and before "now" are considered; returns
    'YYYY-MM-DD' or '' when nothing qualifies or extraction blows up.
    """
    res = ""
    try:
        matches = DatesExtractor()(doc)
        if matches:
            lower = dateparser.parse('2010')
            upper = datetime.today()
            candidates = [date_fact_to_date(m.fact) for m in matches]
            candidates = [d for d in candidates if lower < d < upper]
            if candidates:
                res = max(candidates).strftime('%Y-%m-%d')
    except Exception as ex:
        print(ex)
    return res
def natasha_res(text):
    """Extract named entities from *text* with natasha's rule extractors.

    :return: dict of sets of raw matched substrings, keyed by
        'names' / 'locations' / 'organisations' / 'dates' / 'money'.
    """
    names_extractor = NamesExtractor()
    location_extractor = LocationExtractor()
    organisation_extractor = OrganisationExtractor()
    dates_extractor = DatesExtractor()
    money_extractor = MoneyExtractor()

    def span_text(match):
        # every match exposes the matched region as match.span
        return text[match.span[0]:match.span[1]]

    return {
        'names': set(map(span_text, names_extractor(text))),
        'locations': set(map(span_text, location_extractor(text))),
        'organisations': set(map(span_text, organisation_extractor(text))),
        # BUG FIX: the original mapped with an undefined name
        # ``dates_mapper`` (only names/location/org/money mappers existed),
        # raising NameError on every call
        'dates': set(map(span_text, dates_extractor(text))),
        'money': set(map(span_text, money_extractor(text))),
    }
def get_date(text):
    """Classify *text* as referring to "завтра", "сегодня", or neither.

    Matches either literal keywords or natasha date extraction against the
    module-level ``tomorrow`` / ``today`` values (their exact form is
    defined elsewhere — TODO confirm against the module).

    :return: the literal string "завтра" or "сегодня", else ``None``.
    """
    from natasha import MorphVocab
    from natasha import DatesExtractor

    text = text.lower()
    morph_vocab = MorphVocab()
    dates_extractor = DatesExtractor(morph_vocab)
    # FIX: run the extractor once instead of once per branch; the original
    # also built a full Doc/segment/morph/syntax pipeline whose results were
    # never read — that dead work has been removed
    extracted = str(list(dates_extractor(text)))
    if 'завтр' in text or tomorrow in extracted:
        return "завтра"
    if 'сегодня' in text or 'сейчас' in text or today in extracted:
        return "сегодня"
    return None
def __init__(self):
    """Build the bundle of natasha extractors this object runs.

    ``SimpleNamesExtractor`` was already excluded from the extractor list
    in the original (see its commented-out variant), so it is no longer
    instantiated at all — constructing an unused extractor only wasted
    start-up time.
    """
    from natasha import (AddressExtractor, DatesExtractor,
                         LocationExtractor, MoneyExtractor,
                         MoneyRangeExtractor, MoneyRateExtractor,
                         OrganisationExtractor, PersonExtractor)

    self.extractors = [
        AddressExtractor(),
        DatesExtractor(),
        LocationExtractor(),
        MoneyExtractor(),
        MoneyRangeExtractor(),
        MoneyRateExtractor(),
        OrganisationExtractor(),
        PersonExtractor(),
    ]
def extract_date(text):
    """Parse a free-text (Russian) date/time out of *text*.

    Falls back to "сегодня"/"завтра" keywords when natasha finds no date.

    :return: ``(year, day, month, hour, minute)`` on success, or the string
        "Не могу распознать дату" when nothing date-like is found.
    """
    year = None
    day = None
    month = None
    text = text.strip()
    extractor = DatesExtractor()
    matches = extractor(text)
    if len(matches.matches) == 0:
        if 'сегодня' in text:
            date = datetime.now(timezone(time_zone))
        elif 'завтра' in text:
            date = datetime.now(timezone(time_zone)) + timedelta(days=1)
        else:
            return "Не могу распознать дату"
        day = date.day
        month = date.month
        year = date.year
    else:
        # collect matched substrings FIRST: spans index the ORIGINAL text,
        # so nothing may be removed while iterating (the original mutated
        # ``text`` inside this loop, corrupting later spans)
        matched_texts = []
        for match in matches:
            start, stop = match.span
            if match.fact.year is None:
                year = datetime.now().year  # year omitted -> assume current
            else:
                year = match.fact.year
            day = match.fact.day
            month = match.fact.month
            matched_texts.append(text[start:stop])
        # BUG FIX: the original passed the raw substring to re.sub, so regex
        # metacharacters (the dots in "01.02.2021") acted as a pattern;
        # remove the literal text instead
        for matched in matched_texts:
            text = text.replace(matched, '', 1)
    time = find_time(text)
    if time is None:
        hour = '00'
        minute = '00'
    else:
        hour, minute = time
    return year, day, month, hour, minute
def send_db_pool_map(doc):
    """Run every natasha extractor over *doc* and gather entity data.

    For each extractor the matches are fanned out over a local thread pool
    to compute the entity text, span and type in parallel.

    :return: ``[entities, spans, types]`` (each a flat list across all
        extractors), or ``[0, 0, 0]`` when extraction produced nothing.
    """
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(), OrganisationExtractor(),
                  PersonExtractor()]
    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            # get_ne2 apparently reads the module-level ``text_paper`` —
            # TODO confirm; this global hand-off is inherently not
            # safe if several documents are processed concurrently
            global text_paper
            text_paper = doc
            matches = extr(text_paper)
            ne = pool_local.starmap(
                get_ne2, zip(matches, [doc for x in range(len(matches))]))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)
            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception:
        # FIX: was a bare ``except:`` — narrowed so KeyboardInterrupt and
        # SystemExit are no longer swallowed
        print('Ошибка! Примерный номер =', '?')
    pool_local.close()
    pool_local.join()
    if len(ne_full) != 0:
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]
def get_dates(self):
    """Collect complete dates from ``self.doc_lines``.

    Returns ``[latest, second latest]`` when two or more dates were found,
    ``[date, ""]`` for exactly one, and ``None`` when there are none.
    Incomplete dates (natasha returned ``None`` for a component) are
    logged and skipped.
    """
    logging.info('Extracting dates')
    extractor = DatesExtractor()
    found = []
    for line in self.doc_lines:
        for match in extractor(line):
            fact = match.fact
            try:
                found.append(date(fact.year, fact.month, fact.day))
            except TypeError:
                logging.error(
                    "\"Наташа\" не может распарсить дату в неполном формате %s %s %s",
                    fact.year, fact.month, fact.day)
    if not found:
        logging.warning("Даты не найдены")
        return None
    if len(found) == 1:
        logging.warning("В документе указана только одна дата")
        return [found[0], ""]
    newest = max(found)
    found.remove(newest)
    return [newest, max(found)]
'''if len(ne_for_db) != 0: cur.execute('UPDATE public.news_rbc ' 'SET ne=%s, span_ne=%s, type_ne=%s' 'WHERE id=%s;', (ne_for_db, span_ne_for_db, type_ne_for_db, num)) con.commit()''' return [ne_for_db, span_ne_for_db, type_ne_for_db] else: return [0, 0, 0] if __name__ == '__main__': time_begin = time() # экстракторы extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(), OrganisationExtractor(), PersonExtractor()] send_to_db_news_rbc(extractors) # 292.92 секунды # ошибки: 6911;7168;7561;8246;8539;8691;9211 exit() '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1", port="5432", ) cur = con.cursor() pool = ThreadPool(10) # было ошибок 8 - 2459? 2779 = [] for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал # обработало 5839 строк, из них 120 строк не обработаных cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,)) data = cur.fetchall() docs = [x[0] for x in data] #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)])) new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
from ipymarkup import AsciiMarkup, Span, BoxMarkup import re import json from natasha import (NamesExtractor, AddressExtractor, DatesExtractor, MoneyExtractor, OrganisationExtractor, LocationExtractor) from natasha.markup import show_markup, show_json extractors = [ NamesExtractor(), AddressExtractor(), DatesExtractor(), MoneyExtractor(), OrganisationExtractor(), LocationExtractor() ] from flask import Flask from flask import request app = Flask(__name__) @app.route('/getFacts', methods=['POST']) def getFacts(): print(request.is_json) content = request.get_json() text = content['text'] facts = {}
import pandas as pd import numpy as np from natasha import (MorphVocab, DatesExtractor) data = pd.read_table('/content/overhumanized-dev-fp.tsv') vocab = MorphVocab() extractor = DatesExtractor(vocab) result_data = [] def get_date_from_string(s): res = [] matches = [x for x in extractor(s)] for mch in matches: result = "" y = mch.__dict__['fact'].__dict__['year'] # m = mch.__dict__['fact'].__dict__['month'] # d = mch.__dict__['fact'].__dict__['day'] if y is not None: result += str(y) # if m is not None: # if m//10 == 0: # result += "-0"+str(m) # else: # result += "-"+str(m) # if d is not None: # if d//10 == 0:
"мефодий", "спасибо", ] + stopwords.words("russian") STOP_WORDS += ["не_" + x for x in STOP_WORDS] STOP_WORDS += ["не"] # местоимение-существительное, предлог, союз, частица, междометие STOP_POS = ["NPRO", "PREP", "CONJ", "PRCL", "INTJ"] CHAR_TABLE = str.maketrans({ key: " " for key in string.punctuation.replace("<", "").replace(">", "") + "…?‘«»‘♂️”“’[]'™" }) NERS = [ (DatesExtractor(), "<date>"), (TimeExtractor(), "<time>"), (ExtraDatesExtractor(), "<date>"), (MoneyExtractor(), "<money>"), (PhoneExtractor(), "<phone>"), (PhotoExtractor(), "<photo>"), (StickerExtractor(), "<sticker>"), (LinkExtractor(), "<url>"), (AddressExtractor(), "<address>"), (NamesExtractor(), "<name>"), (NumberExtractor(), "<number>"), (CensorExtractor(), "<censored>"), ] class EntityExtractor(Component):
def set_dates_extractor():
    """Install a shared DatesExtractor instance on the Utility class."""
    Utility.dates_extractor = DatesExtractor()
re.compile( r'кодекс[а-я]*\s+(об\s+)?административн[а-я]*\s+правонарушени[а-я]*', re.IGNORECASE | re.MULTILINE): 'КоАП', } abbrs = [ (make_regex(r'обществ\w+ с ограниченной ответственностью'), 'ООО'), (make_regex(r'открыт\w+ акционерн\w+ обществ\w+'), 'ОАО'), (make_regex(r'закрыт\w+ акционерн\w+ обществ\w+'), 'ЗАО'), (make_regex(r'публичн\w+ акционерн\w+ обществ\w+'), 'ПАО'), (make_regex(r'акционерн\w+ обществ\w+'), 'АО'), (make_regex(r'федеральн\w+ казенн\w+ учрежден\w+'), 'ФКУ'), ] money = MoneyExtractor() dates = DatesExtractor() org = OrganisationExtractor() CAP_SPACES = re.compile(r'\s((?:[А-Я]\s+){2,}[А-Я][^\w])', re.IGNORECASE) def fix_cap_spaces(text: str): return CAP_SPACES.sub(lambda m: ' ' + m.group(1).replace(' ', '') + ' ', text) def remove_newlines(text: str): regex = re.compile(r'([а-яА-Я,"«»()0-9])\s*\n+', re.MULTILINE) return regex.sub(r'\1 ', text)
def extractor_date(text: str) -> List[Any]:
    """Return all date matches natasha finds in *text*.

    FIX: ``[match for match in it]`` is just a copy — ``list(it)`` says so
    directly (ruff C416/PERF402).
    """
    return list(DatesExtractor()(text))
from flask import Flask, request, jsonify
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

app = Flask(__name__)

# todo: LocationExtractor works poorly and needs tuning, but it can find
# todo: cities, countries and regions. AddressExtractor locates and
# todo: renders cities better, but misses countries and regions; it also
# todo: finds streets and houses, which should be filtered out of the
# todo: output and the rest merged with the LocationExtractor results.
names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Find named entities in the text.

    :param ner_extractor: a NamesExtractor, AddressExtractor,
        DatesExtractor or MoneyExtractor instance
    :param text: str
    :return: list of namedtuples
    """
    return [match.fact.as_json for match in ner_extractor(text)]
def dates_extractor(morph_vocab):
    """Build a DatesExtractor on top of the given morphology vocabulary."""
    extractor = DatesExtractor(morph_vocab)
    return extractor
import re import maya from natasha import DatesExtractor from photo_load.models import Photo from rest_framework.response import Response from rest_framework.status import (HTTP_200_OK, HTTP_204_NO_CONTENT) from rest_framework.viewsets import ViewSet from .parser import rel_date_parser year_regexp = re.compile(r'[\d]{2,4}') month_year_regexp = re.compile(r'[\d]{1,2}(\.|\s)[\d]{2,4}') full_date_regexp = re.compile(r'([\d]{1,2}(\.|\s)){2}[\d]{2,4}') natasha_extractor = DatesExtractor() def exact_day(field, photos): day, month, year = 0, 0, 0 now = maya.now() result = natasha_extractor(field) if len(result) == 0: if year_regexp.match(field): year = int(year_regexp.findall(field)[0]) elif month_year_regexp.match(field): month, year = map(int, re.findall(r'[\d]+', field)) elif full_date_regexp.match(field): day, month, year = map(int, re.findall(r'[\d]+', field)) elif field.find('сегодня') != -1: return photos.filter(photoinfo__time_created__date=now.date)
def extractor():
    """Provide a fresh natasha DatesExtractor instance."""
    dates = DatesExtractor()
    return dates
NewsSyntaxParser, NewsNERTagger, DatesExtractor, ) from natasha.extractors import Match from natasha_utils.helpers import find_dates_as_word, parse_natasha_date_to_datetime segmenter = Segmenter() morph_vocab = MorphVocab() emb = NewsEmbedding() morph_tagger = NewsMorphTagger(emb) syntax_parser = NewsSyntaxParser(emb) ner_tagger = NewsNERTagger(emb) dates_extractor = DatesExtractor(morph_vocab) class NatashaExtractor: def __init__(self, text: str): self.doc = Doc(text) self.doc.segment(segmenter) self.doc.tag_morph(morph_tagger) self.doc.parse_syntax(syntax_parser) self.doc.tag_ner(ner_tagger) for span in self.doc.spans: span.normalize(morph_vocab) def find_locations(self) -> List[str]: locations = list( filter(lambda span: span.type == 'LOC', self.doc.spans))
def __init__(self):
    """Instantiate the natasha extractors this wrapper delegates to."""
    # one extractor per entity kind: postal addresses, organisations, dates
    self.address = AddressExtractor()
    self.org = OrganisationExtractor()
    self.dates = DatesExtractor()
def NEL_extraction_for_str(str):
    """Replace person names and dates in *str* with placeholder tokens.

    Names become combinations of ``_фамилия`` / ``_имя`` / ``_отчество``
    and dates combinations of ``_день`` / ``_месяц`` / ``_год``, depending
    on which components natasha extracted.

    FIX: the original opened "Синонимы/Сотрудники библиотеки ФИО.txt" into
    a handle that was never read (the matching loop was commented out) and
    never closed — a pure resource leak; the dead code was removed.

    NOTE(review): the parameter shadows the builtin ``str``; kept as-is so
    the public signature does not change.
    """
    extractor_names = NamesExtractor()
    extractor_dates = DatesExtractor()
    new_str = str

    # --- person names ----------------------------------------------------
    for match in extractor_names(str):
        start, stop = match.span
        first = match.fact.first
        middle = match.fact.middle
        last = match.fact.last
        substr = str[start:stop]
        # placeholder word order intentionally mirrors the original chain
        if first and last and middle:
            new_str = substitution_for_str(new_str, substr, '_фамилия _имя _отчество')
        elif last and middle:
            new_str = substitution_for_str(new_str, substr, '_фамилия _отчество')
        elif first and last:
            new_str = substitution_for_str(new_str, substr, '_имя _фамилия')
        elif first and middle:
            new_str = substitution_for_str(new_str, substr, '_имя _отчество')
        elif first:
            new_str = substitution_for_str(new_str, substr, '_имя')
        elif middle:
            new_str = substitution_for_str(new_str, substr, '_отчество')
        elif last:
            new_str = substitution_for_str(new_str, substr, '_фамилия')

    # --- dates -----------------------------------------------------------
    for match in extractor_dates(new_str):
        start, stop = match.span
        year = match.fact.year
        month = match.fact.month
        day = match.fact.day
        substr = new_str[start:stop]
        if year and month and day:
            new_str = substitution_for_str(new_str, substr, '_день _месяц _год')
        elif month and day:
            new_str = substitution_for_str(new_str, substr, '_день _месяц')
        elif year and month:
            new_str = substitution_for_str(new_str, substr, '_месяц _год')
        elif year and day:
            new_str = substitution_for_str(new_str, substr, '_день _год')
        elif day:
            new_str = substitution_for_str(new_str, substr, '_день')
        elif month:
            new_str = substitution_for_str(new_str, substr, '_месяц')
        elif year:
            new_str = substitution_for_str(new_str, substr, '_год')
    return new_str
def extractor_date_str(text: str) -> List[str]:
    """Return the raw substrings of *text* natasha recognised as dates.

    FIX: replaced ``text.__getitem__(slice(*match.span))`` with normal
    subscription — dunder calls are an anti-idiom for plain indexing.
    """
    return [text[slice(*match.span)] for match in DatesExtractor()(text)]