Esempio n. 1
0
    def promocode_expiration_date(self) -> 'datetime.datetime | None':
        """Extract the promo code expiration date from ``self.text``.

        The last match yielded by the dates extractor is used.

        Returns:
            A ``datetime.datetime`` (midnight) built from that match, or
            ``None`` when no date is found or the match has no day or no
            month. A missing year falls back to the current year.
        """
        dates_extractor = DatesExtractor(MorphVocab())
        matches = list(dates_extractor(self.text))
        if not matches:
            return None

        # NOTE(review): the original comment here read "make order by date",
        # which suggests sorting the matches was planned; today the last
        # match is simply taken as-is.
        fact = matches[-1].fact
        if not fact.month or not fact.day:
            return None

        # Locals are released when the function returns, so no explicit
        # ``del`` is needed; "now" is only looked up when the year is missing.
        year = fact.year or datetime.datetime.now().year
        return datetime.datetime(year, fact.month, fact.day)
Esempio n. 2
0
    def __init__(self, debug=False, off_sleeps=False):
        """
        Asynchronous parser of Avito listings, "coin collecting" section.

        :param debug: enable debug messages
        :param off_sleeps: flag for the sleeps between requests that are used
            to avoid captchas; stored as ``_off_sleeps``
            (NOTE(review): the name suggests ``True`` turns the sleeps off,
            while the original description read "use sleeps" - confirm)
        """
        self._off_sleeps = off_sleeps
        self._loop = uvloop.new_event_loop()
        asyncio.set_event_loop(self._loop)
        self._loop.set_debug(debug)

        # The ``loop`` argument of ``asyncio.Queue`` was deprecated in
        # Python 3.8 and removed in 3.10, where passing it raises TypeError.
        # The loop created above has just been made the current one, so older
        # interpreters still bind the queue to it.
        self._queue = asyncio.Queue()
        self._base_url = 'https://www.avito.ru/moskva/kollektsionirovanie/monety?view=list&p={}'
        self._run_loop = True

        self._log = logging.getLogger(self.__class__.__name__)
        coloredlogs.install(
            fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.DEBUG if debug else logging.INFO)

        self._sess = None
        self._user_agent = UserAgent()

        self._db = shelve.open('./parser.db')

        # Counters, all starting at zero.
        self._n_ads = 0
        self._n_ads_have_year = 0
        self._n_rsfsr_ads = 0
        self._n_rus_empire_ads = 0
        self._dates_extractor = DatesExtractor()
Esempio n. 3
0
def guess_date(doc):
    """Return the latest date mentioned in *doc* as ``YYYY-MM-DD``.

    An empty string is returned when the extractor reports no matches.
    """
    found = DatesExtractor()(doc)
    if not found:
        return ''
    latest = max([date_fact_to_date(item.fact) for item in found])
    return latest.strftime('%Y-%m-%d')
Esempio n. 4
0
 def get_extractors(cls):
     """Return the extractors cached on *cls*, building them on first use.

     When ``cls._extractors`` is missing or empty, a ``DatesExtractor`` and a
     ``MoneyExtractor`` are created with the vocabulary returned by
     ``cls.get_morph_vocab()`` and stored as ``cls._extractors``.
     """
     cached = getattr(cls, "_extractors", None)
     if cached:
         return cached

     vocab = cls.get_morph_vocab()
     built = [DatesExtractor(vocab), MoneyExtractor(vocab)]
     cls._extractors = built
     return built
Esempio n. 5
0
def NEL_extraction():
    """Process every topic file line by line with ``NEL_extraction_for_str``.

    For each of the ``NUM_OF_TOPICS`` indexes, every line of the input file
    from ``open_initial_file_r()`` is transformed and written to the file at
    the same index from ``open_natasha_file_w()``. Both files of a topic are
    closed once that topic has been processed, even if processing fails.
    """
    files = open_initial_file_r()
    files_w = open_natasha_file_w()
    for j in range(NUM_OF_TOPICS):
        try:
            for line in files[j]:
                files_w[j].write(NEL_extraction_for_str(line))
        finally:
            files[j].close()
            # The output handle used to be left open, so its buffered data
            # was only flushed whenever the object happened to be collected.
            files_w[j].close()
Esempio n. 6
0
    def parse_date(self):
        """Extract the dates mentioned in ``self._text``.

        :return: list with one dict per match, holding the keys ``'day'``,
            ``'month'`` and ``'year'``; a component missing from the match's
            JSON form is ``None``
        """
        extractor = DatesExtractor()
        # Read ``as_json`` once per match instead of once per key.
        facts = (match.fact.as_json for match in extractor(self._text))
        return [
            {key: fact.get(key) for key in ('day', 'month', 'year')}
            for fact in facts
        ]
Esempio n. 7
0
def get_extractor(extract_type):
    """Create the extractor that corresponds to *extract_type*.

    Recognised values are ``"name"``, ``"location"``, ``"date"`` and
    ``"money"``; any other value yields ``None``.
    """
    if extract_type == "name":
        return NamesExtractor()  # names
    if extract_type == "location":
        return LocationExtractor()  # places
    if extract_type == "date":
        return DatesExtractor()  # dates
    if extract_type == "money":
        return MoneyExtractor()  # monetary amounts
    return None
Esempio n. 8
0
def guess_date(doc):
    """Return the latest plausible date mentioned in *doc* as ``YYYY-MM-DD``.

    Only dates later than ``dateparser.parse('2010')`` and earlier than the
    current moment are considered. An empty string is returned when no date
    qualifies or when any step raises; in the latter case the exception is
    printed.
    """
    res = ""
    try:
        matches = DatesExtractor()(doc)
        if matches:
            # Both bounds are invariant for the whole filter: evaluate them
            # once instead of once per extracted date.
            upper = datetime.today()
            lower = dateparser.parse('2010')
            dates = [date_fact_to_date(match.fact) for match in matches]
            dates = [dat for dat in dates if dat < upper and dat > lower]
            if dates:
                res = max(dates).strftime('%Y-%m-%d')
    except Exception as ex:
        # Deliberately best-effort: a failure yields "" rather than a crash.
        print(ex)
    return res
Esempio n. 9
0
def natasha_res(text):
    """Collect the entity mentions that the extractors find in *text*.

    Returns:
        dict with the keys ``'names'``, ``'locations'``, ``'organisations'``,
        ``'dates'`` and ``'money'``; each value is the set of substrings of
        *text* covered by that extractor's matches.
    """
    def matched_text(match):
        """Return the slice of *text* covered by ``match.span``."""
        return text[match.span[0]:match.span[1]]

    # ``dates_mapper`` was never defined alongside the other mappers, which
    # were all aliases of the same span-to-text function; one shared helper
    # now serves every entity kind.
    extractors = {
        'names': NamesExtractor(),
        'locations': LocationExtractor(),
        'organisations': OrganisationExtractor(),
        'dates': DatesExtractor(),
        'money': MoneyExtractor(),
    }
    return {
        kind: {matched_text(match) for match in extractor(text)}
        for kind, extractor in extractors.items()
    }
Esempio n. 10
0
def get_date(text):
    """Classify *text* as referring to tomorrow, today, or neither.

    The text is lower-cased first. ``"завтра"`` is returned when it contains
    ``'завтр'`` or when the module-level ``tomorrow`` value occurs in the
    string form of the extracted date matches; ``"сегодня"`` when it contains
    ``'сегодня'`` or ``'сейчас'`` or when ``today`` occurs there; otherwise
    ``None``. (``tomorrow`` and ``today`` are defined outside this function;
    presumably they are string forms of those dates - confirm.)
    """
    from natasha import DatesExtractor, MorphVocab

    text = text.lower()
    if 'завтр' in text:
        return "завтра"

    # The former Doc segmentation/tagging/parsing pass produced a value that
    # was never read, so it is dropped. The extractor now runs once and its
    # textual form is shared by both checks.
    found = str(list(DatesExtractor(MorphVocab())(text)))
    if tomorrow in found:
        return "завтра"
    if 'сегодня' in text or 'сейчас' in text or today in found:
        return "сегодня"
    return None
Esempio n. 11
0
    def __init__(self):
        """Create the natasha extractors and keep them in ``self.extractors``.

        The list holds, in order: address, dates, location, money, money
        range, money rate, organisation and person extractors.
        """
        from natasha import (AddressExtractor, DatesExtractor,
                             LocationExtractor, MoneyExtractor,
                             MoneyRangeExtractor, MoneyRateExtractor,
                             OrganisationExtractor, PersonExtractor)

        # A SimpleNamesExtractor used to be built here as well, but it was
        # never added to the list, so its construction was wasted work.
        self.extractors = [
            AddressExtractor(),
            DatesExtractor(),
            LocationExtractor(),
            MoneyExtractor(),
            MoneyRangeExtractor(),
            MoneyRateExtractor(),
            OrganisationExtractor(),
            PersonExtractor(),
        ]
Esempio n. 12
0
def extract_date(text):
    """Parse a date and a time of day out of *text*.

    When the extractor finds no date, the words ``'сегодня'`` and
    ``'завтра'`` are recognised and resolved against the current moment in
    the ``time_zone`` zone. Every extracted date fragment is removed from
    the text before ``find_time`` looks for the time in what remains.

    Returns:
        ``(year, day, month, hour, minute)`` - note that the day comes
        before the month - or the string ``"Не могу распознать дату"`` when
        no date can be recognised. ``hour`` and ``minute`` are ``'00'`` when
        ``find_time`` returns ``None``. With several matches, the date
        components of the last one are returned.
    """
    year = None
    day = None
    month = None
    text = text.strip()
    matches = DatesExtractor()(text)
    if len(matches.matches) == 0:
        if 'сегодня' in text:
            date = datetime.now(timezone(time_zone))
        elif 'завтра' in text:
            date = datetime.now(timezone(time_zone)) + timedelta(days=1)
        else:
            return "Не могу распознать дату"
        year, month, day = date.year, date.month, date.day
    else:
        # Spans refer to the text the extractor saw, so fragments are sliced
        # from this snapshot rather than from the progressively edited text.
        source = text
        for match in matches:
            start, stop = match.span
            fact = match.fact
            if fact.year is None:
                # Same clock as the 'сегодня'/'завтра' branches above.
                year = datetime.now(timezone(time_zone)).year
            else:
                year = fact.year
            day = fact.day
            month = fact.month
            # Literal removal: the fragment is plain text, not a regex.
            text = text.replace(source[start:stop], '')
    parsed_time = find_time(text)
    if parsed_time is None:
        hour = '00'
        minute = '00'
    else:
        hour, minute = parsed_time
    return year, day, month, hour, minute
Esempio n. 13
0
def send_db_pool_map(doc):
    """Run every extractor over *doc* and gather the named entities.

    For each extractor the matches are mapped through ``get_ne2``,
    ``get_span_ne`` and ``get_type_ne`` on a pool of 10 threads. If an
    error occurs, a message is printed and the results gathered so far are
    kept.

    Returns:
        ``[entities, spans, types]``, each the concatenation of the
        per-extractor lists, or ``[0, 0, 0]`` when no extractor completed.
    """
    global text_paper

    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(),
                  MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]

    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            # Published as a module global on every pass, as before;
            # presumably the helper functions read it - confirm.
            text_paper = doc
            matches = extr(text_paper)
            ne = pool_local.starmap(
                get_ne2, [(match, doc) for match in matches])
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)

            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        print('Ошибка! Примерный номер =', '?')
    finally:
        pool_local.close()
        pool_local.join()

    if len(ne_full) != 0:
        ne_for_db = reduce(lambda x, y: x + y, ne_full)
        span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
        type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]
Esempio n. 14
0
 def get_dates(self):
     """Collect the complete dates found in ``self.doc_lines``.

     Matches for which ``date(year, month, day)`` raises ``TypeError``
     (e.g. a component is ``None``) are logged and skipped.

     Returns:
         ``[latest, second_latest]`` when at least two dates were built,
         ``[only_date, ""]`` when exactly one was, and ``None`` otherwise.
     """
     logging.info('Extracting dates')
     extractor = DatesExtractor()
     res = []
     for line in self.doc_lines:
         for match in extractor(line):
             fact = match.fact
             try:
                 res.append(date(fact.year, fact.month, fact.day))
             except TypeError:
                 # Arguments are passed lazily; the emitted text is unchanged.
                 logging.error(
                     "\"Наташа\" не может распарсить дату в неполном формате %s %s %s",
                     fact.year, fact.month, fact.day)
     if len(res) > 1:
         latest = max(res)
         res.remove(latest)
         return [latest, max(res)]
     if len(res) == 1:
         logging.warning("В документе указана только одна дата")
         return [res[0], ""]
     logging.warning("Даты не найдены")
     return None
Esempio n. 15
0
        '''if len(ne_for_db) != 0:
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;', (ne_for_db, span_ne_for_db,
                                         type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]



if __name__ == '__main__':
    time_begin = time()
    # экстракторы
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]
    send_to_db_news_rbc(extractors) # 292.92 секунды
    # ошибки: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1",
                           port="5432", )
    cur = con.cursor()
    pool = ThreadPool(10) # было ошибок 8  - 2459? 2779 = []
    for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал
        # обработало 5839 строк, из них 120 строк не обработаных
        cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
        data = cur.fetchall()
        docs = [x[0] for x in data]
        #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)]))
        new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
Esempio n. 16
0
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json

from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# Extractor instances created once, when the module is imported.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    print(request.is_json)
    content = request.get_json()

    text = content['text']

    facts = {}
Esempio n. 17
0
import pandas as pd
import numpy as np

from natasha import (MorphVocab, DatesExtractor)

# Input table, read from a fixed path when the module is imported.
data = pd.read_table('/content/overhumanized-dev-fp.tsv')

# One vocabulary and one dates extractor shared by the code below.
vocab = MorphVocab()
extractor = DatesExtractor(vocab)

# Starts empty; NOTE(review): presumably filled by later code - confirm.
result_data = []


def get_date_from_string(s):
    res = []
    matches = [x for x in extractor(s)]
    for mch in matches:
        result = ""
        y = mch.__dict__['fact'].__dict__['year']
        #     m = mch.__dict__['fact'].__dict__['month']
        #    d = mch.__dict__['fact'].__dict__['day']

        if y is not None:
            result += str(y)
    #       if m is not None:
    #          if m//10 == 0:
    #             result += "-0"+str(m)
    #        else:
    #           result += "-"+str(m)
    #      if d is not None:
    #         if d//10 == 0:
Esempio n. 18
0
    "мефодий",
    "спасибо",
] + stopwords.words("russian")
STOP_WORDS += ["не_" + x for x in STOP_WORDS]
STOP_WORDS += ["не"]

# noun-like pronoun, preposition, conjunction, particle, interjection
STOP_POS = ["NPRO", "PREP", "CONJ", "PRCL", "INTJ"]

CHAR_TABLE = str.maketrans({
    key: " "
    for key in string.punctuation.replace("<", "").replace(">", "") +
    "…?‘«»‘♂️”“’[]'™"
})
# (extractor instance, tag) pairs, built once at import time.
# NOTE(review): presumably each match of an extractor is replaced by its
# tag, in this order - confirm in the code that consumes NERS.
NERS = [
    (DatesExtractor(), "<date>"),
    (TimeExtractor(), "<time>"),
    (ExtraDatesExtractor(), "<date>"),
    (MoneyExtractor(), "<money>"),
    (PhoneExtractor(), "<phone>"),
    (PhotoExtractor(), "<photo>"),
    (StickerExtractor(), "<sticker>"),
    (LinkExtractor(), "<url>"),
    (AddressExtractor(), "<address>"),
    (NamesExtractor(), "<name>"),
    (NumberExtractor(), "<number>"),
    (CensorExtractor(), "<censored>"),
]


class EntityExtractor(Component):
 def set_dates_extractor():
     """Create a ``DatesExtractor`` and store it as ``Utility.dates_extractor``."""
     extractor = DatesExtractor()
     Utility.dates_extractor = extractor
Esempio n. 20
0
    re.compile(
        r'кодекс[а-я]*\s+(об\s+)?административн[а-я]*\s+правонарушени[а-я]*', re.IGNORECASE | re.MULTILINE):
    'КоАП',
}

# (pattern, abbreviation) pairs for the long names of legal-entity forms.
# The generic joint-stock pattern is listed after the open/closed/public
# variants; NOTE(review): presumably so the specific forms are tried
# first - confirm where ``abbrs`` is consumed.
abbrs = [
    (make_regex(r'обществ\w+ с ограниченной ответственностью'), 'ООО'),
    (make_regex(r'открыт\w+ акционерн\w+ обществ\w+'), 'ОАО'),
    (make_regex(r'закрыт\w+ акционерн\w+ обществ\w+'), 'ЗАО'),
    (make_regex(r'публичн\w+ акционерн\w+ обществ\w+'), 'ПАО'),
    (make_regex(r'акционерн\w+ обществ\w+'), 'АО'),
    (make_regex(r'федеральн\w+ казенн\w+ учрежден\w+'), 'ФКУ'),
]

# Extractor instances created once, when the module is imported.
money = MoneyExtractor()
dates = DatesExtractor()
org = OrganisationExtractor()

# Whitespace, then at least three single Cyrillic letters separated by
# whitespace, then one non-word character. IGNORECASE makes the letter
# class match lower-case letters as well.
CAP_SPACES = re.compile(r'\s((?:[А-Я]\s+){2,}[А-Я][^\w])', re.IGNORECASE)


def fix_cap_spaces(text: str):
    """Join letter-spaced words such as ``П Р И К А З`` back together.

    Every ``CAP_SPACES`` match is replaced by its captured group with the
    space characters removed, wrapped in one space on each side. Only
    ``' '`` is removed; other whitespace inside the group is kept.
    """
    def _collapse(found):
        squeezed = found.group(1).replace(' ', '')
        return ' ' + squeezed + ' '

    return CAP_SPACES.sub(_collapse, text)


def remove_newlines(text: str):
    """Replace line breaks that follow running text with a single space.

    A break is replaced when it comes after a Cyrillic letter, a digit or
    one of ``, " « » ( )``, optionally followed by whitespace; that
    character is kept and the whitespace and newlines become one space.
    """
    return re.sub(r'([а-яА-Я,"«»()0-9])\s*\n+', r'\1 ', text,
                  flags=re.MULTILINE)

Esempio n. 21
0
def extractor_date(text: str) -> List[Any]:
    """Return every match that a fresh ``DatesExtractor`` yields for *text*."""
    found = DatesExtractor()(text)
    return list(found)
Esempio n. 22
0
from flask import Flask, request, jsonify
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

app = Flask(__name__)

# todo: LocationExtractor performs poorly and needs tuning,
# todo: but it can find cities, countries and regions.
# todo: AddressExtractor finds and represents cities better,
# todo: but does not find countries or regions. It finds streets and
# todo: houses; these need to be excluded from the output and the results
# todo: merged with those of LocationExtractor.

names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Find named entities in the text.

    :param ner_extractor: a NamesExtractor, AddressExtractor,
    DatesExtractor or MoneyExtractor instance
    :param text: str
    :return: list with the ``fact.as_json`` value of every match
    """
    entities = []
    for match in ner_extractor(text):
        entities.append(match.fact.as_json)
    return entities
Esempio n. 23
0
def dates_extractor(morph_vocab):
    """Build a ``DatesExtractor`` that uses the given *morph_vocab*."""
    instance = DatesExtractor(morph_vocab)
    return instance
Esempio n. 24
0
import re

import maya
from natasha import DatesExtractor
from photo_load.models import Photo
from rest_framework.response import Response
from rest_framework.status import (HTTP_200_OK, HTTP_204_NO_CONTENT)
from rest_framework.viewsets import ViewSet

from .parser import rel_date_parser

# Two to four digits.
year_regexp = re.compile(r'[\d]{2,4}')
# One or two digits, a dot or a whitespace character, then two to four digits.
month_year_regexp = re.compile(r'[\d]{1,2}(\.|\s)[\d]{2,4}')
# Two "digits + separator" groups followed by two to four digits.
full_date_regexp = re.compile(r'([\d]{1,2}(\.|\s)){2}[\d]{2,4}')
# Single extractor instance created when the module is imported.
natasha_extractor = DatesExtractor()


def exact_day(field, photos):
    day, month, year = 0, 0, 0
    now = maya.now()

    result = natasha_extractor(field)
    if len(result) == 0:
        if year_regexp.match(field):
            year = int(year_regexp.findall(field)[0])
        elif month_year_regexp.match(field):
            month, year = map(int, re.findall(r'[\d]+', field))
        elif full_date_regexp.match(field):
            day, month, year = map(int, re.findall(r'[\d]+', field))
        elif field.find('сегодня') != -1:
            return photos.filter(photoinfo__time_created__date=now.date)
Esempio n. 25
0
def extractor():
    """Return a newly created ``DatesExtractor``."""
    instance = DatesExtractor()
    return instance
    NewsSyntaxParser,
    NewsNERTagger,
    DatesExtractor,
)
from natasha.extractors import Match
from natasha_utils.helpers import find_dates_as_word, parse_natasha_date_to_datetime

# Pipeline components created once, when the module is imported.
segmenter = Segmenter()
morph_vocab = MorphVocab()

# A single embedding instance is passed to all three taggers/parsers.
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

dates_extractor = DatesExtractor(morph_vocab)


class NatashaExtractor:
    def __init__(self, text: str):
        """Run the processing pipeline over *text* and keep it in ``self.doc``.

        The document is segmented, morphologically tagged, syntactically
        parsed and NER-tagged, in that order, using the module-level
        components; every resulting span is then normalised with
        ``morph_vocab``.
        """
        doc = Doc(text)
        self.doc = doc
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.parse_syntax(syntax_parser)
        doc.tag_ner(ner_tagger)
        for entity in doc.spans:
            entity.normalize(morph_vocab)

    def find_locations(self) -> List[str]:
        locations = list(
            filter(lambda span: span.type == 'LOC', self.doc.spans))
Esempio n. 27
0
 def __init__(self):
     """Create the address, organisation and dates extractors."""
     self.address = AddressExtractor()
     self.org = OrganisationExtractor()
     self.dates = DatesExtractor()
Esempio n. 28
0
def _name_placeholder(first, middle, last):
    """Return the placeholder for the name parts present, or ``None``."""
    if first and last and middle:
        return '_фамилия _имя _отчество'
    if last and middle:
        return '_фамилия _отчество'
    if first and last:
        return '_имя _фамилия'
    if first and middle:
        return '_имя _отчество'
    if first:
        return '_имя'
    if middle:
        return '_отчество'
    if last:
        return '_фамилия'
    return None


def _date_placeholder(year, month, day):
    """Return the placeholder for the date parts present, or ``None``."""
    if year and month and day:
        return '_день _месяц _год'
    if month and day:
        return '_день _месяц'
    if year and month:
        return '_месяц _год'
    if year and day:
        return '_день _год'
    if day:
        return '_день'
    if month:
        return '_месяц'
    if year:
        return '_год'
    return None


def NEL_extraction_for_str(str):
    """Replace person names and dates in a string with placeholders.

    Names are located in the original string first; dates are then located
    in the string that results from the name substitutions. Each matched
    fragment is passed to ``substitution_for_str`` together with a
    placeholder that reflects which parts (first/middle/last name, or
    day/month/year) the match carries.

    :param str: the input text (the parameter name shadows the builtin and
        is kept only for compatibility with existing callers)
    :return: the text with the substitutions applied
    """
    # NOTE: the employee-name file used to be opened here, but it was only
    # read by code that had been commented out, and the handle was never
    # closed. The file is therefore no longer opened.
    source = str
    new_str = source
    for match in NamesExtractor()(source):
        start, stop = match.span
        placeholder = _name_placeholder(
            match.fact.first, match.fact.middle, match.fact.last)
        if placeholder:
            new_str = substitution_for_str(
                new_str, source[start:stop], placeholder)

    # Date spans refer to the text handed to the extractor, so fragments
    # are sliced from this snapshot; slicing the progressively rewritten
    # string gave wrong fragments after the first substitution.
    dated_source = new_str
    for match in DatesExtractor()(dated_source):
        start, stop = match.span
        placeholder = _date_placeholder(
            match.fact.year, match.fact.month, match.fact.day)
        if placeholder:
            new_str = substitution_for_str(
                new_str, dated_source[start:stop], placeholder)
    return new_str
Esempio n. 29
0
def extractor_date_str(text: str) -> List[str]:
    """Return the substring of *text* covered by each date match."""
    fragments = []
    for match in DatesExtractor()(text):
        fragments.append(text[slice(*match.span)])
    return fragments