def main():
    """Collect accident news and store the items that contain an address.

    News items are gathered from every configured site through ``get_news``.
    Each item for which ``find_address_in_news`` returns a truthy value is
    converted into a ``News`` record, added to the database session and
    committed straight away (one commit per record).
    """
    news_sites = {
        'm24.ru': M24_accidents,
        'mosday.ru': Mosday_accidents,
        'vm.ru': VM_accidents
    }

    # Initialise the Natasha address extractor once; it is shared below.
    extractor = AddressExtractor()

    # Gather the news of every site into a temporary in-memory list.
    news_list = []
    for site in news_sites:
        news_list.extend(get_news(news_sites, site, extractor))

    for item in news_list:
        # Skip items in which no address was found.
        if not find_address_in_news(item, extractor):
            continue

        title = item['title']
        link = item['link']
        # The item carries time and date as separate strings.
        published = datetime.strptime(
            ' '.join((item['time'], item['date'])),
            '%H:%M %d.%m.%Y')
        text = item['text']
        address = item['location']['address']
        street = item['location']['street']
        lat = item['location']['coordinates'][0][0]
        lon = item['location']['coordinates'][0][1]

        record = News(title=title,
                      link=link,
                      date_and_time=published,
                      text=text,
                      address=address,
                      street=street,
                      lat=lat,
                      lon=lon)
        db.session.add(record)
        db.session.commit()
Example #2
0
 def test_nlp(self, text):
     """Run the address extractor over each line of ``text`` and print results.

     For every line the spans of the matches are rendered with
     ``show_markup`` and the matched facts are printed through
     ``format_json``.

     :param text: multi-line string to analyse.
     """
     lines = text.splitlines()
     extractor = AddressExtractor()
     for line in lines:
         found = extractor(line)

         # First pass: where each match sits in the line.
         spans = []
         for match in found:
             spans.append(match.span)

         # Second pass: the JSON form of each matched fact.
         facts = []
         for match in found:
             facts.append(match.fact.as_json)

         show_markup(line, spans)
         print(format_json(facts))
Example #3
0
    def parse_address(self):
        """Address parser.

        Runs ``AddressExtractor`` over ``self._text``.

        :return: list of strings, one per match whose fact JSON has a
            non-empty ``parts`` entry; each string joins the truthy ``name``
            values of those parts with ``', '``.
        """
        address = []
        for match in AddressExtractor()(self._text):
            parts = match.fact.as_json.get('parts')
            if not parts:
                continue
            # Parts without a name are left out of the joined string.
            names = [part.get('name') for part in parts if part.get('name')]
            address.append(', '.join(names))
        return address
Example #4
0
def main():
    """Collect accident news and print the items that contain an address.

    News items are gathered from every configured site through ``get_news``;
    each item for which ``find_address_in_news`` returns a truthy value is
    printed.
    """
    news_sites = {
        'm24.ru': M24_accidents,
        'mosday.ru': Mosday_accidents,
        'vm.ru': VM_accidents
    }

    # Initialise the Natasha address extractor once; it is shared below.
    extractor = AddressExtractor()

    # Gather the news of every site into a temporary in-memory list.
    news_list = []
    for site in news_sites:
        news_list.extend(get_news(news_sites, site, extractor))

    for item in news_list:
        # Skip items in which no address was found.
        if not find_address_in_news(item, extractor):
            continue
        print(item)
Example #5
0
    def __init__(self):
        """Create the Natasha extractors used by this object.

        ``self.extractors`` holds, in this order, the address, dates,
        location, money, money-range, money-rate, organisation and person
        extractors.
        """
        # Imported inside the constructor so that natasha is only loaded
        # when an instance is actually created.
        from natasha import (DatesExtractor, MoneyExtractor,
                             MoneyRateExtractor, MoneyRangeExtractor,
                             LocationExtractor, AddressExtractor,
                             OrganisationExtractor, PersonExtractor)

        # No names extractor is part of the list, so none is constructed.
        self.extractors = [
            AddressExtractor(),
            DatesExtractor(),
            LocationExtractor(),
            MoneyExtractor(),
            MoneyRangeExtractor(),
            MoneyRateExtractor(),
            OrganisationExtractor(),
            PersonExtractor(),
        ]
Example #6
0
def send_db_pool_map(doc):
    """Extract named entities from ``doc`` with every Natasha extractor.

    Each extractor is run over ``doc``; its matches are then processed on a
    local thread pool by ``get_ne2``, ``get_span_ne`` and ``get_type_ne``.
    If an extractor or helper fails, an error line is printed and the results
    collected from the extractors that already finished are still returned.

    :param doc: text to analyse.
    :return: ``[ne, span_ne, type_ne]`` -- the per-extractor result lists of
        the three helpers concatenated into three flat lists -- or
        ``[0, 0, 0]`` when not a single extractor completed.
    """
    # The module-level ``text_paper`` is reassigned for every extractor.
    # NOTE(review): presumably read by the helper functions -- confirm.
    global text_paper

    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]

    pool_local = ThreadPool(10)
    ne_full = []
    span_ne_full = []
    type_ne_full = []
    try:
        for extr in extractors:
            text_paper = doc
            matches = extr(text_paper)
            # get_ne2 receives (match, doc) pairs; the other helpers only
            # receive the match.
            ne = pool_local.starmap(get_ne2, zip(matches, [doc] * len(matches)))
            span_ne = pool_local.map(get_span_ne, matches)
            type_ne = pool_local.map(get_type_ne, matches)

            ne_full.append(ne)
            span_ne_full.append(span_ne)
            type_ne_full.append(type_ne)
    except Exception:
        # Best effort: report the failure and keep the partial results.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        print('Ошибка! Примерный номер =', '?')
    finally:
        # Always release the worker threads, even on an unexpected exit.
        pool_local.close()
        pool_local.join()

    if not ne_full:
        return [0, 0, 0]

    ne_for_db = reduce(lambda x, y: x + y, ne_full)
    span_ne_for_db = reduce(lambda x, y: x + y, span_ne_full)
    type_ne_for_db = reduce(lambda x, y: x + y, type_ne_full)
    return [ne_for_db, span_ne_for_db, type_ne_for_db]
Example #7
0
from flask import Flask, request, jsonify
from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, LocationExtractor)

# Flask application object for this module.
app = Flask(__name__)

# todo: LocationExtractor works poorly and needs tuning,
# todo: but it is able to find cities, countries and regions.
# todo: AddressExtractor finds and represents cities better,
# todo: but does not find countries and regions. It finds streets and houses;
# todo: these need to be excluded from the output and the results merged with
# todo: the results of LocationExtractor.

# Extractor instances, created once at import time.
names_extractor = NamesExtractor()
address_extractor = AddressExtractor()
dates_extractor = DatesExtractor()
money_extractor = MoneyExtractor()
location_extractor = LocationExtractor()


def find_named_entities(ner_extractor, text):
    """Find named entities in a text.

    :param ner_extractor: callable extractor, e.g. an instance of
        NamesExtractor, AddressExtractor, DatesExtractor or MoneyExtractor
    :param text: str
    :return: list holding the ``fact.as_json`` value of every match
    """
    facts = []
    for match in ner_extractor(text):
        facts.append(match.fact.as_json)
    return facts
Example #8
0
        '''if len(ne_for_db) != 0:
            cur.execute('UPDATE public.news_rbc '
                        'SET ne=%s, span_ne=%s, type_ne=%s'
                        'WHERE id=%s;', (ne_for_db, span_ne_for_db,
                                         type_ne_for_db, num))
            con.commit()'''
        return [ne_for_db, span_ne_for_db, type_ne_for_db]
    else:
        return [0, 0, 0]



if __name__ == '__main__':
    time_begin = time()
    # экстракторы
    extractors = [AddressExtractor(), DatesExtractor(), LocationExtractor(), MoneyExtractor(), NamesExtractor(),
                  OrganisationExtractor(), PersonExtractor()]
    send_to_db_news_rbc(extractors) # 292.92 секунды
    # ошибки: 6911;7168;7561;8246;8539;8691;9211
    exit()
    '''con = psycopg2.connect(database="texts_politics", user="******", password="******", host="127.0.0.1",
                           port="5432", )
    cur = con.cursor()
    pool = ThreadPool(10) # было ошибок 8  - 2459? 2779 = []
    for i in tqdm(range(5059,9540,10)): # 296.92 секунды # с3347по3357 не делал
        # обработало 5839 строк, из них 120 строк не обработаных
        cur.execute("SELECT full_text FROM public.news_rbc ORDER BY id ASC LIMIT 10 OFFSET %s", (i,))
        data = cur.fetchall()
        docs = [x[0] for x in data]
        #new_form = pool.starmap(send_db_pool_map, zip(docs,[i_num for i_num in range(i,i+10)]))
        new_form = pool.map(send_db_pool_map,docs) # 281.43 секунды | 293.59
Example #9
0
def evaluateInput(input_sentence='',
                  encoder=encoder,
                  decoder=decoder,
                  searcher=searcher,
                  voc=voc):
    """Answer a user message with a "найти" (find) lookup or the chat model.

    When the lower-cased message contains "найти":

    * exactly one address match -> a Google Maps URL built from its parts;
    * otherwise exactly one name match -> a ``--list`` of VK people-search
      links when both first and last name are present, else the not-found
      message;
    * otherwise -> a ``--list`` of web-search links for the text that follows
      the last "найти" in the message.

    Any other message is normalised with ``proc.normalizeString`` and passed
    to ``evaluate``; ``EOS``/``PAD`` tokens are removed, and if more than one
    adjacent pair of equal words is found the output is cut right after the
    first word of the last such pair.

    :param input_sentence: raw user message.
    :param encoder: passed through to ``evaluate``.
    :param decoder: passed through to ``evaluate``.
    :param searcher: passed through to ``evaluate``.
    :param voc: passed through to ``evaluate``.
    :return: reply string; a fixed fallback phrase when a ``KeyError`` is
        raised while handling the message.
    """
    not_found = 'По вашему запросу ничего не найдено'

    # Parse a reference address once to learn which fact-part class sits at
    # which position; index 2 is the part that carries ``number`` below.
    ex = AddressExtractor()
    line = "найти Санкт-Петербург, улица Федора Абрамова, 9"
    t = {}
    reference = ex(line)
    for i in range(3):
        t[type(reference[0].fact.parts[i])] = i
    try:
        if "найти" in input_sentence.lower().lstrip():
            # Run each extractor once and reuse the result instead of
            # re-parsing the same sentence for every check.
            address_matches = ex(input_sentence)
            if address_matches and len(address_matches) == 1:
                pieces = []
                for part in address_matches[0].fact.parts:
                    # An unknown part class raises KeyError, handled below.
                    if t[type(part)] == 2:
                        if part.number is not None:
                            if part.type is not None:
                                pieces.append(part.type)
                            pieces.append(part.number)
                    elif part.name is not None:
                        if part.type is not None:
                            pieces.append(part.type)
                        # Multi-word names are joined with '+'.
                        pieces.append(part.name.replace(' ', '+'))

                path = 'https://www.google.ru/maps/place/' + ''.join(
                    piece + '+' for piece in pieces)
                # Drop the trailing separator and terminate with '/'.
                return path[:-1] + '/'

            name_matches = NamesExtractor()(input_sentence)
            if name_matches and len(name_matches) == 1:
                fact = name_matches[0].fact
                if fact.first is None or fact.last is None:
                    # Previously this path fell through and returned None.
                    return not_found

                path = f'https://vk.com/search?c%5Bper_page%5D=40&c%5Bphoto%5D=1&c%5Bq%5D={fact.first}%20{fact.last}&c%5Bsection%5D=people'
                rec = requests.get(path)
                vk_mask = 'https://vk.com'
                tree = lxml.html.fromstring(rec.text)
                links = tree.xpath(
                    '//a[@class="simple_fit_item search_item"]/@href')
                if not links:
                    return not_found
                return '--list' + ''.join(
                    vk_mask + link + '\n' for link in links)

            # Generic web search: take the text after the last "найти".
            query = ''
            for i in range(len(input_sentence) - 5):
                if input_sentence[i:i + 5].lower(
                ) == 'найти' and i != len(input_sentence) - 6:
                    query = input_sentence[i + 6:]
            if query == '':
                return not_found

            links = list(
                search(query, tld="co.in", num=10, stop=3, pause=1))
            if not links:
                return not_found
            return '--list' + ''.join(link + '\n' for link in links)

        else:
            input_sentence = proc.normalizeString(input_sentence)
            output_words = evaluate(encoder, decoder, searcher, voc,
                                    input_sentence)
            output_words[:] = [
                x for x in output_words if not (x == 'EOS' or x == 'PAD')
            ]
            # Count adjacent repeats and remember where the last one starts.
            pos = 0
            k = 1
            for i in range(len(output_words) - 1):
                if output_words[i] == output_words[i + 1]:
                    k += 1
                    pos = i + 1
            if k > 2:
                output_words = output_words[:pos]

            return ' '.join(output_words)

    except KeyError:
        return "Мая твая нипанимать :с"
Example #10
0
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re
import json

from natasha import (NamesExtractor, AddressExtractor, DatesExtractor,
                     MoneyExtractor, OrganisationExtractor, LocationExtractor)
from natasha.markup import show_markup, show_json

# Natasha extractor instances, created once at import time.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()
]

from flask import Flask
from flask import request

# Flask application object; the route below is registered on it.
app = Flask(__name__)


@app.route('/getFacts', methods=['POST'])
def getFacts():
    """Handle POST /getFacts: read the ``text`` field of the JSON body.

    NOTE(review): only the beginning of this handler appears to be present;
    as written it creates an empty ``facts`` dict and returns nothing --
    confirm against the complete source.
    """
    # Debug output: whether the request is flagged as JSON.
    print(request.is_json)
    content = request.get_json()

    text = content['text']

    facts = {}
Example #11
0
from ipymarkup import AsciiMarkup, Span, BoxMarkup
import re

from natasha import (
    NamesExtractor,
    AddressExtractor,
    DatesExtractor,
    MoneyExtractor,
    OrganisationExtractor,
    LocationExtractor
)
from natasha.markup import show_markup, show_json

# Natasha extractor instances, created once at import time.
extractors = [
    NamesExtractor(),
    AddressExtractor(),
    DatesExtractor(),
    MoneyExtractor(),
    OrganisationExtractor(),
    LocationExtractor()    
]
# Separate AddressExtractor instance, in addition to the one in the list.
addressExtractor = AddressExtractor()



class TextInfo(object):
    """Value holder pairing a text with the addresses associated with it."""

    def __init__(self, clear_text, addresses):
        """Store both constructor arguments on the instance.

        :param clear_text: text value kept as ``self.clear_text``.
        :param addresses: addresses value kept as ``self.addresses``.
        """
        self.clear_text = clear_text
        # The attribute used to be read instead of assigned, which raised
        # AttributeError on every construction.
        self.addresses = addresses

Example #12
0
def extractor_address_str(text: str) -> List[str]:
    """Return the substrings of ``text`` covered by each address match.

    A new ``AddressExtractor`` is created on every call.
    """
    fragments = []
    for match in AddressExtractor()(text):
        # ``match.span`` is unpacked into the slice bounds.
        span = slice(*match.span)
        fragments.append(text[span])
    return fragments
Example #13
0
 def __init__(self):
     """Create the address, organisation and dates extractors."""
     address_extractor = AddressExtractor()
     organisation_extractor = OrganisationExtractor()
     dates_extractor = DatesExtractor()

     self.address = address_extractor
     self.org = organisation_extractor
     self.dates = dates_extractor
def textClassificator(text):
    """Split free-form contact text into the fields of a ``Person``.

    Phones, e-mails and web sites are found with regular expressions, the
    address with ``AddressExtractor`` and name / surname / patronymic with
    ``classify``. Every recognised piece is removed from the text and what
    remains is stored as ``notes``.

    :param text: raw text (str).
    :return: JSON string built from the ``__dict__`` of the populated
        ``Person``, or ``[]`` when ``Classifier.classifier[0]`` is empty.
    """
    person = Person()

    phones = re.findall(r'([+(]?[1-9][0-9 .\-()]{8,}[0-9])|([0-9.\-()]{7,})',
                        text)
    # findall yields one tuple per match (one slot per group); flatten and
    # keep the non-empty slots.
    person.phone = [value for value in np.array(phones).flatten() if value]

    person.email = re.findall(
        r'([-._a-z0-9]+@(?:[a-z0-9][-a-z0-9]+\.)+[a-z]{2,6})', text)

    symb = re.compile(r'(моб|mob|mail|fax|факс|tel|тел\.|телефон|сайт|site)',
                      re.IGNORECASE)
    symbols = symb.findall(text)
    t = text.split()

    site = re.compile(r'(http:\s*//www.\w+.\w+)|(www.\w+.\w+)', re.IGNORECASE)
    site_list = site.findall(text)
    raw_sites = [value for value in np.array(site_list).flatten() if value]
    person.site = [re.sub(" ", "", value) for value in raw_sites]

    # Drop every whitespace-separated word that contains a contact label.
    for word in t:
        for substr in symbols:
            if substr in word:
                text = text.replace(word, "")

    # Remove every matched site. The old loop indexed only the first half of
    # the flattened (two-slots-per-match) array, so later sites survived.
    for value in raw_sites:
        text = text.replace(value, "")

    for phone in person.phone:
        text = text.replace(phone, "")

    for email in person.email:
        text = text.replace(email, "")

    text = text.replace('\n', " ")
    extractor = AddressExtractor()

    matches = extractor(text)
    facts = [_.fact.as_json for _ in matches]
    if facts:
        # Only the first address match is used.
        address = facts[0]['parts']
        address_pieces = []

        for part in address:
            for key in reversed(part.keys()):
                value = part[key]
                address_pieces.append(value + ' ')
                # Escape the value so regex metacharacters in it are literal.
                text = re.sub(r'\s' + re.escape(value) + r'\s|,', " ", text)
        person.address = ''.join(address_pieces)

    symb = re.compile(
        r'(г\.|город|ул\.|улица|стр\.|строение|д\.|дом|офис|помещение|к\.|корпус)'
    )
    symbols = symb.findall(text)
    t = text.split()

    # Drop every word that contains an address label.
    for word in t:
        for substr in symbols:
            if substr in word:
                text = text.replace(word, "")

    words = text.split()
    if not len(Classifier.classifier[0]):
        return []

    # Keep each classified word next to its result. The old code indexed the
    # unfiltered ``words`` list with positions from the filtered results, so
    # any one-character word shifted the chosen word.
    candidates = [word for word in words if len(word) > 1]
    labelled = [
        classify(Classifier.classifier, get_features(word))
        for word in candidates
    ]

    # Without a hit for a label the first word is used (as before); an empty
    # string stands in when no words are left at all.
    fallback = words[0] if words else ''
    best = {
        'Имя': (-np.inf, fallback),
        'Фамилия': (-np.inf, fallback),
        'Отчество': (-np.inf, fallback),
    }
    for word, (key, value) in zip(candidates, labelled):
        # Strict comparison: the first word with the top score wins.
        if key in best and best[key][0] < value[0]:
            best[key] = (value[0], word)

    person.name = best['Имя'][1]
    person.surname = best['Фамилия'][1]
    person.fathername = best['Отчество'][1]

    # NOTE(review): the leading ``\s|`` alternative replaces every whitespace
    # character with a space; kept as in the original pattern -- confirm that
    # ``\s<name>\s`` was not intended.
    text = re.sub(r'\s|' + re.escape(person.name) + r'\s|,', " ", text)
    text = re.sub(r'\s|' + re.escape(person.surname) + r'\s|,', " ", text)
    text = re.sub(r'\s|' + re.escape(person.fathername) + r'\s|,', " ", text)
    person.notes = text

    return json.dumps(person, default=lambda o: o.__dict__)
Example #15
0
# Translation table mapping punctuation to a space: every character of
# ``string.punctuation`` except "<" and ">", plus the extra symbols in the
# literal below.
CHAR_TABLE = str.maketrans({
    key: " "
    for key in string.punctuation.replace("<", "").replace(">", "") +
    "…?‘«»‘♂️”“’[]'™"
})
# Pairs of (extractor instance, placeholder tag).
# NOTE(review): presumably each tag replaces the text matched by its
# extractor -- confirm against the code that consumes NERS.
NERS = [
    (DatesExtractor(), "<date>"),
    (TimeExtractor(), "<time>"),
    (ExtraDatesExtractor(), "<date>"),
    (MoneyExtractor(), "<money>"),
    (PhoneExtractor(), "<phone>"),
    (PhotoExtractor(), "<photo>"),
    (StickerExtractor(), "<sticker>"),
    (LinkExtractor(), "<url>"),
    (AddressExtractor(), "<address>"),
    (NamesExtractor(), "<name>"),
    (NumberExtractor(), "<number>"),
    (CensorExtractor(), "<censored>"),
]


class EntityExtractor(Component):
    """Recognises named entities."""
    def __init__(self, cache=None, verbose=False, **kwargs):
        """Set up the extractor component.

        :param cache: mapping stored as ``self.cache``; a new empty dict is
            created when ``None`` is given.
        :param verbose: flag stored as ``self.verbose``.
        :param kwargs: extra keyword arguments, stored as ``self.kwargs``.
        """
        super(EntityExtractor, self).__init__()
        self.kwargs = kwargs
        self.verbose = verbose

        # Test for None explicitly: ``cache or {}`` would replace an empty
        # dict supplied by the caller, so the caller's cache was never filled.
        self.cache = {} if cache is None else cache
        self.extractors = NERS
Example #16
0
def extractor():
    """Build and return a new ``AddressExtractor``."""
    address_extractor = AddressExtractor()
    return address_extractor
Example #17
0
def extractor_address(text: str) -> List[Any]:
    """Return every address match found in ``text`` as a list.

    A new ``AddressExtractor`` is created on every call.
    """
    extract = AddressExtractor()
    return list(extract(text))