Ejemplo n.º 1
0
def emb():
    """Build a tiny hand-crafted Navec fixture: 3 words, 6 dims, 2 PQ chunks."""
    # Decoded vectors (dim split into 2 chunks of 3):
    # 1 0 0 | 1 0 0
    # 0 1 1 | 0 0 0
    # 0 0 0 | 0 1 0
    indexes = np.array(
        [[0, 1],
         [1, 0],
         [2, 2]],
        dtype=np.uint8
    )  # vectors x qdim
    codes = np.array(
        [[[1, 0, 0], [0, 1, 1], [0, 0, 0]],
         [[0, 0, 0], [1, 0, 0], [0, 1, 0]]],
        dtype=np.float32
    )  # qdim x centroids x chunk
    pq = PQ(
        vectors=3,
        dim=6,
        qdim=2,
        centroids=3,
        indexes=indexes,
        codes=codes,
    )
    vocab = Vocab(words=['a', 'b', 'c'], counts=[1, 2, 3])
    meta = Meta(id='test_1B_3k_6d_2q')
    return Navec(meta, vocab, pq)
Ejemplo n.º 2
0
 def __init__(self):
     """Load Navec embeddings plus Slovnet syntax and morphology models.

     NOTE(review): assumes the three .tar packs already exist under
     ``Loc.dependencies_path`` -- confirm they are downloaded beforehand.
     """
     self.navec = Navec.load(Loc.dependencies_path /
                             'navec_news_v1_1B_250K_300d_100q.tar')
     self.syntax = Syntax.load(Loc.dependencies_path /
                               'slovnet_syntax_news_v1.tar')
     # Attach the shared embeddings to the syntax parser.
     self.syntax.navec(self.navec)
     self.morph = Morph.load(Loc.dependencies_path /
                             'slovnet_morph_news_v1.tar')
     # Attach the shared embeddings to the morphology tagger.
     self.morph.navec(self.navec)
Ejemplo n.º 3
0
def quantize_(emb, output, subdim, sample, iterations):
    """Product-quantize a GloVe-format text embedding and dump a Navec archive.

    ``emb`` is the input path, ``output`` the destination archive;
    ``subdim``/``sample``/``iterations`` are PQ hyperparameters.
    """
    with open(emb) as file:
        log_info('Load %s', emb)
        words, weights = parse_glove_emb(file)
        log_info(
            'PQ, subdim: %d, sample: %d, iterations: %d',
            subdim, sample, iterations
        )
        # Vocab depends only on the word list, PQ only on the weights.
        vocab = Vocab(words)
        pq = quantize__(weights, subdim, sample, iterations)
        log_info('Dump %s', output)
        Navec(vocab, pq).dump(output)
Ejemplo n.º 4
0
def pack(args):
    """CLI entry point: bundle vocab + PQ dumps into ``navec_<id>.tar``."""
    with open_bin(args.vocab) as file:
        vocab = Vocab.from_file(file)

    with open_bin(args.pq) as file:
        pq = PQ.from_file(file)

    meta = Meta(args.id)
    path = 'navec_%s.tar' % args.id
    log_info('Dumping %s', path)
    Navec(meta, vocab, pq).dump(path)
Ejemplo n.º 5
0
def pack_(vocab, pq, id):
    """Bundle vocab + PQ dump files into ``navec_<id>.tar``.

    ``vocab`` and ``pq`` arrive as file paths and are deliberately rebound
    to the loaded objects below; ``id`` shadows the builtin but is kept for
    caller compatibility.
    """
    meta = Meta(id)

    with open_bin(vocab) as stream:
        vocab = Vocab.from_file(stream)

    with open_bin(pq) as stream:
        pq = PQ.from_file(stream)

    path = 'navec_%s.tar' % id
    log_info('Dumping %s', path)
    Navec(meta, vocab, pq).dump(path)
    def build_annoy_forest(self, navec_path, index_path, dim=300, tree_count=100):
        """Build and save an Annoy index over every word vector in a Navec pack.

        Args:
            navec_path: path to the Navec ``.tar`` archive to load.
            index_path: destination file for the built Annoy index.
            dim: vector dimensionality; defaults to 300, matching the
                standard navec packs (previously hard-coded).
            tree_count: number of Annoy trees; more trees improve recall at
                the cost of a bigger index (previously hard-coded at 100).
        """
        # Load the navec vector file and extract its vocabulary.
        navec = Navec.load(navec_path)
        vocabulary = navec.vocab.words

        forest = AnnoyIndex(dim, 'angular')

        # Item id == word position in the vocabulary list.
        for i, word in enumerate(vocabulary):
            forest.add_item(i, navec[word])

        forest.build(tree_count)
        forest.save(index_path)
Ejemplo n.º 7
0
def _fetch_model(models_path: str, filename: str, url: str) -> str:
    # Download url into models_path/filename on first use; return the local path.
    path = os.path.join(models_path, filename)
    if not os.path.isfile(path):
        wget.download(url, path)
    return path


def load_ner(models_path: str) -> NER:
    """Load and initialise the Slovnet NER model.

    Args:
        models_path (str): directory holding (or receiving) the model packs;
            created if missing.

    Returns:
        slovnet.NER: NER model with navec embeddings attached.
    """
    os.makedirs(models_path, exist_ok=True)
    navec_path = _fetch_model(
        models_path,
        'navec_news_v1_1B_250K_300d_100q.tar',
        'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar')
    ner_path = _fetch_model(
        models_path,
        'slovnet_ner_news_v1.tar',
        'https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_ner_news_v1.tar')
    navec = Navec.load(navec_path)
    ner = NER.load(ner_path)
    ner.navec(navec)
    return ner
Ejemplo n.º 8
0
    def shop_name(self) -> str:
        """Return the first ORG entity found in ``self.text``, or '' if none.

        The trailing punctuation/dash characters are stripped from the match.
        The original code issued ``del navec; del ner`` before every return;
        those were redundant -- locals are released when the method returns.
        """
        navec = Navec.load(constants.navec_file)
        ner = NER.load(constants.ner_file)
        ner.navec(navec)

        try:
            markup = ner(self.text)
        except IndexError:
            # slovnet occasionally raises IndexError on unusual inputs
            # (cause unknown to the original author); treat as "no org found".
            return ""

        for span in markup.spans:
            if span.type == 'ORG':
                return self.text[span.start:span.stop].strip(".,;!:-–—/ ")

        return ""
Ejemplo n.º 9
0
 def __init__(self, is_elmo_used=False):
     """Set up the text-processing pipeline: config, parser, lemmatizer,
     stop words, and Navec/Slovnet syntax models.

     Args:
         is_elmo_used: when true, also load the ELMo model weights from the
             path given in the config.
     """
     self.config = get_config('config.yml')
     self.parser = ConsultantPlusParser(config=self.config)
     self.model = ElmoModel()
     self.mystem = Mystem()
     # Punctuation plus common non-breaking/typographic characters to strip.
     self.spec_chars = string.punctuation + '\n\xa0«»\t—…'
     self.stop_words = stopwords.words("russian")
     # Domain-specific additions to the NLTK Russian stop-word list
     # (legal-text boilerplate, digits, abbreviations).
     self.stop_words.extend([
         'и',
         'в',
         'на',
         'n',
         'рф',
         'гк',
         'юридического',
         ' ',
         '1',
         'ред',
         '2',
         'ст',
         'также',
         'свой',
         'либо',
         'это',
         'текст',
         'закон',
         'который',
         'иной',
         'год',
         'мочь',
     ])
     if is_elmo_used:
         self.model.load(self.config['model_info_file'])
     self.navec = Navec.load(self.config['navec_news_v1_1B_250K_300d_100q'])
     self.syntax = Syntax.load(self.config['slovnet_syntax_news_v1'])
     # Attach the shared embeddings to the syntax parser.
     self.syntax.navec(self.navec)
Ejemplo n.º 10
0
def test_dump_load(emb):
    """Round-trip smoke test: dumping then reloading the fixture must not raise."""
    with NamedTemporaryFile() as tmp:
        emb.dump(tmp.name)
        Navec.load(tmp.name)
Ejemplo n.º 11
0
from aiohttp import web

from navec import Navec
from slovnet import Morph

# Server configuration, overridable via environment variables.
NAVEC = getenv('NAVEC', 'navec.tar')
PACK = getenv('PACK', 'pack.tar')
BATCH_SIZE = int(getenv('BATCH_SIZE', 8))

HOST = getenv('HOST', '0.0.0.0')
PORT = int(getenv('PORT', 8080))
MB = 1024 * 1024
# Maximum accepted request body size, in bytes (default 100 MB).
MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB))

# Models are loaded once at startup and shared across requests.
# NOTE(review): `getenv` and `log` are presumably imported above this
# chunk -- confirm against the full file.
log('Load navec: %r' % NAVEC)
navec = Navec.load(NAVEC)

log('Load pack: %r' % PACK)
log('Batch size: %r' % BATCH_SIZE)
morph = Morph.load(PACK)
morph.navec(navec)


async def handle(request):
    chunk = await request.json()
    log('Post chunk size: %r' % len(chunk))
    markups = list(morph.map(chunk))

    tags = sum(len(_.tags) for _ in markups)
    log('Infer tags: %r', tags)
Ejemplo n.º 12
0
 def __init__(self, path):
     """Load a Navec pack from *path* and initialise this subclass in place.

     NOTE(review): relies on ``Navec.load(path)`` being unpackable into
     ``(meta, vocab, pq)`` -- confirm against the navec version in use.
     """
     meta, vocab, pq = Navec.load(path)
     Navec.__init__(self, meta, vocab, pq)
Ejemplo n.º 13
0
def navec():
    """Fetch (if necessary) and load the standard news Navec pack."""
    url = ('https://storage.yandexcloud.net/natasha-navec/packs/'
           'navec_news_v1_1B_250K_300d_100q.tar')
    return Navec.load(download(url))
Ejemplo n.º 14
0
from pprint import pprint

from mpl_toolkits.mplot3d import Axes3D
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import gensim.downloader as api
import pandas as pd

from games import RED, BLUE, GREY, game

from navec import Navec
# Load the hudlit Navec pack and expose it through the gensim KeyedVectors API.
path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
model = Navec.load(path).as_gensim
# model = api.load("word2vec-ruscorpora-300")


def _(model, name, noun=True):
    """Resolve *name* to its unique vocabulary key in *model*.

    A vocabulary word matches when its lemma (the text before the first
    underscore) equals the lowercased *name*.  With ``noun=True``, only
    untagged words or words tagged ``NOUN`` qualify.

    Raises:
        KeyError: if zero or more than one candidate key matches
            (args: the name and the candidate list).
    """
    target = name.lower()
    candidates = []
    for word in model.index2word:
        # Split once per word instead of up to three times as before.
        parts = word.split('_')
        if parts[0] != target:
            continue
        if noun and len(parts) > 1 and parts[1] != 'NOUN':
            continue
        candidates.append(word)
    # The original raised the same KeyError separately for 0 and >1 matches.
    if len(candidates) != 1:
        raise KeyError(name, candidates)
    return candidates[0]
Ejemplo n.º 15
0
 def _load(self):
     """Read the Navec pack at ``self.path`` and wrap it with this model's stats."""
     raw_pack = Navec.load(self.path)
     return NavecModel(raw_pack, self.stats)
Ejemplo n.º 16
0
    record, k = get_random_record(records)
    markup = ner(record.text)
    print('This is ' + tp.BOLD + tp.RED + f'{k}' + tp.END + ' record\n')
    show_markup(markup.text, markup.spans)


def test_on_k_random_records(K):
    """Run NER over K randomly chosen lenta records and print their markups.

    Relies on module-level globals: ``lenta_path``, ``N`` (corpus size),
    ``ner``, ``tp`` (terminal colors), ``load_lenta``, ``get_k_record``,
    ``show_markup``.
    """
    records = load_lenta(lenta_path)
    # random.choices indexes its population, so a range works as well as
    # the materialized list the original built.
    chosen_records_num = random.choices(range(N), k=K)
    my_records = [get_k_record(records, i) for i in chosen_records_num]

    print('This is ' + tp.BOLD + tp.RED + f'{chosen_records_num}' + tp.END +
          ' records\n')

    for num, record in zip(chosen_records_num, my_records):
        print(tp.BOLD + tp.RED + f'{num}' + tp.END + '\t')
        markup = ner(record.text)
        show_markup(markup.text, markup.spans)
        print('\n--------------------------\n\n')


if __name__ == '__main__':
    print()
    # Load embeddings and the NER model once, then run both smoke tests.
    navec = Navec.load(navec_path)
    ner = NER.load(ner_path)
    ner.navec(navec)
    test_on_random_record()
    test_on_k_random_records(5)
Ejemplo n.º 17
0
def test_integration(tmpdir):
    """End-to-end NER pipeline test: build, train briefly, tag, pack to disk,
    reload the pack, and check the reloaded model reproduces the same markup.
    """
    # Fixed seed for reproducible training.
    torch.manual_seed(1)
    device = get_device()

    navec = Navec.load(NAVEC)

    # --- Vocabularies and embedding layers -------------------------------
    words_vocab = WordsVocab(navec.vocab.words)
    shapes_vocab = ShapesVocab(SHAPES)
    tags_vocab = TagsVocab([PER, LOC, ORG])

    word_emb = NavecEmbedding.from_navec(navec)
    shape_emb = ShapeEmbedding(
        vocab_size=len(shapes_vocab),
        dim=10,
        pad_id=shapes_vocab.pad_id
    )
    word_model = WordModel(
        word_emb,
        shape_emb
    )
    context_model = CNNContextModel(
        input_dim=word_model.dim,
        layer_dims=[64, 32],
        kernel_size=3,
    )
    tag_model = CRFTagModel(
        input_dim=context_model.dim,
        tags_num=len(tags_vocab)
    )
    ner_model = NERModel(
        word_model,
        context_model,
        tag_model
    ).to(device)

    # --- Data: tiny disjoint train/test slices ---------------------------
    dataset = NerusDataset(NERUS)
    test_dataset = dataset.slice(0, 10)
    train_dataset = dataset.slice(10, 30)

    token_encoder = StackEncoder([
        WordEncoder(words_vocab),
        ShapeEncoder(shapes_vocab)
    ])
    markup_encoder = MarkupEncoder(
        token_encoder,
        TagEncoder(tags_vocab)
    )

    tokenizer = Tokenizer()
    batch_encoder = BatchEncoder(
        tokenizer,
        markup_encoder,
        seq_len=100,
        batch_size=32,
        shuffle_buffer_size=256,
    )

    # Materialize batches up front so train/eval below is deterministic.
    test_batches = [_.to(device) for _ in batch_encoder.map(test_dataset)]
    train_batches = [_.to(device) for _ in batch_encoder.map(train_dataset)]

    # --- Score boards under the pytest tmpdir ----------------------------
    path = str(tmpdir.mkdir('root'))
    board = Board('01', path)
    train_board = board.prefixed('01_train')
    test_board = board.prefixed('02_test')

    # --- Train one pass and record scores --------------------------------
    optimizer = optim.Adam(
        ner_model.parameters(),
        lr=0.001
    )
    proced = train_model(
        ner_model, optimizer,
        train_batches
    )
    scores = eval_batches(tags_vocab, proced)
    score = avg_batch_scores(scores)
    train_board.add_batch_score(score)

    proced = infer_model(
        ner_model,
        test_batches
    )
    scores = eval_batches(tags_vocab, proced)
    score = avg_batch_scores(scores)
    test_board.add_batch_score(score)

    # --- Tag a reference text with the in-memory model -------------------
    ner_model.eval()
    tagger = NERTagger(
        tokenizer,
        token_encoder,
        tags_vocab,
        ner_model,
        device
    )
    markup1 = tagger(TEXT)

    # --- Pack to disk, reload, and tag the same text again ---------------
    pack = ner_model.as_infer.pack('slovnet_ner_v1')
    path = str(tmpdir.join('slovnet_ner_v1.tar'))
    pack.dump(path)

    pack = Pack.load(path)
    pack.context.navec = InferNavecEmbedding.from_navec(navec)
    ner_model = pack.scheme.to_impl(pack.context)

    tagger = InferNERTagger(
        tokenizer,
        token_encoder,
        tags_vocab,
        ner_model
    )
    markup2 = tagger(TEXT)

    # Round-tripping through the pack must not change predictions.
    assert markup1 == markup2
Ejemplo n.º 18
0
    '\nКогда вам надоест со мной говорить, скажите "выход".')
TEXT_FAREWELL = 'Всего доброго! Если захотите повторить, скажите "Алиса, включи навык тест tgalice".'

if __name__ == '__main__':
    mongo_url = os.environ.get('MONGODB_URI')
    if mongo_url:
        mongo_client = MongoClient(mongo_url)
        mongo_db = mongo_client.get_default_database()
    else:
        mongo_client = mongomock.MongoClient()
        mongo_db = mongo_client.db
    mongo_logs = mongo_db.get_collection('message_logs')

    prerelease.download_if_not_exists(prerelease.navec_url,
                                      prerelease.navec_file)
    w2v = Navec.load(prerelease.navec_file)

    manager = tgalice.dialog_manager.CascadeDialogManager(
        tgalice.dialog_manager.FAQDialogManager(
            'faq.yaml', matcher=tgalice.nlu.matchers.W2VMatcher(w2v=w2v)),
        tgalice.dialog_manager.GreetAndHelpDialogManager(
            greeting_message=TEXT_HELP,
            help_message=TEXT_HELP,
            default_message='Я вас не понимаю.',
            exit_message='Всего доброго! Было приятно с вами пообщаться!'))
    connector = tgalice.dialog_connector.DialogConnector(
        dialog_manager=manager,
        storage=tgalice.session_storage.MongoBasedStorage(
            database=mongo_db, collection_name='sessions'),
        log_storage=tgalice.storage.message_logging.MongoMessageLogger(
            database=mongo_db, detect_pings=True)
Ejemplo n.º 19
0
 def __init__(self, metricType: str = "tf"):
     """Remember the metric type; embedding metrics also need the Navec model."""
     self._metric = metricType
     # Only the embedding-based metrics require the (expensive) Navec load.
     if self._metric in ('emb', 'embm'):
         self._embModel = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
Ejemplo n.º 20
0
import pandas as pd
from navec import Navec
import aiohttp
from aiogram import Bot, Dispatcher, executor, types

# import from other modules
from functions import get_groups, sort_list, check_value
from DB import read_list, insert_into_list, drop, delete_from_list
from config import token, ids

logging.basicConfig(level=logging.INFO)

# Telegram bot wiring; `token` comes from config.py.
bot = Bot(token=token)
dp = Dispatcher(bot)

# Embeddings are loaded once at import time and shared by all handlers.
navec = Navec.load('navec_hudlit_v1_12B_500K_300d_100q.tar')


def auth(func):
    """Decorator: only users whose Telegram id is in `ids` may run *func*."""
    async def wrapper(message):
        sender_id = message['from']['id']
        if sender_id in ids:
            return await func(message)
        return await message.reply("Доступ закрыт", reply=False)

    return wrapper


@dp.message_handler(commands=['start']
                    )  # Diplays the message with the user id.
async def welcome(message: types.Message):
    await message.answer('Ваш id ' + str(message['from']['id']) + '\n' +
Ejemplo n.º 21
0
#
import numpy as np
import re
import pymorphy2
from razdel import tokenize
from navec import Navec
from pyaspeller import Word
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score
from engine import *
import config
import argparse

if __name__ == '__main__':
    # Classify a single message passed on the command line.
    parser = argparse.ArgumentParser(description="Run topic prediction script")
    parser.add_argument("-m",
                        "--message",
                        type=str,
                        help="Past your message to classify",
                        default="мне нужно посетить врача")
    args = parser.parse_args()
    morph = pymorphy2.MorphAnalyzer()
    w2v = Navec.load(config.path)
    # Precompute keyword vectors once, then score the normalized message
    # against them and map the predicted label back to its topic name.
    keyword_matrix = prepare_keyword_vectors(config.keywords, w2v)
    processed_text = preprocess(args.message, morph, config.normalized)
    print(config.inv_mapping[predict_label(processed_text, keyword_matrix,
                                           w2v)])
Ejemplo n.º 22
0
from navec import Navec
from numpy import linalg as ln
import numpy as np
import time
import json
import tensorflow as tf
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from random import random
from slovnet.model.emb import NavecEmbedding
import torch

# Load the news Navec pack once at import time.
path = 'navec_news_v1_1B_250K_300d_100q.tar'
navec = Navec.load(path)
# emb = NavecEmbedding(navec)

# print(ln.norm(navec['каспийский'] - navec['море']))
# navec.vocab

# path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# navec = Navec.load(path)

# sentences1 = [
#                 [[0.1, 0.3,0.7,0.8,0.9], [0.1, 0.12,0.15,0.21,0.24],
#                 [0.1, 0.3,0.7,0.8,0.9], [0.1, 0.3,0.7,0.8,0.9],
#                 [0.1, 0.3,0.7,0.8,0.9]],

#                 [[0.01, 0.08,0.12,0.13,0.13], [0.58, 0.59,0.63,0.63,0.64],
#                 [0.1, 0.3,0.7,0.8,0.9],
#                 [0.2, 0.22,0.23,0.25,0.28]],
Ejemplo n.º 23
0
 def __init__(self, model_path, vector_model_path):
     """Load the NER model from *model_path* and attach the Navec embeddings
     loaded from *vector_model_path*."""
     embeddings = Navec.load(vector_model_path)
     ner_model = NER.load(model_path)
     ner_model.navec(embeddings)
     self.model = ner_model
'''
import spacy
import pandas as pd
from razdel import sentenize, tokenize
from navec import Navec
from slovnet import Syntax
from slovnet import Morph
from pymystem3 import Mystem
class Tokens:
    """Minimal spaCy-like token record: surface text, lemma, and POS tag.

    Attribute names mirror spaCy's ``Token`` (``lemma_``, ``pos_``) so the
    rest of the pipeline can treat both interchangeably.
    """

    def __init__(self, text, lemma_, pos_):
        self.text = text
        self.lemma_ = lemma_
        self.pos_ = pos_

m = Mystem()
# Shared pipeline objects: embeddings plus a morphology tagger (batches of 4).
navec = Navec.load('navec_news_v1_1B_250K_300d_100q.tar')
morph = Morph.load('slovnet_morph_news_v1.tar', batch_size=4)
morph.navec(navec)

# Sample Russian news text used by the demo pipeline below.
text="Европейский союз добавил в санкционный список девять политических деятелей из самопровозглашенных республик Донбасса — Донецкой народной республики (ДНР) и Луганской народной республики (ЛНР) — в связи с прошедшими там выборами. Об этом говорится в документе, опубликованном в официальном журнале Евросоюза."
def nlp(text):
    chunks=[]
    lemmaSent=[]
    Doc=[]
    for sent in sentenize(text):
        tokens = [_.text for _ in tokenize(sent.text)]
        chunks.append(tokens)

    for chunk in chunks:
        filteredChunk=list(filter(lambda a: a != ' ', chunk))
        markup = next(morph.map([filteredChunk]))
Ejemplo n.º 25
0
 def __init__(self, path_to_navec_data, path_to_syntax_data):
     """Load Navec embeddings and a Syntax parser bound to them.

     NOTE(review): assumes ``Syntax.load(...).navec(...)`` returns the
     parser itself (fluent API), as the original assignment implies.
     """
     embeddings = Navec.load(path_to_navec_data)
     self.navec = embeddings
     self.syntax = Syntax.load(path_to_syntax_data).navec(embeddings)
Ejemplo n.º 26
0
import time
import os.path

from django.conf import settings

from navec import Navec
from numpy import dot
from numpy.linalg import norm
from annoy import AnnoyIndex


NAVEC_PATH = os.path.join(settings.ROOT_DIR, "parser_tool", "data", "navec_hudlit_v1_12B_500K_300d_100q.tar")
ANNOY_INDEX_PATH = os.path.join(settings.ROOT_DIR, "parser_tool", "data", "ANNOY_tree.ann")

# Load the embedding pack once at import time and index its vocabulary.
navec = Navec.load(NAVEC_PATH)
vocabulary = navec.vocab.words
# word -> vocabulary position; dict comprehension replaces the manual loop.
word_to_index = {word: i for i, word in enumerate(vocabulary)}

lsh = None # global that holds the Annoy tree

def load_annoy_index():
    ''' 
    Lazy load the Annoy LSH tree.

    This is wrapped in a function to avoid import errors if the index file
    is not present at that time.
    '''
    global lsh
    if lsh is None:
Ejemplo n.º 27
0
def test_ner_tagger():
    """End-to-end check: extracted span texts must equal the etalon list."""
    navec = Navec.load(NAVEC)
    tagger = NERTagger.load(SLOVNET, navec)
    markup = tagger(TEXT)
    extracted = [TEXT[span.start:span.stop] for span in markup.spans]
    assert extracted == ETALON