Code example #1
    def __init__(self, loader_obj):
        self.functions_dict = {
            "include_and": include_and,
            "include_or": include_or,
            "exclude_and": exclude_and,
            "exclude_or": exclude_or,
            "include_str": include_str,
            "include_str_p": include_str_p,
            "exclude_str_p": exclude_str_p,
            "intersec_share": intersec_share
        }
        self.model = loader_obj
        assert self.model.model_type == "simple_rules", "model type does not match the SimpleRules class"

        self.tokenizer = TokenizerApply(self.model)
        if not self.model.is_etalons_lemmatize:
            self.model.application_field["texts"] = self.tokenizer.texts_processing(
                self.model.application_field["texts"])

        self.model_params = list(
            zip(self.model.application_field["tags"],
                self.model.application_field["rules"],
                self.model.application_field["texts"],
                self.model.application_field["coeff"]))

        # grouping rules with the same tag
        self.model_params_grouped = model_params_grouped(self.model_params)
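
The model_params_grouped helper used above is not shown in these snippets. A minimal sketch of what such a grouping step could look like, assuming it simply collects the (rule, text, coeff) triples that share a tag; the actual implementation may differ:

from collections import defaultdict

def model_params_grouped(model_params):
    # assumed behavior: bucket (tag, rule, text, coeff) tuples by their tag
    grouped = defaultdict(list)
    for tag, rule, text, coeff in model_params:
        grouped[tag].append((rule, text, coeff))
    return dict(grouped)
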
Code example #2
def init():
    # spin up the various models and other resources here
    models_rout = os.path.dirname(__file__)

    # load the models for each pubid and build a dictionary where the pubids are the keys
    # and the values are the classifying models plus the tag-to-answer-id (and module) mappings
    models_dict = {"simple_rules_model": None}

    # load the models and turn them into loader_obj right away
    for model_name in models_dict:
        with open(
                os.path.join(models_rout, 'models/tax_tags',
                             str(model_name) + ".pickle"), "br") as f:
            model = pickle.load(f)
            models_dict[model_name] = model

    # load the lemmatizer for the patterns:
    tknz = TokenizerApply(Loader(models_dict["simple_rules_model"]))

    global pattern1, pattern2
    # lemmatize the patterns used to select fragments
    pattern1 = tknz.texts_processing(["в ходе проведения"])[0]
    pattern2 = tknz.texts_processing(["В течение 5 <4> рабочих дней"])[0]
    """определение моделей, которые потом используются для разных pubid"""
    model_1 = ModelsChain([(SimpleRules, models_dict["simple_rules_model"])])

    global pub_models
    pub_models = {
        1: {
            "model": model_1,
            "tag_answ_link": None,
            "tokenizer": tknz
        }
    }
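
A hypothetical illustration of how the pub_models registry built in init() might be consulted when a request arrives; the handle_request name is an assumption, while pub_models and texts_processing come from the snippet above:

def handle_request(pubid, text):
    # hypothetical entry point: pick the bundle registered for this pubid
    bundle = pub_models[pubid]
    # lemmatize the incoming text with the tokenizer stored next to the model
    lemmatized = bundle["tokenizer"].texts_processing([text])[0]
    return bundle["model"], lemmatized
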
Code example #3
    def __init__(self, loader_obj):
        self.model_types = [("lsi", None)]
        self.model = loader_obj
        self.tknz = TokenizerApply(self.model)
        self.tkz_model = self.tknz.model_tokenize()
        self.et_vectors = self.tkz_model.application_field["texts"]
        self.coeffs = self.tkz_model.application_field["coeff"]
        self.tags = self.tkz_model.application_field["tags"]
        # similarity index over the etalon vectors; with output_prefix=None
        # gensim keeps the index shards in a temporary location
        self.index = Similarity(
            None,
            self.et_vectors,
            num_features=self.model.texts_algorithms["num_topics"])
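
A hedged sketch of how the Similarity index built above might be queried; the rules_apply name and the thresholding against coeff are assumptions, only index, tags and coeffs come from the constructor:

    def rules_apply(self, vectors):
        # gensim's Similarity returns one score per etalon vector for each query
        results = []
        for vec in vectors:
            sims = self.index[vec]
            # keep the tags whose similarity clears that etalon's coefficient
            results.append([tag for tag, coeff, sim
                            in zip(self.tags, self.coeffs, sims) if sim >= coeff])
        return results
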
Code example #4
    def __init__(self, loader_obj):
        self.model_types = [("simple_rules", None)]
        # dispatch table mapping rule names to the methods that implement them
        self.functions_dict = {
            "include_and": self.include_and,
            "include_or": self.include_or,
            "exclude_and": self.exclude_and,
            "exclude_or": self.exclude_or,
            "include_str": self.include_str,
            "include_str_p": self.include_str_p,
            "exclude_str_p": self.exclude_str_p,
            "intersec_share": self.intersec_share
        }
        self.model = loader_obj
        self.tokenizer = TokenizerApply(self.model)
        self.tknz_model = self.tokenizer.model_tokenize()
Code example #5
    def __init__(self, loader_obj):
        self.model = loader_obj
        assert self.model.model_type == "lsi", "model type does not match the LSI classifier"

        self.tknz = TokenizerApply(self.model)
        if 'index' in self.model.texts_algorithms:
            self.index = self.model.texts_algorithms['index']
        else:
            self.et_vectors = self.tknz.texts_processing(
                self.model.application_field["texts"])
            self.index = MatrixSimilarity(
                self.et_vectors,
                num_features=self.model.texts_algorithms["num_topics"])

        self.coeffs = self.model.application_field["coeff"]
        self.tags = self.model.application_field["tags"]
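
For the cached branch above to trigger, the pickled model has to carry a prebuilt index. A hedged sketch of how that might be done on the training side (etalon_vectors stands for the already tokenized etalon texts, and the file name is an assumption):

from gensim.similarities import MatrixSimilarity

# hypothetical one-off step: build the index and store it inside the model dict
index = MatrixSimilarity(etalon_vectors,
                         num_features=model["texts_algorithms"]["num_topics"])
model["texts_algorithms"]["index"] = index
with open(os.path.join(models_rout, "lsi_model.pickle"), "bw") as f:
    pickle.dump(model, f)
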
Code example #6
        "lingvo": [{"synonyms": [[]], "tokenize": False},
        {"ngrams": [[]], "tokenize": False},
        {"stopwords": [[]], "tokenize": False},
        {"workwords": [[]], "tokenize": False}],
        "classificator_algorithms": {},
        "texts_algorithms": {},
        "tokenizer": "SimpleTokenizer"}

with open(os.path.join(models_rout, "simplest_model.pickle"), "bw") as f:
    pickle.dump(model, f)
"""

with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f:
    model = pickle.load(f)

tzapl = TokenizerApply(Loader(model))
# tx = "вчера нам пожелали доброго вечера 345 раз"

tz_txs = tzapl.texts_processing(train_df["words"])
print(tz_txs[:10])
print(len(tz_txs))

# prepare the synonym list:
"""
stopwords_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'stopwords.csv'))
lingv_rules_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'lingv_rules.csv'))
ngrams_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'ngrams.csv'))
texts_collection_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'texts_collection.tsv'), sep = '\t')
rl_ans_df = pd.read_csv(os.path.join(data_rout, 'bss_data', "rules_answers.csv"))
test_acc_df = pd.read_csv(os.path.join(data_rout, 'bss_data', "test_accuracy.csv"))
Code example #7
print(quests50th_df[:100])

etalons_df = pd.read_csv(
    os.path.join(data_rout, "kosgu_data", "lingv_rules.csv"))
print(etalons_df["words"][:100])
print(etalons_df.shape)

train_df = pd.DataFrame(
    pd.concat([quests50th_df["words"], etalons_df["words"]], axis=0))
print('\n', train_df)
print(train_df.shape)

with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f:
    model = pickle.load(f)

tknz_txts = TokenizerApply(Loader(model))
# tx = "вчера нам пожелали доброго вечера 345 раз"

tz_txs = tknz_txts.texts_processing(list(train_df["words"]))
print(tz_txs[:10])
print(len(tz_txs))

# prepare the synonym list:
stopwords_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'stopwords.csv'))
lingv_rules_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'lingv_rules.csv'))
ngrams_df = pd.read_csv(os.path.join(data_rout, 'kosgu_data', 'ngrams.csv'))

sinonims_files = ['01_sinonims.csv', '02_sinonims.csv']
synonyms = []
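
The snippet stops right after initializing the synonyms list. A minimal sketch of how the listed synonym files might be folded into it; the "words" column name is an assumption about the CSV layout:

for file_name in sinonims_files:
    sinonims_df = pd.read_csv(os.path.join(data_rout, 'kosgu_data', file_name))
    # assumed layout: one synonym phrase per row in a "words" column
    synonyms += list(sinonims_df["words"])
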
Code example #8
# Testing for the "quick answers" task: the original questions (which the network was trained on)
# are used as etalons, and we check how well it picks out similar questions from the incoming stream
import os, pickle, time
from utility import Loader
from texts_processors import TokenizerApply
import pandas as pd

# load the data files:
tokenize_path = r'./tokenize_model'
test_path = r'./test'

with open(os.path.join(tokenize_path, "tokenizator_model.pickle"), "rb") as f:
    tokenize_model = pickle.load(f)
    tokenize_loader = Loader(tokenize_model)

tknz = TokenizerApply(tokenize_loader)

# load the questions
df_data = pd.read_csv(os.path.join(test_path, "ндс_прибыль_5000.csv"))
df_data.rename(columns={"0": "text"}, inplace=True)

# load the vocabulary that the neural network "knows"
work_dict_df = pd.read_csv(os.path.join(test_path, "dictionary_work.csv"))
work_dict_list = list(work_dict_df["token"])
print(work_dict_list)

# load the etalons (the original queries the neural network was trained on)
df_etalons = pd.read_csv(os.path.join(test_path, "etalons.csv"))

df_etalons = df_data
tktxs = tknz.texts_processing(df_data["text"])
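
A hedged sketch of one way the tokenized questions could be restricted to the vocabulary loaded from dictionary_work.csv; this filtering step is an assumption, only work_dict_list and tktxs come from the script above:

# hypothetical step: drop tokens the network's vocabulary does not cover
known_tokens = set(work_dict_list)
filtered_txs = [[tok for tok in toks if tok in known_tokens] for toks in tktxs]
print(filtered_txs[:5])
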
Code example #9
import os, pickle
import pandas as pd
import random
from texts_processors import TokenizerApply
from utility import Loader

data_rout = r'./data'
models_rout = r'./models'

with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f:
    lingv_model = pickle.load(f)

tk_appl = TokenizerApply(Loader(lingv_model))
data_df = pd.read_csv(os.path.join(data_rout, "data_group_01.csv"))
lemm_txts_l = tk_appl.texts_processing(list(data_df['text']))
lemm_txts_df = pd.DataFrame(list(zip([" ".join(x) for x in lemm_txts_l], data_df['group'])))
lemm_txts_df.rename(columns={0: 'text', 1: 'group'}, inplace=True)
print(lemm_txts_df)

lemm_txts_df.to_csv(os.path.join(data_rout, "lemm_data_group_01.csv"), index=False, columns=['text', 'group'])
df = pd.read_csv(os.path.join(data_rout, "lemm_data_group_01.csv"))
print(df)


# generate pairs of semantically identical questions
lbs = set(df['group'])
results_tuples = []
for lb in lbs:
    work_list = list(df['text'][df['group'] == lb])
    for tx1 in work_list:
        for tx2 in work_list:
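
The snippet is cut off inside the nested loop. A minimal sketch of how the pair generation could conclude, assuming each pair of distinct texts from the same group is stored as a positive (label 1) example:

            # assumed completion: skip self-pairs, keep same-group pairs as positives
            if tx1 != tx2:
                results_tuples.append((tx1, tx2, 1))

print(results_tuples[:10])
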
Code example #10
    def __init__(self, loader_obj):
        self.model_types = [("siamese_lstm_d2v", None)]
        self.model = loader_obj
        self.tknz = TokenizerApply(self.model)
        self.tkz_model = self.tknz.model_tokenize()