def __init__(self, loader_obj):
    """Wire up the simple-rules classifier.

    Binds the module-level rule functions into a dispatch table, validates
    the loader's model type, lemmatizes the etalon texts once if needed,
    and pre-groups (tag, rule, text, coeff) tuples by tag.
    """
    # Dispatch table: rule name -> module-level rule function.
    self.functions_dict = {
        "include_and": include_and,
        "include_or": include_or,
        "exclude_and": exclude_and,
        "exclude_or": exclude_or,
        "include_str": include_str,
        "include_str_p": include_str_p,
        "exclude_str_p": exclude_str_p,
        "intersec_share": intersec_share,
    }
    self.model = loader_obj
    assert self.model.model_type == "simple_rules", "тип модели не соответствует классу SimpleRules"
    self.tokenizer = TokenizerApply(self.model)

    app_field = self.model.application_field
    # Lemmatize the etalon texts exactly once, unless the loader
    # already ships them lemmatized.
    if not self.model.is_etalons_lemmatize:
        app_field["texts"] = self.tokenizer.texts_processing(app_field["texts"])

    # One (tag, rule, text, coeff) tuple per etalon.
    self.model_params = list(zip(
        app_field["tags"],
        app_field["rules"],
        app_field["texts"],
        app_field["coeff"],
    ))
    # Grouping rules that share the same tag.
    self.model_params_grouped = model_params_grouped(self.model_params)
def init():
    """Bootstrap the service: load pickled models, lemmatize the fragment
    selection patterns, and publish the pubid -> model registry via globals.
    """
    models_rout = os.path.dirname(__file__)

    # Registry keyed by model name; values are replaced by the unpickled
    # loader objects below.  Per pubid we later attach the classifying model
    # and the tag -> answer-id (module) links.
    models_dict = {"simple_rules_model": None}
    for model_name in models_dict:
        pickle_path = os.path.join(
            models_rout, 'models/tax_tags', str(model_name) + ".pickle")
        # NOTE(review): pickle.load on a file — make sure these model files
        # are trusted artifacts, pickle is not safe on untrusted input.
        with open(pickle_path, "br") as f:
            models_dict[model_name] = pickle.load(f)

    # Lemmatizer used both for the patterns and handed out per pubid.
    tknz = TokenizerApply(Loader(models_dict["simple_rules_model"]))

    global pattern1, pattern2
    # Lemmatized patterns for selecting text fragments.
    pattern1 = tknz.texts_processing(["в ходе проведения"])[0]
    pattern2 = tknz.texts_processing(["В течение 5 <4> рабочих дней"])[0]

    # Models that are later used for the different pubids.
    model_1 = ModelsChain([(SimpleRules, models_dict["simple_rules_model"])])

    global pub_models
    pub_models = {
        1: {"model": model_1, "tag_answ_link": None, "tokenizer": tknz},
    }
def __init__(self, loader_obj):
    """Tokenize the loader's etalons and build a gensim Similarity index
    over their vectors for subsequent nearest-etalon queries.
    """
    self.model_types = [("lsi", None)]
    self.model = loader_obj
    self.tknz = TokenizerApply(self.model)
    self.tkz_model = self.tknz.model_tokenize()

    app_field = self.tkz_model.application_field
    self.et_vectors = app_field["texts"]
    self.coeffs = app_field["coeff"]
    self.tags = app_field["tags"]

    # First argument (output prefix) is None — assumes gensim picks
    # temporary storage for the index shards; TODO confirm.
    self.index = Similarity(
        None,
        self.et_vectors,
        num_features=self.model.texts_algorithms["num_topics"],
    )
def __init__(self, loader_obj):
    """Build the rule-name -> bound-method dispatch table and tokenize
    the model for the simple-rules classifier.
    """
    self.model_types = [("simple_rules", None)]
    # The dict keys must equal the method names, so derive them via
    # __name__ instead of spelling each string out twice.
    rule_methods = (
        self.include_and,
        self.include_or,
        self.exclude_and,
        self.exclude_or,
        self.include_str,
        self.include_str_p,
        self.exclude_str_p,
        self.intersec_share,
    )
    self.functions_dict = {fn.__name__: fn for fn in rule_methods}

    self.model = loader_obj
    self.tokenizer = TokenizerApply(self.model)
    self.tknz_model = self.tokenizer.model_tokenize()
def __init__(self, loader_obj):
    """Set up the LSI classifier: reuse a prebuilt similarity index when
    the loader ships one, otherwise vectorize the etalon texts and build
    a MatrixSimilarity index over them.
    """
    self.model = loader_obj
    # BUG FIX: the assertion message was copy-pasted from the SimpleRules
    # class and wrongly blamed a SimpleRules mismatch; this class requires
    # an "lsi" model, so the message now says so.
    assert self.model.model_type == "lsi", "тип модели не соответствует классу Lsi"
    self.tknz = TokenizerApply(self.model)

    if 'index' in self.model.texts_algorithms:
        # A ready-made index was serialized with the model — reuse it.
        self.index = self.model.texts_algorithms['index']
    else:
        # Vectorize the etalon texts and build the index from scratch.
        # (et_vectors is intentionally only set on this branch, matching
        # the original behavior.)
        self.et_vectors = self.tknz.texts_processing(
            self.model.application_field["texts"])
        self.index = MatrixSimilarity(
            self.et_vectors,
            num_features=self.model.texts_algorithms["num_topics"])

    self.coeffs = self.model.application_field["coeff"]
    self.tags = self.model.application_field["tags"]
"lingvo": [{"synonyms": [[]], "tokenize": False}, {"ngrams": [[]], "tokenize": False}, {"stopwords": [[]], "tokenize": False}, {"workwords": [[]], "tokenize": False}], "classificator_algorithms": {}, "texts_algorithms": {}, "tokenizer": "SimpleTokenizer"} with open(os.path.join(models_rout, "simplest_model.pickle"), "bw") as f: pickle.dump(model, f) """ with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f: model = pickle.load(f) tzapl = TokenizerApply(Loader(model)) # tx = "вчера нам пожелали доброго вечера 345 раз" tz_txs = tzapl.texts_processing(train_df["words"]) print(tz_txs[:10]) print(len(tz_txs)) # подготовка списка синонимов: """ stopwords_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'stopwords.csv')) lingv_rules_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'lingv_rules.csv')) ngrams_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'ngrams.csv')) texts_collection_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'texts_collection.tsv'), sep = '\t') rl_ans_df = pd.read_csv(os.path.join(data_rout, 'bss_data', "rules_answers.csv")) test_acc_df = pd.read_csv(os.path.join(data_rout, 'bss_data', "test_accuracy.csv"))
print(quests50th_df[:100])

etalons_df = pd.read_csv(
    os.path.join(data_rout, "kosgu_data", "lingv_rules.csv"))
print(etalons_df["words"][:100])
print(etalons_df.shape)

# Training corpus: the 50th-percentile questions concatenated with the
# etalon phrases, stacked into a single one-column frame.
train_df = pd.DataFrame(
    pd.concat([quests50th_df["words"], etalons_df["words"]], axis=0))
print('\n', train_df)
print(train_df.shape)

with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f:
    model = pickle.load(f)

tknz_txts = TokenizerApply(Loader(model))
tz_txs = tknz_txts.texts_processing(list(train_df["words"]))
print(tz_txs[:10])
print(len(tz_txs))

# Preparing the synonym list:
stopwords_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'stopwords.csv'))
lingv_rules_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'lingv_rules.csv'))
ngrams_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'ngrams.csv'))

sinonims_files = ['01_sinonims.csv', '02_sinonims.csv']
synonyms = []
# Quick-answers test harness: the original questions the network was trained
# on serve as etalons; we measure how well it picks questions similar to them
# out of the incoming stream.
import os
import pickle
import time

import pandas as pd

from utility import Loader
from texts_processors import TokenizerApply

# Data file locations:
tokenize_path = r'./tokenize_model'
test_path = r'./test'

with open(os.path.join(tokenize_path, "tokenizator_model.pickle"), "rb") as f:
    tokenize_model = pickle.load(f)
tokenize_loader = Loader(tokenize_model)
tknz = TokenizerApply(tokenize_loader)

# Load the incoming questions.
df_data = pd.read_csv(os.path.join(test_path, "ндс_прибыль_5000.csv"))
df_data.rename(columns={"0": "text"}, inplace=True)

# Vocabulary the neural network "knows".
work_dict_df = pd.read_csv(os.path.join(test_path, "dictionary_work.csv"))
work_dict_list = list(work_dict_df["token"])
print(work_dict_list)

# Etalons (the original queries the network was trained on).
df_etalons = pd.read_csv(os.path.join(test_path, "etalons.csv"))
# NOTE(review): the line below immediately discards the etalons.csv content
# and uses the incoming questions as etalons — confirm this is intentional.
df_etalons = df_data

tktxs = tknz.texts_processing(df_data["text"])
import os, pickle import pandas as pd import random from texts_processors import TokenizerApply from utility import Loader data_rout = r'./data' models_rout = r'./models' with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f: lingv_model = pickle.load(f) tk_appl = TokenizerApply(Loader(lingv_model)) data_df = pd.read_csv(os.path.join(data_rout, "data_group_01.csv")) lemm_txts_l = tk_appl.texts_processing(list(data_df['text'])) lemm_txts_df = pd.DataFrame(list(zip([" ".join(x) for x in lemm_txts_l], data_df['group']))) lemm_txts_df.rename(columns={0: 'text', 1: 'group'}, inplace=True) print(lemm_txts_df) lemm_txts_df.to_csv(os.path.join(data_rout, "lemm_data_group_01.csv"), index=False, columns=['text', 'group']) df = pd.read_csv(os.path.join(data_rout, "lemm_data_group_01.csv")) print(df) # герерация пар семантически одинаковых вопросов lbs = set(df['group']) results_tuples = [] for lb in lbs: work_list = list(df['text'][df['group'] == lb]) for tx1 in work_list: for tx2 in work_list:
def __init__(self, loader_obj):
    """Keep the loader object and a tokenized copy of its model for the
    siamese LSTM / doc2vec classifier.
    """
    self.model_types = [("siamese_lstm_d2v", None)]
    self.model = loader_obj
    self.tknz = TokenizerApply(loader_obj)
    self.tkz_model = self.tknz.model_tokenize()