class LsiClassifier(AbstractRules):
    def __init__(self, loader_obj):
        self.model = loader_obj
        assert self.model.model_type == "lsi", "model type does not match the LsiClassifier class"
        self.tknz = TokenizerApply(self.model)
        if 'index' in self.model.texts_algorithms:
            self.index = self.model.texts_algorithms['index']
        else:
            self.et_vectors = self.tknz.texts_processing(
                self.model.application_field["texts"])
            self.index = MatrixSimilarity(
                self.et_vectors,
                num_features=self.model.texts_algorithms["num_topics"])
        self.coeffs = self.model.application_field["coeff"]
        self.tags = self.model.application_field["tags"]

    # note: ideally this method should allow several rules to have different LSI etalons,
    # combined through "and"
    def rules_apply(self, texts):
        text_vectors = self.tknz.texts_processing(texts)
        texts_tags_similarity = []
        for num, text_vector in enumerate(text_vectors):
            trues_list_scores = [(tg, scr, cf) for tg, scr, cf in
                                 zip(self.tags, self.index[text_vector], self.coeffs)
                                 if scr > cf]
            # sort so that the best-matching results (highest score) come first
            trues = [tg for tg, scr, cf in
                     sorted(trues_list_scores, key=lambda x: x[1], reverse=True)]
            texts_tags_similarity.append((num, trues))
        return texts_tags_similarity
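# Self-contained illustration (toy values, not from the source) of the thresholding and
# ranking step inside rules_apply above: tags whose similarity score exceeds their per-tag
# coeff are kept and returned in descending score order.
tags = ["nds_deadline", "ndfl_deadline", "kosgu"]    # hypothetical tags
scores = [0.82, 0.15, 0.44]                          # toy similarity scores from the index
coeffs = [0.30, 0.30, 0.30]                          # per-tag thresholds
trues_list_scores = [(tg, scr, cf) for tg, scr, cf in zip(tags, scores, coeffs) if scr > cf]
print([tg for tg, scr, cf in sorted(trues_list_scores, key=lambda x: x[1], reverse=True)])
# -> ['nds_deadline', 'kosgu']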
class LsiClassifier(AbstractRules):
    def __init__(self, loader_obj):
        self.model_types = [("lsi", None)]
        self.model = loader_obj
        self.tknz = TokenizerApply(self.model)
        self.tkz_model = self.tknz.model_tokenize()
        self.et_vectors = self.tkz_model.application_field["texts"]
        self.coeffs = self.tkz_model.application_field["coeff"]
        self.tags = self.tkz_model.application_field["tags"]
        self.index = Similarity(
            None, self.et_vectors,
            num_features=self.model.texts_algorithms["num_topics"])

    def rules_apply(self, texts):
        text_vectors = self.tknz.texts_processing(texts)
        texts_tags_similarity = []
        for num, text_vector in enumerate(text_vectors):
            trues = [(tg, True) for tg, scr, cf in
                     zip(self.tags, self.index[text_vector], self.coeffs)
                     if scr > cf]
            falses = [(tg, False) for tg, scr, cf in
                      zip(self.tags, self.index[text_vector], self.coeffs)
                      if scr < cf]
            texts_tags_similarity.append((num, trues + falses))
        return texts_tags_similarity
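# Illustration only (toy values): unlike the class above, this rules_apply labels every tag
# with an explicit True/False flag rather than returning only the matched tags ranked by score.
tags = ["nds_deadline", "kosgu"]      # hypothetical tags
scores = [0.82, 0.10]                 # toy similarity scores
coeffs = [0.30, 0.50]                 # per-tag thresholds
trues = [(tg, True) for tg, scr, cf in zip(tags, scores, coeffs) if scr > cf]
falses = [(tg, False) for tg, scr, cf in zip(tags, scores, coeffs) if scr < cf]
print(trues + falses)                 # -> [('nds_deadline', True), ('kosgu', False)]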
class SiameseNnDoc2VecClassifier(AbstractRules):
    def __init__(self, loader_obj):
        self.model_types = [("siamese_lstm_d2v", None)]
        self.model = loader_obj
        self.tknz = TokenizerApply(self.model)
        self.tkz_model = self.tknz.model_tokenize()

    def rules_apply(self, texts):
        text_vectors = self.tknz.texts_processing(texts)
        et_vectors = self.tkz_model.application_field["texts"]
        coeffs = self.tkz_model.application_field["coeff"]
        tags = self.tkz_model.application_field["tags"]
        decisions = []
        vcs_arr = np.array(et_vectors)
        global graph
        graph = tf.get_default_graph()
        for num, text_vector in enumerate(text_vectors):
            # tile the query vector against every etalon so all pairs are scored in one call
            tx_tensor = np.array([text_vector for i in range(vcs_arr.shape[0])])
            tx_tensor = tx_tensor.reshape(vcs_arr.shape[0], vcs_arr.shape[1], 1)
            vcs_arr = vcs_arr.reshape(vcs_arr.shape[0], vcs_arr.shape[1], 1)
            with graph.as_default():
                scores = self.model.classificator_algorithms[
                    "siamese_lstm_model"].predict([tx_tensor, vcs_arr])
            # the network output is treated as a distance, so a smaller score counts as a match
            trues = [(tg, True) for scr, cf, tg in zip(scores, coeffs, tags) if scr < cf]
            falses = [(tg, False) for scr, cf, tg in zip(scores, coeffs, tags) if scr > cf]
            decisions.append((num, trues + falses))
        return decisions
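# Self-contained numpy sketch (illustration only) of the batching done in rules_apply above:
# the query vector is tiled against every etalon vector so the siamese network can score all
# (query, etalon) pairs in a single predict() call. A toy vector size of 4 is used here.
import numpy as np

etalon_vectors = np.random.rand(3, 4)                  # 3 etalons, toy dimension 4
query_vector = np.random.rand(4)                       # one incoming text vector

vcs_arr = etalon_vectors.reshape(3, 4, 1)              # (n_etalons, dim, 1)
tx_tensor = np.array([query_vector] * 3).reshape(3, 4, 1)
print(tx_tensor.shape, vcs_arr.shape)                  # (3, 4, 1) (3, 4, 1)
# model.predict([tx_tensor, vcs_arr]) would then return one distance per etalon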
def init():
    # load the various models and other resources here
    models_rout = os.path.dirname(__file__)
    # load a model for each pubid and build a dictionary keyed by pubid, whose values are
    # the classifying models and the tag-to-answer-id (and module) links
    models_dict = {"simple_rules_model": None}
    # load the models and turn them straight into loader_obj
    for model_name in models_dict:
        with open(
                os.path.join(models_rout, 'models/tax_tags',
                             str(model_name) + ".pickle"), "br") as f:
            model = pickle.load(f)
        models_dict[model_name] = model
    # load the lemmatizer for the patterns:
    tknz = TokenizerApply(Loader(models_dict["simple_rules_model"]))
    global pattern1, pattern2
    # lemmatize the patterns used for selecting fragments
    pattern1 = tknz.texts_processing(["в ходе проведения"])[0]
    pattern2 = tknz.texts_processing(["В течение 5 <4> рабочих дней"])[0]
    # define the models that are later used for the different pubids
    model_1 = ModelsChain([(SimpleRules, models_dict["simple_rules_model"])])
    global pub_models
    pub_models = {
        1: {
            "model": model_1,
            "tag_answ_link": None,
            "tokenizer": tknz
        }
    }
def __init__(self, loader_obj): self.model_types = [("lsi", None)] self.model = loader_obj self.tknz = TokenizerApply(self.model) self.tkz_model = self.tknz.model_tokenize() self.et_vectors = self.tkz_model.application_field["texts"] self.coeffs = self.tkz_model.application_field["coeff"] self.tags = self.tkz_model.application_field["tags"] self.index = Similarity( None, self.et_vectors, num_features=self.model.texts_algorithms["num_topics"])
def __init__(self, loader_obj): self.model_types = [("simple_rules", None)] self.functions_dict = { "include_and": self.include_and, "include_or": self.include_or, "exclude_and": self.exclude_and, "exclude_or": self.exclude_or, "include_str": self.include_str, "include_str_p": self.include_str_p, "exclude_str_p": self.exclude_str_p, "intersec_share": self.intersec_share } self.model = loader_obj self.tokenizer = TokenizerApply(self.model) self.tknz_model = self.tokenizer.model_tokenize()
"lingvo": [{"synonyms": [[]], "tokenize": False}, {"ngrams": [[]], "tokenize": False}, {"stopwords": [[]], "tokenize": False}, {"workwords": [[]], "tokenize": False}], "classificator_algorithms": {}, "texts_algorithms": {}, "tokenizer": "SimpleTokenizer"} with open(os.path.join(models_rout, "simplest_model.pickle"), "bw") as f: pickle.dump(model, f) """ with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f: model = pickle.load(f) tzapl = TokenizerApply(Loader(model)) # tx = "вчера нам пожелали доброго вечера 345 раз" tz_txs = tzapl.texts_processing(train_df["words"]) print(tz_txs[:10]) print(len(tz_txs)) # подготовка списка синонимов: """ stopwords_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'stopwords.csv')) lingv_rules_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'lingv_rules.csv')) ngrams_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'ngrams.csv')) texts_collection_df = pd.read_csv(os.path.join(data_rout, 'bss_data', 'texts_collection.tsv'), sep = '\t') rl_ans_df = pd.read_csv(os.path.join(data_rout, 'bss_data', "rules_answers.csv")) test_acc_df = pd.read_csv(os.path.join(data_rout, 'bss_data', "test_accuracy.csv"))
print(quests50th_df[:100])

etalons_df = pd.read_csv(
    os.path.join(data_rout, "kosgu_data", "lingv_rules.csv"))
print(etalons_df["words"][:100])
print(etalons_df.shape)

train_df = pd.DataFrame(
    pd.concat([quests50th_df["words"], etalons_df["words"]], axis=0))
print('\n', train_df)
print(train_df.shape)

with open(os.path.join(models_rout, "simplest_model.pickle"), "br") as f:
    model = pickle.load(f)

tknz_txts = TokenizerApply(Loader(model))
# tx = "вчера нам пожелали доброго вечера 345 раз"
tz_txs = tknz_txts.texts_processing(list(train_df["words"]))
print(tz_txs[:10])
print(len(tz_txs))

# prepare the synonyms list:
stopwords_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'stopwords.csv'))
lingv_rules_df = pd.read_csv(
    os.path.join(data_rout, 'kosgu_data', 'lingv_rules.csv'))
ngrams_df = pd.read_csv(os.path.join(data_rout, 'kosgu_data', 'ngrams.csv'))

sinonims_files = ['01_sinonims.csv', '02_sinonims.csv']
synonyms = []
# Testing for the "quick answers" task: the original questions the network was trained on
# are used as etalons, and we check how well it picks out similar questions from the
# incoming stream.
import os, pickle, time
from utility import Loader
from texts_processors import TokenizerApply
import pandas as pd

# load the data files:
tokenize_path = r'./tokenize_model'
test_path = r'./test'

with open(os.path.join(tokenize_path, "tokenizator_model.pickle"), "rb") as f:
    tokenize_model = pickle.load(f)

tokenize_loader = Loader(tokenize_model)
tknz = TokenizerApply(tokenize_loader)

# load the questions
df_data = pd.read_csv(os.path.join(test_path, "ндс_прибыль_5000.csv"))
df_data.rename(columns={"0": "text"}, inplace=True)

# load the vocabulary the neural network "knows"
work_dict_df = pd.read_csv(os.path.join(test_path, "dictionary_work.csv"))
work_dict_list = list(work_dict_df["token"])
print(work_dict_list)

# load the etalons (the original queries the neural network was trained on)
df_etalons = pd.read_csv(os.path.join(test_path, "etalons.csv"))
df_etalons = df_data

tktxs = tknz.texts_processing(df_data["text"])
import os, pickle
import pandas as pd
import random
from texts_processors import TokenizerApply
from utility import Loader

data_rout = r'./data'
models_rout = r'./models'

with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f:
    lingv_model = pickle.load(f)

tk_appl = TokenizerApply(Loader(lingv_model))
data_df = pd.read_csv(os.path.join(data_rout, "data_group_01.csv"))
lemm_txts_l = tk_appl.texts_processing(list(data_df['text']))
lemm_txts_df = pd.DataFrame(list(zip([" ".join(x) for x in lemm_txts_l], data_df['group'])))
lemm_txts_df.rename(columns={0: 'text', 1: 'group'}, inplace=True)
print(lemm_txts_df)
lemm_txts_df.to_csv(os.path.join(data_rout, "lemm_data_group_01.csv"),
                    index=False, columns=['text', 'group'])

df = pd.read_csv(os.path.join(data_rout, "lemm_data_group_01.csv"))
print(df)

# generate pairs of semantically identical questions
lbs = set(df['group'])
results_tuples = []
for lb in lbs:
    work_list = list(df['text'][df['group'] == lb])
    for tx1 in work_list:
        for tx2 in work_list:
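# Hedged sketch (the nested loop above is cut off in this excerpt): one plausible way to
# finish the pair generation is to label same-group pairs as 1 and cross-group pairs as 0,
# as siamese training data. The helper name is illustrative; only the column names mirror
# the code above, the completion itself is an assumption.
import itertools

def make_pairs(df):
    # (text_1, text_2, label) triples: 1 for the same group, 0 for different groups
    grouped = {lb: list(df['text'][df['group'] == lb]) for lb in set(df['group'])}
    positives, negatives = [], []
    for texts in grouped.values():
        positives += [(t1, t2, 1) for t1, t2 in itertools.combinations(texts, 2)]
    for (lb1, txs1), (lb2, txs2) in itertools.combinations(grouped.items(), 2):
        negatives += [(t1, t2, 0) for t1, t2 in zip(txs1, txs2)]
    return positives + negatives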
class SimpleRules(AbstractRules):
    def __init__(self, loader_obj):
        self.model_types = [("simple_rules", None)]
        self.functions_dict = {
            "include_and": self.include_and,
            "include_or": self.include_or,
            "exclude_and": self.exclude_and,
            "exclude_or": self.exclude_or,
            "include_str": self.include_str,
            "include_str_p": self.include_str_p,
            "exclude_str_p": self.exclude_str_p,
            "intersec_share": self.intersec_share
        }
        self.model = loader_obj
        self.tokenizer = TokenizerApply(self.model)
        self.tknz_model = self.tokenizer.model_tokenize()

    def rules_apply(self, texts):
        decisions = []
        # apply the rules to the tokenized texts:
        for num, tknz_tx in enumerate(self.tokenizer.texts_processing(texts)):
            decisions_temp = []
            model_params = list(
                zip(self.tknz_model.application_field["tags"],
                    self.tknz_model.application_field["rules"],
                    self.tknz_model.application_field["texts"],
                    self.tknz_model.application_field["coeff"]))
            # grouping rules with the same tag
            model_params_grouped = [(x, list(y)) for x, y in groupby(
                sorted(model_params, key=lambda x: x[0]), key=lambda x: x[0])]
            # evaluate the rules for each tag (within each group):
            for group, rules_list in model_params_grouped:
                decision = True
                for tg, rule, tknz_etalon, coeff in rules_list:
                    decision = decision and self.functions_dict[rule](
                        tknz_etalon, tknz_tx, coeff)
                decisions_temp.append((group, decision))
            decisions.append((num, decisions_temp))
        return decisions

    def include_and(self, tokens_list, text_list, coeff=0.0):
        for token in tokens_list:
            if token not in text_list:
                return False
        return True

    def include_or(self, tokens_list, text_list, coeff=0.0):
        for token in tokens_list:
            if token in text_list:
                return True
        return False

    def exclude_and(self, tokens_list, text_list, coeff=0.0):
        for token in tokens_list:
            if token in text_list:
                return False
        return True

    def exclude_or(self, tokens_list, text_list, coeff=0.0):
        for token in tokens_list:
            if token not in text_list:
                return True
        return False

    def intersec_share(self, tokens_list, text_list, intersec_coeff=0.7):
        intersec_tks = intersection(tokens_list, text_list)
        return len(intersec_tks) / len(tokens_list) > intersec_coeff

    # checks that the text contains the whole string (a sequence of tokens, not tokens taken separately)
    def include_str(self, tokens_str, text_str, coeff=0.0):
        return tokens_str in text_str

    def exclude_str(self, tokens_str, text_str, coeff=0.0):
        return tokens_str not in text_str

    def include_str_p(self, tokens_list: list, txt_list: list, coeff):
        length = len(tokens_list)
        txts_split = [
            txt_list[i:i + length] for i in range(0, len(txt_list), 1)
            if len(txt_list[i:i + length]) == length
        ]
        for tx_l in txts_split:
            if strings_similarities(' '.join(tokens_list), ' '.join(tx_l)) >= coeff:
                return True
        return False

    def exclude_str_p(self, tokens_list: list, txt_list: list, coeff):
        length = len(tokens_list)
        txts_split = [
            txt_list[i:i + length] for i in range(0, len(txt_list), 1)
            if len(txt_list[i:i + length]) == length
        ]
        for tx_l in txts_split:
            if strings_similarities(' '.join(tokens_list), ' '.join(tx_l)) >= coeff:
                return False
        return True
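# Illustration only (toy tokens, not from the source): the rule predicates above operate on
# already-tokenized lists, so they can be exercised without loading a model; __new__ is used
# here only to skip the model-loading __init__. intersec_share and the *_str_p rules also
# rely on the external intersection / strings_similarities helpers.
demo = SimpleRules.__new__(SimpleRules)
print(demo.include_and(["ндс", "срок"], ["срок", "сдача", "ндс"]))   # True: both tokens present
print(demo.include_or(["усн", "енвд"], ["срок", "сдача", "ндс"]))    # False: neither token present
print(demo.exclude_and(["усн"], ["срок", "сдача", "ндс"]))           # True: forbidden token absent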
def __init__(self, loader_obj): self.model_types = [("siamese_lstm_d2v", None)] self.model = loader_obj self.tknz = TokenizerApply(self.model) self.tkz_model = self.tknz.model_tokenize()
models_rout = r"./models" # load models: d2v_model = Doc2Vec.load( os.path.join(models_rout, 'bss_doc2vec_model_20200611_draft')) print("d2v_model load Done") keras.losses.contrastive_loss = contrastive_loss lstm_model = load_model( os.path.join(models_rout, 'siamese_model_d2v_nn_2020_0612.h5')) print("lstm_model load Done") with open(os.path.join(models_rout, "tokenizator_model.pickle"), "br") as f: lingv_model = pickle.load(f) tk_appl = TokenizerApply(Loader(lingv_model)) tx1 = "сдавать ндс" tx2 = "сдавать ндфл" # tx1 = 'срок камеральной проверки по ндс заявленной к вычету' # tx2 = 'срок камеральной проверке по ндс' ts1 = tk_appl.texts_processing([tx1]) ts2 = tk_appl.texts_processing([tx2]) print(ts1, ts2) for t1 in ts1: for t2 in ts2: d2v_vec1 = d2v_model.infer_vector(ts1[0]) d2v_vec2 = d2v_model.infer_vector(ts2[0]) v1 = d2v_vec1.reshape(1, 300, 1)
class SimpleRules(AbstractRules):
    def __init__(self, loader_obj):
        self.functions_dict = {
            "include_and": include_and,
            "include_or": include_or,
            "exclude_and": exclude_and,
            "exclude_or": exclude_or,
            "include_str": include_str,
            "include_str_p": include_str_p,
            "exclude_str_p": exclude_str_p,
            "intersec_share": intersec_share
        }
        self.model = loader_obj
        assert self.model.model_type == "simple_rules", "model type does not match the SimpleRules class"
        self.tokenizer = TokenizerApply(self.model)
        if not self.model.is_etalons_lemmatize:
            self.model.application_field["texts"] = self.tokenizer.texts_processing(
                self.model.application_field["texts"])
        self.model_params = list(
            zip(self.model.application_field["tags"],
                self.model.application_field["rules"],
                self.model.application_field["texts"],
                self.model.application_field["coeff"]))
        # grouping rules with the same tag
        self.model_params_grouped = model_params_grouped(self.model_params)

    # public entry point:
    def rules_apply(self, texts, function_type="rules_apply_without_range"):
        if function_type == "rules_apply_without_range":
            return self.rules_apply_without_range(texts)
        elif function_type == "rules_apply_one":
            return self.rules_apply_one(texts)
        elif function_type == "rules_apply_range_one":
            return self.rules_apply_range_one(texts)
        elif function_type == "rules_apply_range":
            return self.rules_apply_range(texts)
        elif function_type == "rules_apply_debugging":
            return self.rules_apply_debugging(texts)

    def rules_apply_without_range(self, texts):
        decisions = []
        model_params_group = model_params_grouped(self.model_params)
        # apply the rules to the tokenized texts:
        for num, tknz_tx in enumerate(self.tokenizer.texts_processing(texts)):
            decisions_temp = []
            # evaluate the rules for each tag (within each group):
            for group, rules_list in model_params_group:
                decision = True
                for tg, rule, tknz_etalon, coeff in rules_list:
                    decision = decision and self.functions_dict[rule](
                        tknz_etalon, tknz_tx, coeff)[0]
                # return only the rules that fired (True)
                if decision:
                    decisions_temp.append(group)
            decisions.append((num, decisions_temp))
        return decisions

    # applying the rules when we know for sure that each rule in the etalons has exactly one condition
    def rules_apply_one(self, texts):
        decisions = []
        # apply the rules to the tokenized texts:
        for num, tknz_tx in enumerate(self.tokenizer.texts_processing(texts)):
            decisions_temp = []
            # evaluate each rule separately:
            for tg, rule, tknz_etalon, coeff in self.model_params:
                decision = self.functions_dict[rule](tknz_etalon, tknz_tx, coeff)[0]
                # return only the rules that fired (True)
                if decision:
                    decisions_temp.append(decision)
            decisions.append((num, decisions_temp))
        return decisions

    def rules_apply_range_one(self, texts):
        decisions = []
        # apply the rules to the tokenized texts:
        for num, tknz_tx in enumerate(self.tokenizer.texts_processing(texts)):
            decisions_temp = []
            # evaluate each rule separately:
            for tg, rule, tknz_etalon, coeff in self.model_params:
                decision = self.functions_dict[rule](tknz_etalon, tknz_tx, coeff)
                # return only the rules that fired (True)
                if decision[0]:
                    decisions_temp.append((tg, decision[1]))
            decisions.append((num, [
                tg for tg, scr in sorted(
                    decisions_temp, key=lambda x: x[1], reverse=True)
            ]))
        return decisions

    def rules_apply_range(self, texts):
        decisions = []
        # apply the rules to the tokenized texts:
        for num, tknz_tx in enumerate(self.tokenizer.texts_processing(texts)):
            decisions_temp = []
            # evaluate the rules for each tag (within each group):
            for group, rules_list in self.model_params_grouped:
                decision = True
                group_coeff = []
                for tg, rule, tknz_etalon, coeff in rules_list:
                    func_res = self.functions_dict[rule](tknz_etalon, tknz_tx, coeff)
                    group_coeff.append(func_res[1])
                    decision = decision and func_res[0]
                # return only the rules that fired (True)
                if decision:
                    decisions_temp.append((group, mean(group_coeff)))
            decisions.append((num, [
                tg for tg, scr in sorted(
                    decisions_temp, key=lambda x: x[1], reverse=True)
            ]))
        return decisions

    def rules_apply_debugging(self, texts):
        decisions = []
        model_params_group = model_params_grouped(self.model_params)
        # apply the rules to the tokenized texts:
        for num, tknz_tx in enumerate(self.tokenizer.texts_processing(texts)):
            decisions_temp = []
            # record the result of every rule for each tag (in each group):
            for group, rules_list in model_params_group:
                for tg, rule, tknz_etalon, coeff in rules_list:
                    decision = self.functions_dict[rule](tknz_etalon, tknz_tx, coeff)
                    decisions_temp.append((group, decision))
            decisions.append((num, decisions_temp))
        return decisions
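# Usage sketch (illustration only, not from the source): assumes a pickled "simple_rules"
# model at the hypothetical path below, plus pickle and Loader imported as in the surrounding
# code. rules_apply dispatches to one of the strategies defined above via function_type.
if __name__ == "__main__":
    with open("models/simple_rules_model.pickle", "br") as f:       # hypothetical path
        rules_loader = Loader(pickle.load(f))
    clf = SimpleRules(rules_loader)
    print(clf.rules_apply(["нужно ли сдавать декларацию по ндс"]))                           # fired tags
    print(clf.rules_apply(["нужно ли сдавать декларацию по ндс"], "rules_apply_range"))      # tags ranked by mean score
    print(clf.rules_apply(["нужно ли сдавать декларацию по ндс"], "rules_apply_debugging"))  # every (tag, (flag, score)) pair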