def train_logistic_regression(len_vec, clf_path_name, *args):
    """Train a logistic-regression word classifier and pickle it to disk.

    Each positional list in ``*args`` is one class: its position is the
    class label (first list -> 0, second -> 1, ...). Every word is
    tokenized with pymorphy2, looked up as a ``len_vec``-dimensional
    embedding via ``client.get_vec``, and words with no vector are skipped.

    :param len_vec: dimensionality of one word vector (e.g. 300)
    :param clf_path_name: path the fitted classifier is pickled to
    :param args: one iterable of words per class
    :return: the fitted sklearn LogisticRegression instance
    """
    morph = pymorphy2.MorphAnalyzer()
    clf = LogisticRegression()
    features = []  # one embedding per usable word
    labels = []    # class index aligned with `features`
    # FIX: the original shadowed the builtin `list` and grew numpy arrays
    # with np.append inside the loop (O(n^2) copying). Accumulate in
    # Python lists and convert once instead.
    for cl_num, word_list in enumerate(args):
        for word in word_list:
            token_word = tokenizer.get_stat_token_word(morph, word)
            vec = client.get_vec(token_word)
            if vec is None or len(vec) == 0:
                continue
            features.append(vec)
            labels.append(cl_num)
    # reshape(-1, len_vec) mirrors the original reshape and fails fast if
    # a vector of the wrong length slipped through.
    X = np.asarray(features, dtype=float).reshape((-1, len_vec))
    Y = np.asarray(labels, dtype=int)
    clf.fit(X, Y)
    # Persist the trained classifier so other entry points can load it.
    with open(clf_path_name, 'wb') as f:
        pickle.dump(clf, f)
    return clf
def find_book(req_text):
    """Parse a free-text request into grouped word vectors and recommend a book.

    Each word is tokenized and embedded; nouns are split into persons vs.
    decorations by ``person_clf``, verbs into actions vs. states by
    ``do_clf``, and everything else lands in ``decoration_adjs_vec``.
    The grouped vectors are handed to ``get_recommendation``.
    """
    print("НОВЫЙ ЗАПРОС: ", req_text)
    parsed_req = {}
    for word in req_text.split():
        token_word = tokenizer.get_stat_token_word(morph, word)
        if token_word is None:
            continue
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        # Route the vector into a bucket by part of speech + classifier.
        if token_word.find("NOUN") != -1:
            key = "persons_vec" if person_clf.predict(vec.reshape((1, -1))) else "decoration_vec"
        elif token_word.find("VERB") != -1:
            key = "do_vec" if do_clf.predict(vec.reshape((1, -1))) else "state_vec"
        else:
            key = "decoration_adjs_vec"
        parsed_req.setdefault(key, []).append(vec)
    return get_recommendation(parsed_req)
def heat_map_present():
    """Leave-one-out evaluation of the person classifier, shown as a heat map.

    For every word in ``const.person_list`` (and then ``const.non_person_list``)
    a classifier is retrained without that word and asked to classify it;
    hits/misses are tallied into a 2x2 confusion matrix which is column-
    normalized and rendered with seaborn.
    """
    morph = pymorphy2.MorphAnalyzer()
    error_map = np.zeros((2, 2))
    # FIX: the original iterated range(len(...) - 1), so the last word of
    # each list was never held out / evaluated.
    for i in range(len(const.person_list)):
        test_list = const.person_list[:i] + const.person_list[i + 1:]
        # persons are the first class -> label 0
        clf = train_logistic_regression(300, "person_train", test_list, const.non_person_list)
        word = const.person_list[i]
        token_word = tokenizer.get_stat_token_word(morph, word)
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        if clf.predict(vec.reshape(1, -1)) == 0:
            error_map[0, 0] += 1  # person correctly recognized
        else:
            error_map[1, 0] += 1  # person misclassified
    for i in range(len(const.non_person_list)):
        test_list = const.non_person_list[:i] + const.non_person_list[i + 1:]
        clf = train_logistic_regression(300, "person_train", const.person_list, test_list)
        word = const.non_person_list[i]
        token_word = tokenizer.get_stat_token_word(morph, word)
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        if clf.predict(vec.reshape(1, -1)) == 1:
            error_map[1, 1] += 1  # non-person correctly recognized
        else:
            error_map[0, 1] += 1  # non-person misclassified
        print(word)
        print(error_map)
    # Normalize each column to rates.
    error_map[:, 0] /= sum(error_map[:, 0])
    error_map[:, 1] /= sum(error_map[:, 1])
    # NOTE(review): the measured matrix is overwritten with hard-coded
    # values below — presumably to reproduce a fixed figure. Remove these
    # four lines to plot the actually measured rates.
    error_map[0, 0] = 0.97
    error_map[0, 1] = 0.04
    error_map[1, 0] = 0.03
    error_map[1, 1] = 0.96
    print(error_map)
    import seaborn as sns
    import matplotlib.pylab as plt
    ax = sns.heatmap(error_map, linewidth=0.5)
    plt.show()
def combine_synonims(graph):
    """Merge synonymous graph nodes of the same part of speech in place.

    Pass 1 annotates each node with its token and embedding; pass 2
    compares every pair of nouns/verbs/adjectives by cosine distance and
    merges sufficiently close pairs via ``merge_nodes``, averaging their
    vectors. Returns the (mutated) graph.
    """
    voice_parts = ["NOUN", "VERB", "ADJ"]
    word_dict = {"NOUN": [], "VERB": [], "ADJ": []}
    for word in graph:  # Collect vectors for merging synonyms
        if is_pronoun(word):  # Pronouns are never merged
            continue
        token_word = tokenizer.get_stat_token_word(morph, word)
        if token_word is None:
            graph.nodes[word]["token"] = None
            continue
        graph.nodes[word]["token"] = token_word
        vec = client.get_vec(token_word)
        if vec is None or not vec.any():
            graph.nodes[word]["vec"] = None
            continue
        graph.nodes[word]["vec"] = vec
        # A token may carry several tags, so membership is tested for
        # each part of speech independently.
        if token_word.find("NOUN") != -1:  # group nouns for merging
            word_dict["NOUN"].append(word)
        if token_word.find("VERB") != -1:  # group verbs
            word_dict["VERB"].append(word)
        if token_word.find("ADJ") != -1:  # group adjectives
            word_dict["ADJ"].append(word)
    for voice_part in voice_parts:
        word_list = word_dict[voice_part]
        size = len(word_list)
        for base in range(size):  # pairwise synonym merge within the group
            for comp in range(base + 1, size):
                base_word = word_list[base]
                comp_word = word_list[comp]
                try:
                    base_vec = graph.nodes[base_word]["vec"]
                    comp_vec = graph.nodes[comp_word]["vec"]
                except KeyError:  # node already compared and merged away
                    continue
                # scipy-style cosine distance: < 0.5 means "close enough".
                cos = cosine(base_vec, comp_vec)
                if cos < 0.5:
                    ans = merge_nodes(base_word, comp_word, graph)
                    if ans["res"]:
                        # Surviving node keeps the average of both vectors.
                        graph.nodes[base_word]["vec"] = (base_vec + comp_vec)/2
    return graph
def find_book_by_desc(desc):
    """Build a graph from a description, find person nodes, print their context.

    Loads the pre-trained person classifier from ``person_train.plc``;
    every noun-like node whose prediction is 0 is treated as a person and
    its adjacency in the graph is printed.
    """
    G = text_to_graph.request_to_graph(desc)
    morph = pymorphy2.MorphAnalyzer()
    print(G)
    persons = []
    # FIX: pickle files must be opened in binary mode; text mode 'r'
    # raises/decodes incorrectly with pickle.load.
    with open(r"person_train.plc", 'rb') as f:
        clf = pickle.load(f)
    for word in G.nodes:
        token_word = tokenizer.get_stat_token_word(morph, word)
        if not token_word:
            continue
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        # class 0 = person for this serialized classifier
        is_person = clf.predict(vec.reshape(1, -1))
        if is_person == 0:
            persons.append(word)
    print(persons)
    for per_desk in persons:
        print(G[per_desk])
def request_to_graph(text):
    """Parse text into a MultiGraph and resolve personal pronouns.

    Builds a syntax graph, prunes isolated nodes, then replaces each
    personal pronoun edge by the nearest-in-story person noun (chosen by
    the pre-trained person classifier), and finally removes the pronoun
    nodes themselves.
    """
    G = nx.MultiGraph()
    morph = pymorphy2.MorphAnalyzer()
    # FIX: pickle requires a binary-mode file handle.
    with open(r"person_train", 'rb') as f:
        clf = pickle.load(f)
    sintax = get_sintax(text)
    __add_to_graph(sintax, G)
    # Graph is built — post-processing follows.
    # Remove nodes without edges.
    keys = []
    for key in G:
        if not G[key]:
            keys.append(key)
    for key in keys:
        G.remove_node(key)
    # Resolve personal pronouns.
    target_pronouns = ["ты", "вы", "он", "она", "они"]
    story_hystory = nx.get_edge_attributes(G, "story_time")
    print(story_hystory)
    for pron in target_pronouns:  # walk over each pronoun
        if pron not in G.nodes:
            continue
        pron_edges = list(G[pron].keys())
        for base_edge in pron_edges:  # all neighbours of the pronoun
            if __is_pronoun(base_edge) or __is_continue_word(
                base_edge
            ):  # skip filler words and other pronouns
                continue
            min_person = None
            person = None
            st_time = None
            # MultiGraph edge keys are (u, v, k); probe both orientations
            # for the first key that carries a story_time.
            for q in range(10000):
                if (pron, base_edge, q) in story_hystory.keys():
                    st_time = story_hystory[(pron, base_edge, q)]
                if (base_edge, pron, q) in story_hystory.keys():
                    st_time = story_hystory[(base_edge, pron, q)]
                if st_time is not None:
                    break
            if st_time is None:
                # Synonym merging may have dropped the story_time attribute.
                continue
            for edge in story_hystory:  # find the story-nearest person
                for word in edge[:2]:
                    if __is_pronoun(word):
                        continue
                    token_word = tokenizer.get_stat_token_word(morph, word)
                    if token_word is None or token_word.find("NOUN") == -1:
                        continue
                    vec = client.get_vec(token_word)
                    if vec is None or len(vec) == 0:
                        continue
                    is_person = clf.predict(vec.reshape(1, -1))
                    if is_person != 0:  # class 0 = person
                        continue
                    dist = math.fabs(story_hystory[edge] - st_time)
                    # Mentions earlier in the story are preferred: halve
                    # their distance before comparing.
                    if story_hystory[edge] < st_time:
                        dist /= 2
                    if min_person is None or dist < min_person:
                        # FIX: keep the (possibly halved) dist we compared
                        # with; the original stored the un-halved distance,
                        # defeating the earlier-mention preference.
                        min_person = dist
                        person = word
            # NOTE(review): if no person was found, `person` is None here —
            # verify __replace_edge tolerates that.
            __replace_edge(pron, base_edge, person, G)
    for pron in target_pronouns:
        if pron in G.nodes:
            G.remove_node(pron)
    # Prune nodes left isolated by the edge rewiring.
    keys = []
    for key in G:
        if not G[key]:
            keys.append(key)
    for key in keys:
        G.remove_node(key)
    return G
def make_scene(graph):
    """Collapse a text graph into one averaged 'scene' of six concept slots.

    Trains two classifiers (person vs. decoration for nouns, action vs.
    state for verbs), buckets every node's embedding into one of six
    lists, then computes a size-weighted mean vector per bucket and looks
    up the nearest word for it.

    :return: dict with ``<slot>_vec`` (mean vector or "") and
        ``<slot>_name`` (nearest word or "") for each slot.
    """
    # Class order: non-person/state first -> label 0; so a truthy
    # prediction means person / active verb respectively.
    person_clf = train_logistic_regression(300, "person_train", const.non_person_list, const.person_list)
    do_clf = train_logistic_regression(300, "do_train", const.state_list, const.do_list)
    sizes = nx.get_node_attributes(graph, 'size')
    morph = pymorphy2.MorphAnalyzer()
    book_scene = {
        "persons": [],
        "person_adjs": [],
        "decoration": [],
        "decoration_adjs": [],
        "do": [],
        "state": []
    }
    for node in graph:
        token_word = tokenizer.get_stat_token_word(morph, node)
        if token_word is None:
            continue
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        if token_word.find("NOUN") != -1:  # nouns: person or decoration
            is_person = person_clf.predict(vec.reshape(1, -1))
            if is_person:
                book_scene["persons"].append({"name": node, "vec": vec})
            else:
                book_scene["decoration"].append({"name": node, "vec": vec})
            # Neighbouring verbs/adjectives describe the noun.
            for edge in graph[node]:
                token_edge = tokenizer.get_stat_token_word(morph, edge)
                # "ERB" matches the VERB tag suffix; only verb/adjective
                # neighbours qualify as descriptions.
                if token_edge is None or token_edge[-3:] not in ["ERB", "ADJ"]:
                    continue
                edge_vec = client.get_vec(token_edge)
                if edge_vec is None or len(edge_vec) == 0:
                    continue
                if is_person:
                    book_scene["person_adjs"].append({
                        "name": edge,
                        "vec": edge_vec
                    })
                else:
                    book_scene["decoration_adjs"].append({
                        "name": edge,
                        "vec": edge_vec
                    })
        if token_word.find("VERB") != -1:  # verbs: action or state
            is_active = do_clf.predict(vec.reshape(1, -1))
            if is_active:
                book_scene["do"].append({"name": node, "vec": vec})
            else:
                book_scene["state"].append({"name": node, "vec": vec})
    scene = dict()
    for tp in book_scene.keys():
        # Size-weighted average of all vectors in the bucket.
        obj_count = 0
        obj_vec = None
        for obj in book_scene[tp]:
            if obj_vec is None:
                obj_vec = obj["vec"] * sizes[obj["name"]]
            else:
                obj_vec = obj["vec"] * sizes[obj["name"]] + obj_vec
            obj_count += sizes[obj["name"]]
        if obj_vec is None or obj_count == 0:
            scene[tp + "_vec"] = ""
            scene[tp + "_name"] = ""
            continue
        obj_vec = obj_vec / obj_count
        scene[tp + "_vec"] = obj_vec
        # FIX: ndarray.tostring() was removed in NumPy 1.23; tobytes()
        # returns the identical raw buffer.
        scene[tp + "_name"] = json.loads(
            client.get_word_by_vec(obj_vec.tobytes()).decode())["res"][0]
    print("Средний персонаж ", scene["persons_name"])
    print("Средний описание персонаж ", scene["person_adjs_name"])
    print("Средняя декорация", scene["decoration_name"])
    print("Среднее описание декорации", scene["decoration_adjs_name"])
    print("Среднее дело ", scene["do_name"])
    print("Среднее изменение состояния ", scene["state_name"])
    return scene