Example #1
0
def train_logistic_regression(len_vec, clf_path_name, *args):
    """Train a logistic-regression word classifier and persist it to disk.

    Args:
        len_vec: dimensionality of each word embedding vector.
        clf_path_name: file path the pickled classifier is written to.
        *args: one iterable of words per class; the class label of a word
            is the positional index of its iterable (0, 1, ...).

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    morph = pymorphy2.MorphAnalyzer()

    # Accumulate vectors in Python lists: repeated np.append() copies the
    # whole array each call (O(n^2) overall) — build once at the end instead.
    feature_rows = []
    labels = []
    # enumerate() replaces the manual cl_num counter; `word_list` avoids
    # shadowing the builtin `list`.
    for cl_num, word_list in enumerate(args):
        for word in word_list:
            token_word = tokenizer.get_stat_token_word(morph, word)
            vec = client.get_vec(token_word)
            if vec is None or len(vec) == 0:  # word missing from the model
                continue
            feature_rows.append(np.asarray(vec, dtype=float))
            labels.append(cl_num)

    # Flatten then reshape exactly as the classifier expects: (n_samples, len_vec).
    X = np.concatenate(feature_rows).reshape((-1, len_vec))
    Y = np.array(labels, dtype=int)

    clf = LogisticRegression()
    clf.fit(X, Y)  # Train the classifier

    # Persist the trained classifier to disk.
    with open(clf_path_name, 'wb') as f:
        pickle.dump(clf, f)
    return clf
Example #2
0
def find_book(req_text):
    """Split a free-text request into category vectors and get a recommendation.

    Each word is tokenized and embedded; nouns are routed to person/decoration
    buckets, verbs to action/state buckets, everything else to the
    adjective-decoration bucket. The per-category vector lists are then handed
    to ``get_recommendation``.
    """
    print("НОВЫЙ ЗАПРОС: ", req_text)
    parsed_req = dict()
    for word in req_text.split():
        token_word = tokenizer.get_stat_token_word(morph, word)
        if token_word is None:
            continue
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        # Route the vector to a bucket by part of speech.
        if token_word.find("NOUN") != -1:
            is_person = person_clf.predict(vec.reshape((1, -1)))
            key = "persons_vec" if is_person else "decoration_vec"
        elif token_word.find("VERB") != -1:
            is_do = do_clf.predict(vec.reshape((1, -1)))
            key = "do_vec" if is_do else "state_vec"
        else:
            key = "decoration_adjs_vec"
        parsed_req.setdefault(key, []).append(vec)
    return get_recommendation(parsed_req)
Example #3
0
def heat_map_present():
    """Leave-one-out evaluation of the person/non-person classifier,
    rendered as a 2x2 confusion-matrix heat map.

    Column 0 accumulates results for person words, column 1 for non-person
    words; row index is the predicted class.
    """
    morph = pymorphy2.MorphAnalyzer()
    error_map = np.zeros((2, 2))

    # Leave-one-out over the person list. NOTE: was range(len(...) - 1),
    # which silently skipped the last word — fixed to cover every element.
    for i in range(len(const.person_list)):
        test_list = const.person_list[:i] + const.person_list[i+1:]
        clf = train_logistic_regression(300, "person_train", test_list, const.non_person_list)

        word = const.person_list[i]
        token_word = tokenizer.get_stat_token_word(morph, word)
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        if clf.predict(vec.reshape(1, -1)) == 0:
            error_map[0, 0] += 1  # person correctly recognised
        else:
            error_map[1, 0] += 1  # person misclassified

    # Same leave-one-out procedure over the non-person list (same off-by-one fix).
    for i in range(len(const.non_person_list)):
        test_list = const.non_person_list[:i] + const.non_person_list[i+1:]
        clf = train_logistic_regression(300, "person_train", const.person_list, test_list)

        word = const.non_person_list[i]
        token_word = tokenizer.get_stat_token_word(morph, word)
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:
            continue
        if clf.predict(vec.reshape(1, -1)) == 1:
            error_map[1, 1] += 1  # non-person correctly recognised
        else:
            error_map[0, 1] += 1  # non-person misclassified
            print(word)

    print(error_map)
    # Normalise each column into rates.
    error_map[:, 0] /= sum(error_map[:, 0])
    error_map[:, 1] /= sum(error_map[:, 1])
    # HACK(review): the measured matrix is overwritten with hard-coded values
    # below, so the plotted heat map does NOT reflect the evaluation above.
    # Remove these four lines to display the real results.
    error_map[0, 0] = 0.97
    error_map[0, 1] = 0.04
    error_map[1, 0] = 0.03
    error_map[1, 1] = 0.96
    print(error_map)

    import seaborn as sns
    import matplotlib.pylab as plt

    ax = sns.heatmap(error_map, linewidth=0.5)
    plt.show()
Example #4
0
def combine_synonims(graph):
    """Merge synonym nodes of the same part of speech in *graph*.

    For every non-pronoun word the token and embedding vector are attached to
    its node; nouns, verbs and adjectives are then compared pairwise and nodes
    whose cosine distance is below 0.5 are merged, averaging their vectors.

    NOTE(review): the pairwise loop mutates the graph while iterating stored
    word lists; the KeyError guard below relies on merged nodes disappearing
    from the graph — statement order matters here.
    """
    voice_parts = ["NOUN", "VERB", "ADJ"]
    word_dict = {"NOUN": [], "VERB": [], "ADJ": []}
    for word in graph:  # Collect vectors for synonym merging
        if is_pronoun(word):  # Pronouns are never merged
            continue
        token_word = tokenizer.get_stat_token_word(morph, word)
        if token_word is None:
            graph.nodes[word]["token"] = None
            continue
        graph.nodes[word]["token"] = token_word
        vec = client.get_vec(token_word)
        if vec is None or not vec.any():
            graph.nodes[word]["vec"] = None
            continue
        graph.nodes[word]["vec"] = vec

        if token_word.find("NOUN") != -1:  # merge nouns only
            word_dict["NOUN"].append(word)
        if token_word.find("VERB") != -1:  # merge verbs
            word_dict["VERB"].append(word)
        if token_word.find("ADJ") != -1:  # merge adjectives
            word_dict["ADJ"].append(word)

    for voice_part in voice_parts:
        word_list = word_dict[voice_part]
        size = len(word_list)
        for base in range(size):  # Merge synonyms of this part of speech
            for comp in range(base + 1, size):
                base_word = word_list[base]
                comp_word = word_list[comp]
                try:
                    base_vec = graph.nodes[base_word]["vec"]
                    comp_vec = graph.nodes[comp_word]["vec"]
                except KeyError:  # word was already compared and removed
                    continue
                # cosine() returns a distance: 0 = identical direction.
                cos = cosine(base_vec, comp_vec)
                if cos < 0.5:
                    ans = merge_nodes(base_word, comp_word, graph)
                    if ans["res"]:
                        # Surviving node keeps the average of both vectors.
                        graph.nodes[base_word]["vec"] = (base_vec + comp_vec)/2
    return graph
Example #5
0
def find_book_by_desc(desc):
    """Build a graph from *desc*, detect person nodes with the pickled
    classifier, and print each person together with its graph neighbourhood.
    """
    G = text_to_graph.request_to_graph(desc)
    morph = pymorphy2.MorphAnalyzer()

    print(G)
    persons = []
    # BUG FIX: pickled models are binary — text mode 'r' breaks pickle.load;
    # the file must be opened with 'rb'.
    with open(r"person_train.plc", 'rb') as f:
        clf = pickle.load(f)
    for word in G.nodes:
        token_word = tokenizer.get_stat_token_word(morph, word)
        if not token_word:
            continue
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:  # word missing from the model
            continue
        # Class 0 means "person" for this classifier.
        is_person = clf.predict(vec.reshape(1, -1))
        if is_person == 0:
            persons.append(word)
    print(persons)
    for per_desk in persons:
        print(G[per_desk])
Example #6
0
def request_to_graph(text):
    """Parse *text* into a MultiGraph of words and resolve personal pronouns.

    Pipeline: syntax parse -> graph build -> drop isolated nodes -> replace
    each personal pronoun edge with the nearest (in story time) person noun ->
    drop the pronoun nodes and any nodes left isolated.

    Returns:
        The post-processed ``networkx.MultiGraph``.
    """
    G = nx.MultiGraph()
    morph = pymorphy2.MorphAnalyzer()

    # BUG FIX: pickled models are binary — text mode 'r' breaks pickle.load;
    # the file must be opened with 'rb'.
    with open(r"person_train", 'rb') as f:
        clf = pickle.load(f)

    sintax = get_sintax(text)
    __add_to_graph(sintax, G)

    # Graph built — post-processing follows.
    # Remove vertices without edges.
    keys = []
    for key in G:
        if not G[key]:
            keys.append(key)
    for key in keys:
        G.remove_node(key)

    # Resolve personal pronouns.
    target_pronouns = ["ты", "вы", "он", "она", "они"]
    story_history = nx.get_edge_attributes(G, "story_time")

    print(story_history)
    for pron in target_pronouns:  # Iterate over the pronouns
        if pron not in G.nodes:
            continue

        pron_edges = list(G[pron].keys())
        for base_edge in pron_edges:  # take all their connections
            # Connections to filler words and pronouns can be ignored.
            if __is_pronoun(base_edge) or __is_continue_word(base_edge):
                continue
            min_person = None
            person = None

            # MultiGraph edge keys are (u, v, k); probe parallel-edge indices
            # in both orientations until a "story_time" attribute is found.
            st_time = None
            for q in range(10000):
                if (pron, base_edge, q) in story_history.keys():
                    st_time = story_history[(pron, base_edge, q)]
                if (base_edge, pron, q) in story_history.keys():
                    st_time = story_history[(base_edge, pron, q)]
                if st_time is not None:
                    break
            if st_time is None:
                # Synonym merging may have dropped the story_time attribute.
                continue

            # Find the person noun closest in story time to this edge.
            for edge in story_history:
                for word in edge[:2]:
                    if __is_pronoun(word):
                        continue
                    token_word = tokenizer.get_stat_token_word(morph, word)
                    if token_word is None or token_word.find("NOUN") == -1:
                        continue
                    vec = client.get_vec(token_word)
                    if vec is None or len(vec) == 0:
                        continue
                    # Class 0 means "person" for this classifier.
                    is_person = clf.predict(vec.reshape(1, -1))
                    if is_person != 0:
                        continue

                    dist = math.fabs(story_history[edge] - st_time)
                    # Mentions that precede the pronoun count as twice closer.
                    if story_history[edge] < st_time:
                        dist /= 2
                    if min_person is None or dist < min_person:
                        min_person = math.fabs(story_history[edge] - st_time)
                        person = word
            __replace_edge(pron, base_edge, person, G)

    # Drop the pronoun nodes themselves.
    for pron in target_pronouns:
        if pron in G.nodes:
            G.remove_node(pron)

    # Drop any nodes left without edges after the replacement.
    keys = []
    for key in G:
        if not G[key]:
            keys.append(key)
    for key in keys:
        G.remove_node(key)

    return G
Example #7
0
def make_scene(graph):
    """Summarise *graph* into an averaged "scene" description.

    Classifies each node by part of speech into persons/decorations (nouns),
    actions/states (verbs) plus their adjective modifiers, then computes a
    size-weighted mean vector per category and looks up the nearest word
    for it.

    Returns:
        dict mapping "<category>_vec" to the mean vector (or "" if empty)
        and "<category>_name" to the nearest word (or "" if empty).
    """
    person_clf = train_logistic_regression(300, "person_train",
                                           const.non_person_list,
                                           const.person_list)
    do_clf = train_logistic_regression(300, "do_train", const.state_list,
                                       const.do_list)
    sizes = nx.get_node_attributes(graph, 'size')
    morph = pymorphy2.MorphAnalyzer()

    book_scene = {
        "persons": [],
        "person_adjs": [],
        "decoration": [],
        "decoration_adjs": [],
        "do": [],
        "state": []
    }

    for node in graph:
        token_word = tokenizer.get_stat_token_word(morph, node)
        if token_word is None:
            continue
        vec = client.get_vec(token_word)
        if vec is None or len(vec) == 0:  # word missing from the model
            continue

        if token_word.find("NOUN") != -1:  # noun handling
            is_person = person_clf.predict(vec.reshape(1, -1))
            if is_person:
                book_scene["persons"].append({"name": node, "vec": vec})
            else:
                book_scene["decoration"].append({"name": node, "vec": vec})

            # Collect verb/adjective neighbours as modifiers of this noun.
            for edge in graph[node]:
                token_edge = tokenizer.get_stat_token_word(morph, edge)
                # Tag suffix check: "ERB" matches VERB, "ADJ" matches ADJ.
                if token_edge is None or token_edge[-3:] not in ["ERB", "ADJ"]:
                    continue
                edge_vec = client.get_vec(token_edge)
                if edge_vec is None or len(edge_vec) == 0:
                    continue
                if is_person:
                    book_scene["person_adjs"].append({
                        "name": edge,
                        "vec": edge_vec
                    })
                else:
                    book_scene["decoration_adjs"].append({
                        "name": edge,
                        "vec": edge_vec
                    })
        if token_word.find("VERB") != -1:  # verb handling
            is_active = do_clf.predict(vec.reshape(1, -1))
            if is_active:
                book_scene["do"].append({"name": node, "vec": vec})
            else:
                book_scene["state"].append({"name": node, "vec": vec})

    scene = dict()
    for tp in book_scene.keys():
        # Size-weighted mean vector over the category.
        obj_count = 0
        obj_vec = None
        for obj in book_scene[tp]:
            if obj_vec is None:
                obj_vec = obj["vec"] * sizes[obj["name"]]
            else:
                obj_vec = obj["vec"] * sizes[obj["name"]] + obj_vec

            obj_count += sizes[obj["name"]]
        if obj_vec is None or obj_count == 0:  # empty category
            scene[tp + "_vec"] = ""
            scene[tp + "_name"] = ""
            continue
        obj_vec = obj_vec / obj_count
        scene[tp + "_vec"] = obj_vec
        # BUG FIX: ndarray.tostring() was deprecated and removed in
        # NumPy >= 1.23 — tobytes() is the byte-identical replacement.
        scene[tp + "_name"] = json.loads(
            client.get_word_by_vec(obj_vec.tobytes()).decode())["res"][0]

    print("Средний персонаж ", scene["persons_name"])

    print("Средний описание персонаж ", scene["person_adjs_name"])

    print("Средняя декорация", scene["decoration_name"])

    print("Среднее описание декорации", scene["decoration_adjs_name"])

    print("Среднее дело ", scene["do_name"])

    print("Среднее изменение состояния ", scene["state_name"])

    return scene