Example #1
def calc_avg_distance_matrix(graph: gc.Graph, removed_nodes: [int],
                             save_info: sl.MemoryAccess):
    if save_info.has_avg_distance_matrix(removed_nodes=removed_nodes):
        save_info.delete_distance_matrices(removed_nodes=removed_nodes)
        return save_info.load_avg_distance_matrix(removed_nodes)

    used_embeddings = range(save_info.get_num_iterations())

    avg_dm = pd.DataFrame(0.0, index=graph.nodes(), columns=graph.nodes())

    dm_calc_func = functools.partial(__calc_dm, graph, removed_nodes,
                                     save_info)

    for iteration in used_embeddings:
        i, dm = dm_calc_func(iteration)
        utils.assure_same_labels(
            [avg_dm, dm],
            "Format of distance matrix iteration {} for removed nodes {} "
            "is not correct".format(i, removed_nodes))
        avg_dm += dm

    avg_dm = avg_dm.div(len(used_embeddings))
    # save avg distance matrix
    save_info.save_avg_distance_matrix(removed_nodes, avg_dm)
    # delete dms for memory space
    save_info.delete_distance_matrices(removed_nodes=removed_nodes)
    return avg_dm
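
The averaging loop above is plain pandas arithmetic. The following stand-alone sketch uses toy random matrices instead of the per-iteration distance matrices and does not depend on the project's gc/sl modules; it only illustrates the same accumulate-then-divide pattern:

import numpy as np
import pandas as pd

# Toy stand-ins for the per-iteration distance matrices loaded above.
nodes = ["a", "b", "c"]
dms = [pd.DataFrame(np.random.rand(3, 3), index=nodes, columns=nodes)
       for _ in range(4)]

avg_dm = pd.DataFrame(0.0, index=nodes, columns=nodes)
for dm in dms:
    avg_dm += dm                    # element-wise sum; row/column labels must match
avg_dm = avg_dm.div(len(dms))       # element-wise mean over all iterations
print(avg_dm)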
Example #2
def compute_training_features(save_info: sl.MemoryAccess,
                              graph: gc.Graph,
                              list_nodes_to_predict: [int],
                              nodes_to_train_on: {},
                              num_of_bins: int,
                              feature_type: ft.FeatureType = None,
                              num_eval_iterations: int = None):
    """
    :param save_info: memory access obj
    :param graph: graph the embedding is trained on (used to access nodes lists)
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param list_nodes_to_predict: nodes that are used as test cases.
            If None, nodes are determined by available files in the file system
    :param nodes_to_train_on: nodes that are used for training in each test case. Dict from the node_to_predict to [int]
            containing the training nodes for that tested node.
            If None
    """

    print(
        f"Compute training features on diff type {save_info.get_diff_type()} and graph {str(graph)} "
        f"on nodes {list_nodes_to_predict} "
        f"with graph embedding {str(save_info.embedding_type)}")

    if save_info.get_diff_type().has_one_init_graph():
        if num_eval_iterations is None:
            iteration_values = list(range(save_info.get_num_iterations()))
        else:
            iteration_values = list(range(num_eval_iterations))
    else:
        iteration_values = [-1]

    if feature_type is None:
        feature_type = ft.FeatureType.DIFF_BIN_WITH_DIM

    for diff_iter in iteration_values:
        if diff_iter != -1:
            save_info.get_diff_type().set_iter(diff_iter)

        p_nodes = list_nodes_to_predict
        t_nodes = nodes_to_train_on

        if save_info.get_diff_type() in [
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF,
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS,
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT,
                dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE
        ]:
            exp_sim_diff.compute_training_features_from_similarity_diff(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_of_bins,
                feature_type=feature_type,
                p_nodes=p_nodes,
                t_nodes=t_nodes)
        else:

            num_features = len(p_nodes)

            p_nodes, t_nodes = exp_utils.filter_by_already_trained_nodes(
                p_node_list=p_nodes,
                t_node_dict=t_nodes,
                graph=graph,
                save_info=save_info,
                feature_type=feature_type,
                num_bins=num_of_bins)

            if len(p_nodes) > 0:
                # compute distance matrix of the original graph
                if save_info.get_diff_type(
                ) == dt.DiffType.DIFFERENCE_ONE_INIT:
                    emb_number = save_info.get_diff_type().get_iter()
                    if emb_number == -1 or emb_number is None:
                        raise ValueError(
                            f"The selected Difference Type requires an iteration number. "
                            f"E.g. dt.DiffType.DIFFERENCE_ONE_INIT.set_iter(0)."
                        )

                    _, dm_original = __calc_dm(graph=graph,
                                               removed_nodes=[],
                                               save_info=save_info,
                                               i=emb_number)
                elif save_info.get_diff_type() == dt.DiffType.DIFFERENCE:
                    dm_original = calc_avg_distance_matrix(graph=graph,
                                                           removed_nodes=[],
                                                           save_info=save_info)
                else:
                    raise ValueError(
                        f"Invalid Difference Type: {save_info.get_diff_type()}"
                    )

                func_p = functools.partial(
                    __compute_training_features_for_one_node_pool, dm_original,
                    save_info, graph, num_of_bins, feature_type, t_nodes)

                with multiprocessing.Pool(min(config.NUM_CORES,
                                              len(p_nodes))) as pool:
                    for res in pool.imap(func_p, p_nodes):
                        pass
                '''
                for i in p_nodes:
                    func_p(i)
                '''
            else:
                if num_features == 0:
                    raise ValueError(
                        "no embeddings found to create training features for")
                else:
                    print(
                        f"All features are already trained. Number of training features {num_features}"
                    )
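
The helper __calc_dm that produces dm_original is not part of this listing. As a rough illustration only (an assumption, not the project's actual implementation), a node-to-node distance matrix over an embedding is commonly the pairwise Euclidean distance of the embedding vectors:

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

# Hypothetical stand-in for __calc_dm: pairwise Euclidean distances over a toy embedding.
nodes = list(range(5))
emb = np.random.rand(len(nodes), 16)                      # 5 nodes, 16-dimensional embedding
dm = pd.DataFrame(squareform(pdist(emb, metric="euclidean")),
                  index=nodes, columns=nodes)             # symmetric, zero diagonal
print(dm.round(2))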
Example #3
def compute_training_features_from_similarity_diff(
        save_info: sl.MemoryAccess,
        graph: gc.Graph,
        num_of_bins: int,
        feature_type: ft.FeatureType = None,
        p_nodes: [int] = None,
        t_nodes: {} = None):
    """
    Computes training and test features from the difference matrix generated by the similarity-diff matrix
    :param save_info: memory access obj
    :param graph: graph the embedding is trained on (used to access nodes lists)
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param p_nodes: nodes that are used as test cases.
            If None, nodes are determined by available files in the file system
    :param t_nodes: nodes that are used for training in each test case. Dict from the node_to_predict to [int]
            containing the training nodes for that tested node.
            If None
    """
    print('start sequence part')
    start_sequence = time.time()

    num_features = len(p_nodes)

    p_nodes, t_nodes = exp_utils.filter_by_already_trained_nodes(
        p_node_list=p_nodes,
        t_node_dict=t_nodes,
        graph=graph,
        save_info=save_info,
        feature_type=feature_type,
        num_bins=num_of_bins)

    if len(p_nodes) > 0:
        if save_info.get_diff_type().has_iteration():
            assert (save_info.get_diff_type().has_one_init_graph())
            iteration = save_info.get_diff_type().get_iter()
            o_dm_list = dmm.load_dms(removed_nodes=[],
                                     save_info=save_info,
                                     num_iterations=1,
                                     use_specific_iter=iteration)
        else:
            o_dm_list = dmm.load_dms(
                removed_nodes=[],
                save_info=save_info,
                num_iterations=save_info.get_num_iterations())

        func_p = functools.partial(compute_training_features_for_one_node_pool,
                                   save_info, graph, num_of_bins, feature_type,
                                   t_nodes, list(o_dm_list))

        end_sequence = time.time()
        print(f'Sequence duration {end_sequence - start_sequence}')
        start_pool = time.time()
        with multiprocessing.Pool(min(config.NUM_CORES, len(p_nodes))) as pool:
            for res in pool.imap(func_p, p_nodes):
                pass
        end_pool = time.time()
        print(f'Pool duration {end_pool - start_pool}')
        '''
        for i in p_nodes:
            func_p(i)
        '''

    else:
        if num_features == 0:
            print("no embeddings found to create training features for")
        else:
            print(
                f"All features are already trained. Number of training features {num_features}"
            )
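
Both compute_training_features variants fan the per-node work out with functools.partial plus multiprocessing.Pool.imap: every argument except the node to predict is bound up front, and the pool supplies only that last argument. A minimal stand-alone sketch of the pattern (toy worker, not the project's worker function):

import functools
import multiprocessing


def _toy_worker(shared_data, node):
    # Stand-in for compute_training_features_for_one_node_pool; only 'node' varies per task.
    return node, len(shared_data)


if __name__ == "__main__":
    func_p = functools.partial(_toy_worker, ["dm_a", "dm_b"])   # bind everything but the node
    p_nodes = [3, 7, 11]
    with multiprocessing.Pool(min(2, len(p_nodes))) as pool:
        for res in pool.imap(func_p, p_nodes):
            print(res)          # (3, 2), (7, 2), (11, 2), in input order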
Example #4
def compute_training_features_for_one_node_pool(
        save_info: sl.MemoryAccess, graph: gc.Graph, num_of_bins: int,
        feature_type: ft.FeatureType, nodes_to_train_on: {},
        o_dm_list: [pd.DataFrame], node_to_predict: int):
    '''
    Compute features using the most similar embeddings; multiple embeddings are used only for the second graph.
    :param save_info:
    :param graph:
    :param num_of_bins:
    :param feature_type:
    :param nodes_to_train_on:
    :param node_to_predict:
    :return:
    '''

    num_iter = save_info.get_num_iterations()

    quantity_dict = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS:
        [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE:
        [1, num_iter, num_iter]
    }

    quantity = quantity_dict[save_info.get_diff_type()]

    used_emb = save_info.get_diff_type().get_iter()

    # compute attack features
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict],
                                             save_info=save_info,
                                             quantity_first=quantity[0],
                                             quantity_second=quantity[1],
                                             used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff,
                       removed_nodes=[node_to_predict],
                       original_graph=graph,
                       num_of_bins=num_of_bins,
                       feature_type=feature_type,
                       save_info=save_info)

    # compute training features
    if save_info.is_diff_type(
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE):
        # this diff type uses the dm of G' used for diff(G,G') for diff(G',G'')
        o_dm_list_t = [min_r_dm]
        quantity[1] = 1
    else:
        o_dm_list_t = None

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]

        diff, i = dmm.compute_diff_matrix(removed_nodes=removed_nodes,
                                          save_info=save_info,
                                          quantity_first=quantity[1],
                                          quantity_second=quantity[2],
                                          used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff,
                           removed_nodes=removed_nodes,
                           original_graph=g_prime,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)
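
cf.create_features is not included in this listing; judging by the num_of_bins parameter, the difference matrix is presumably turned into a fixed-length feature vector by histogram binning. A sketch of that idea under this assumption (the actual feature construction may differ):

import numpy as np

# Hypothetical binning step: one row of a difference matrix -> num_of_bins features.
diff_row = np.random.randn(100)
num_of_bins = 10
counts, edges = np.histogram(diff_row, bins=num_of_bins)
feature_vector = counts / counts.sum()      # normalized bin counts, length num_of_bins
print(feature_vector)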
Example #5
def test(save_info: sl.MemoryAccess,
         graph: gc.Graph,
         feature_type: ft.FeatureType,
         num_of_bins: int,
         list_nodes_to_predict: List[int],
         nodes_to_train_on: Dict[int, List[int]],
         classifier: List = None,
         sampling_strategy=None,
         save: bool = True,
         limit_num_training_graphs: int = 10,
         check_for_existing: bool = True,
         num_eval_iterations: int = None):
    if save_info.get_diff_type().has_one_init_graph():
        if num_eval_iterations is None:
            diff_iter = range(save_info.get_num_iterations())
        else:
            diff_iter = range(num_eval_iterations)
    else:
        diff_iter = [-1]

    for i in diff_iter:
        if i >= 0:
            save_info.get_diff_type().set_iter(i)

        if classifier is None:
            classifier = [
                KNeighborsClassifier(),
                SVC(kernel="linear", probability=True),
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                AdaBoostClassifier(),
                GaussianNB()
            ]  # , MLPClassifier()]

        target_overall_file_name = get_overall_results_name(
            feature_type=feature_type.to_str(num_of_bins),
            sampling_strategy=sampling_strategy,
            diff_type=save_info.diff_type,
            num_iterations=save_info.num_iterations,
            num_tr_graphs_limit=limit_num_training_graphs)
        # check if embedding is already trained

        if check_for_existing and full_test_results_available(
                target_overall_file_name=target_overall_file_name,
                save_info=save_info,
                classifier=classifier):
            continue
        """
        if list_nodes_to_predict is None:
            raise ValueError("Safety Error: the list of nodes to predict is not given.")
            list_nodes_to_predict = save_info.get_list_of_available_training_data(graph=graph,
                                                                                  feature_type=feature_type,
                                                                                  num_of_bins=num_of_bins)
        """
        assert (len(list_nodes_to_predict) > 0)
        print(f"data is available for nodes: {list_nodes_to_predict}")

        overall_results = pd.DataFrame()

        for c in classifier:
            results_per_node = pd.DataFrame()

            # train_labels = []
            # train_predicted = []
            # train_probabilities = []
            # test_labels = []
            # test_predicted = []
            # test_probabilities = []

            exp_per_node = functools.partial(test_per_node, nodes_to_train_on,
                                             graph, save_info, feature_type,
                                             num_of_bins,
                                             limit_num_training_graphs,
                                             sampling_strategy, c)

            with multiprocessing.Pool(
                    min(config.NUM_CORES, len(list_nodes_to_predict))) as pool:
                for res in pool.imap(exp_per_node, list_nodes_to_predict):
                    results_per_node[res[1]] = res[0]

            if sampling_strategy:
                sampling_str = f"_sampling={sampling_strategy.to_str(num_of_bins)}"
            else:
                sampling_str = ""
            if limit_num_training_graphs is not None:
                str_limit = f"_num_tr_graphs_{limit_num_training_graphs}"
            else:
                str_limit = ""
            if save:
                save_info.save_test_results(
                    results_per_node.T,
                    f"TestResults_ft={feature_type.to_str(num_of_bins)}_Cassifier="
                    + str(save_info.diff_type) + str(c).split("(")[0] +
                    sampling_str +
                    f"_num_iterations_{save_info.num_iterations}" + str_limit)

            results_per_classifier = _create_test_results_over_all_experiments(
                results_per_node)

            overall_results[str(c).split("(")[0]] = pd.Series(
                results_per_classifier.T)

        if save:
            save_info.save_test_results(results=overall_results,
                                        name=target_overall_file_name)

        print(f"Graph {save_info.graph}, "
              f"emb {save_info.embedding_type}, "
              f"dt {save_info.get_diff_type().to_str()}, "
              f"ft {feature_type.to_str(num_of_bins)}, "
              f"limit_tr_graphs {limit_num_training_graphs}")
        print(overall_results)
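
The classifier sweep in test() is standard scikit-learn: fit each classifier on the training features, evaluate, and key the results by the class name taken from str(c). A stand-alone sketch on synthetic data (the real code loads the binned difference features via save_info):

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

X = np.random.rand(200, 10)                 # synthetic stand-in for the binned features
y = np.random.randint(0, 2, size=200)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

for c in [KNeighborsClassifier(), SVC(kernel="linear", probability=True),
          DecisionTreeClassifier(), RandomForestClassifier(),
          AdaBoostClassifier(), GaussianNB()]:
    c.fit(X_tr, y_tr)
    name = str(c).split("(")[0]             # same naming trick used for the result columns above
    print(name, c.score(X_te, y_te))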
Example #6
def train_embedding_per_graph(
        graph: gc.Graph,
        embedding: Embedding,
        save_info: sl.MemoryAccess,
        num_of_embeddings: int = 30,
        num_of_test_evaluations_per_degree_level: int = 5,
        num_of_training_graphs: int = 10,
        num_of_bins_for_tf: [int] = None,
        run_experiments_on_embedding: bool = True,
        feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
    assert (num_of_embeddings == save_info.get_num_iterations())
    if num_of_bins_for_tf is None:
        num_of_bins_for_tf = [10]
    elif isinstance(num_of_bins_for_tf, int):
        num_of_bins_for_tf = [num_of_bins_for_tf]

    embedding.train_embedding(graph=graph,
                              save_info=save_info,
                              removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    first_started_embedding = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)

    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph,
        quantity=num_of_test_evaluations_per_degree_level,
        init_range=2,
        pref_list=first_started_embedding)
    print(f"\nTrain Embeddings for nodes {tested_nodes}")
    nodes_for_training_embedding = {}

    for index, first_node in enumerate(tested_nodes):
        # print(f"Start training embedding for {index}({first_node}). node.")
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one,
                                  save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if num_of_training_graphs:

            second_completed_diffs = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=False)

            second_started_embedding = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=True)

            second_tested_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one,
                pref_list=second_completed_diffs,
                secondary_pref_list=second_started_embedding,
                all_list=graph_removed_one.nodes(),
                quantity=num_of_training_graphs)
        else:
            second_tested_nodes = graph_removed_one.nodes()

        nodes_for_training_embedding[first_node] = second_tested_nodes

        # print(f"\nTrain embeddings for removed node {first_node} and {second_tested_nodes}")
        for index2, second_node in enumerate(second_tested_nodes):
            # print(f"Start train embedding {index2}({second_node}) for for {index}({first_node}). node.")
            graph_removed_two = graph_removed_one.delete_node(second_node)
            embedding.train_embedding(graph=graph_removed_two,
                                      save_info=save_info,
                                      removed_nodes=[first_node, second_node],
                                      num_of_embeddings=num_of_embeddings)

    # create features
    if run_experiments_on_embedding:

        for num_bins in num_of_bins_for_tf:
            # try:
            cf.compute_training_features(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_bins,
                list_nodes_to_predict=tested_nodes,
                nodes_to_train_on=nodes_for_training_embedding,
                feature_type=feature_type)
            te.test(save_info=save_info,
                    graph=graph,
                    feature_type=feature_type,
                    num_of_bins=num_bins,
                    limit_num_training_graphs=num_of_training_graphs,
                    list_nodes_to_predict=tested_nodes,
                    nodes_to_train_on=nodes_for_training_embedding)
            # except Exception as e:
            #  print(f"Failed to compute Training Features or Test. "
            #          f"graph {str(graph)}, "
            #          f"emb {str(embedding)}, "
            #          f"num_bins {num_bins}")
            #   traceback.print_exc()

    return tested_nodes, nodes_for_training_embedding
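
train_embedding_per_graph removes one node per test case and then a second node per training case, always working on copies of the graph. The project's gc.Graph.delete_node appears to return a new graph with the node removed; under that assumption, the nesting can be illustrated with plain networkx:

import networkx as nx

g = nx.karate_club_graph()
first_node = 0
g_removed_one = g.copy()
g_removed_one.remove_node(first_node)       # graph for one test case (node_to_predict removed)
second_node = 1
g_removed_two = g_removed_one.copy()
g_removed_two.remove_node(second_node)      # graph for one training case of that test case
print(g.number_of_nodes(), g_removed_one.number_of_nodes(), g_removed_two.number_of_nodes())
# -> 34 33 32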