Example 1
def calc_avg_distance_matrix(graph: gc.Graph, removed_nodes: List[int],
                             save_info: sl.MemoryAccess):
    if save_info.has_avg_distance_matrix(removed_nodes=removed_nodes):
        # the average is already cached: free the per-iteration matrices
        # and return the cached result
        save_info.delete_distance_matrices(removed_nodes=removed_nodes)
        return save_info.load_avg_distance_matrix(removed_nodes)

    used_embeddings = range(save_info.get_num_iterations())

    avg_dm = pd.DataFrame(0.0, index=graph.nodes(), columns=graph.nodes())

    dm_calc_func = functools.partial(__calc_dm, graph, removed_nodes,
                                     save_info)

    for iteration in used_embeddings:
        i, dm = dm_calc_func(iteration)
        utils.assure_same_labels(
            [avg_dm, dm],
            "Format of distance matrix iteration {} for removed nodes {} "
            "is not correct".format(i, removed_nodes))
        avg_dm += dm

    avg_dm = avg_dm.div(len(used_embeddings))
    # save avg distance matrix
    save_info.save_avg_distance_matrix(removed_nodes, avg_dm)
    # delete dms for memory space
    save_info.delete_distance_matrices(removed_nodes=removed_nodes)
    return avg_dm
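
A standalone sketch of the averaging step above, using two made-up 3x3 distance matrices in place of save_info and __calc_dm:

# Sketch of the averaging pattern: accumulate per-iteration matrices into
# one frame, then divide by the iteration count (matrices here are made up).
import pandas as pd

nodes = [0, 1, 2]
per_iteration_dms = [
    pd.DataFrame([[0, 1, 2], [1, 0, 1], [2, 1, 0]], index=nodes, columns=nodes),
    pd.DataFrame([[0, 3, 2], [3, 0, 1], [2, 1, 0]], index=nodes, columns=nodes),
]

avg_dm = pd.DataFrame(0.0, index=nodes, columns=nodes)
for dm in per_iteration_dms:
    # labels must line up, otherwise += would silently introduce NaNs
    assert dm.index.equals(avg_dm.index) and dm.columns.equals(avg_dm.columns)
    avg_dm += dm

avg_dm = avg_dm.div(len(per_iteration_dms))
print(avg_dm)  # element-wise mean of the two matrices
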
    def train_embedding(self, graph: gc.Graph, save_info: sl.MemoryAccess,
                        removed_nodes: List[int], num_of_embeddings: int):
        super().train_embedding(graph=graph,
                                save_info=save_info,
                                removed_nodes=removed_nodes,
                                num_of_embeddings=num_of_embeddings)

        nx_g = graph.to_networkx()
        # to_directed() returns a copy rather than modifying in place
        nx_g = nx_g.to_directed()

        np.testing.assert_array_equal(nx_g.nodes(), graph.nodes())
        nx_g = nx.convert_node_labels_to_integers(nx_g)

        for iteration in range(num_of_embeddings):
            if save_info.has_embedding(removed_nodes=removed_nodes,
                                       iteration=iteration):
                continue

            Y, t = self.__gem_embedding.learn_embedding(graph=nx_g,
                                                        is_weighted=False,
                                                        no_python=True)

            emb = pd.DataFrame(Y, index=graph.nodes())

            save_info.save_embedding(removed_nodes=removed_nodes,
                                     iteration=iteration,
                                     embedding=emb)
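
The skip-if-already-trained pattern in the loop above can be reproduced with a plain file check; a minimal sketch, with a hypothetical output directory standing in for save_info.has_embedding:

# Minimal sketch of the caching pattern: an existing file on disk plays the
# role of save_info.has_embedding; paths and shapes are hypothetical.
import os
import numpy as np

def train_iterations(num_of_embeddings: int, out_dir: str = "embeddings"):
    os.makedirs(out_dir, exist_ok=True)
    for iteration in range(num_of_embeddings):
        path = os.path.join(out_dir, f"iter_{iteration}.npy")
        if os.path.exists(path):  # analogous to save_info.has_embedding(...)
            continue
        emb = np.random.rand(4, 2)  # stand-in for learn_embedding(...)
        np.save(path, emb)

train_iterations(3)
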
def test_per_node(nodes_to_train_on: List[int], graph: gc.Graph,
                  save_info: sl.MemoryAccess, feature_type: ft.FeatureType,
                  num_of_bins: int, limit_num_training_graphs: Optional[int],
                  sampling_strategy: Optional, c, removed_node: int):
    if nodes_to_train_on is not None:
        tr_node_list = nodes_to_train_on[removed_node]
    else:
        raise ValueError(
            "Training node list is not given, but it is required")
    train_data = save_info.load_list_of_training_data(
        removed_node=removed_node,
        graph=graph.delete_node(removed_node),
        feature_type=feature_type,
        num_of_bins=num_of_bins,
        limit_num=limit_num_training_graphs,
        tr_node_list=tr_node_list)

    utils.assert_df_no_nan(
        train_data, text=f'Training data for removed node {removed_node}')

    test_data = save_info.load_test_data(removed_node=removed_node,
                                         feature_type=feature_type,
                                         num_of_bins=num_of_bins)
    utils.assert_df_no_nan(test_data,
                           text=f'Test data for removed node {removed_node}')

    tr_labels, tr_predicted, tr_probabilities, te_labels, te_predicted, te_probabilities = \
        _train(c, train_data=train_data, test_data=test_data, sampling_strategy=sampling_strategy)

    # train_results, test_results = evaluate(tr_labels, tr_predicted, te_labels, te_predicted, te_probabilities)
    train_results = evaluate(tr_labels, tr_predicted, tr_probabilities)
    test_results = evaluate(te_labels, te_predicted, te_probabilities)

    # add some additional information
    test_results["degree"] = graph.degree(removed_node)

    test_results["avg_neighbour_degree"] = graph.average_neighbour_degree(
        removed_node)

    test_results["avg dist to pos pred"] = \
        calculate_avg_distance_to_positive_predicted_nodes(graph=graph, removed_node=removed_node,
                                                           labels=test_data.index.values,
                                                           predicted=te_predicted)

    test_results["num training features"] = len(train_data)
    test_results["num test features"] = len(test_data)

    test_results["train false negative"] = train_results["false negative"]
    test_results["train true positive"] = train_results["true positive"]
    test_results["train accuracy"] = train_results["accuracy"]
    test_results["train precision"] = train_results["precision"]
    test_results["train recall"] = train_results["recall"]
    test_results["train auc"] = train_results["auc"]

    return pd.Series(test_results), removed_node
def create_node_raking_from_diff_matrix(diff: pd.DataFrame,
                                        removed_nodes: List[int],
                                        graph: gc.Graph,
                                        save_info: sl.MemoryAccess,
                                        save: bool = True) -> tuple:
    utils.assure_same_labels([diff])

    labels = diff.index.values.tolist()
    dim = len(labels)

    # init sums
    node_pos_sums = dict.fromkeys(labels, 0)
    node_neg_sums = dict.fromkeys(labels, 0)

    # sum values up
    for i in range(dim):
        for j in range(i):
            label1 = labels[i]
            label2 = labels[j]

            value = diff.at[label1, label2]
            if value > 0:
                node_pos_sums[label1] += value
                node_pos_sums[label2] += value
            else:
                node_neg_sums[label1] += value
                node_neg_sums[label2] += value

    pos_list = [(label, node_pos_sums[label]) for label in labels]
    neg_list = [(label, node_neg_sums[label]) for label in labels]

    complete_list = [(label, node_pos_sums[label] - node_neg_sums[label])
                     for label in labels]

    pos_list.sort(key=lambda x: -x[1])
    neg_list.sort(key=lambda x: x[1])
    complete_list.sort(key=lambda x: -x[1])

    if save:
        save_info.save_node_raking(removed_nodes, pos_list,
                                   list(graph.neighbours(removed_nodes[-1])))

    neighbours = list(graph.neighbours(removed_nodes[0]))
    pos_list_labels = list(map(lambda x: x[0] in neighbours, pos_list))

    neg_list_labels = list(map(lambda x: x[0] in neighbours, neg_list))
    complete_list_labels = list(
        map(lambda x: x[0] in neighbours, complete_list))

    return pos_list, pos_list_labels, neg_list, neg_list_labels, complete_list, complete_list_labels
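
A tiny standalone demo of the ranking logic above: positive entries of a symmetric difference matrix are summed per node over the lower triangle (so each pair is counted once), and nodes are ranked by that sum. The matrix values are made up.

# Standalone demo of the per-node positive/negative sums and the ranking.
import pandas as pd

labels = ["a", "b", "c"]
diff = pd.DataFrame([[0.0, 2.0, -1.0],
                     [2.0, 0.0, 0.5],
                     [-1.0, 0.5, 0.0]], index=labels, columns=labels)

node_pos_sums = dict.fromkeys(labels, 0.0)
node_neg_sums = dict.fromkeys(labels, 0.0)
for i in range(len(labels)):
    for j in range(i):  # lower triangle only, each pair counted once
        value = diff.at[labels[i], labels[j]]
        if value > 0:
            node_pos_sums[labels[i]] += value
            node_pos_sums[labels[j]] += value
        else:
            node_neg_sums[labels[i]] += value
            node_neg_sums[labels[j]] += value

pos_list = sorted(node_pos_sums.items(), key=lambda x: -x[1])
print(pos_list)  # [('b', 2.5), ('a', 2.0), ('c', 0.5)]
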
def print_results(row_labels, prediction, labels, graph: gc.Graph):
    # print the connected (label 1) and not connected (label 0) nodes
    for header, target in [("connected", 1), ("not connected", 0)]:
        print(header)
        for i in range(len(labels)):
            if labels[i] == target:
                print(row_labels[i], "Predicted:", prediction[i], "actual:",
                      labels[i], "correct:", prediction[i] == labels[i],
                      "Degree of node:", graph.degree(row_labels[i]))
Example 6
def __get_available_sample(graph: gc.Graph, degrees: List[int], center,
                           init_range: int, quantity, available_list: List[int],
                           neg_list: List[int]) -> List[int]:
    assert (set(available_list).issubset(set(graph.nodes())))

    degrees = np.array(degrees)
    candidates = utils.__get_candidates_with_offset(degrees=degrees,
                                                    graph=graph,
                                                    candidate_degree=center,
                                                    neg_list=neg_list)
    offset = 1
    while (offset < init_range) or (len(candidates) < quantity):
        new_candidates = utils.__get_candidates_with_offset(
            degrees=degrees,
            graph=graph,
            candidate_degree=center + offset,
            neg_list=neg_list)
        new_candidates += utils.__get_candidates_with_offset(
            degrees=degrees,
            graph=graph,
            candidate_degree=center - offset,
            neg_list=neg_list)
        candidates += new_candidates
        offset += 1

    # prioritise candidates from available_list
    pref_candidates = list(set(candidates).intersection(set(available_list)))
    if len(pref_candidates) < quantity:
        raise ValueError(
            f"Not enough nodes available for sampling around degree {center}. "
            f"Graph {str(graph)}")

    return pref_candidates[:quantity]
Example 7
    def load_embedding(self,
                       graph: Graph,
                       removed_nodes: List[int],
                       save_info: sl.MemoryAccess,
                       iteration: int,
                       load_neg_results: bool = False):
        target = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                              iteration=iteration)
        target_name = os.path.abspath(target + ".emb")
        target_name_neg = os.path.abspath(target + "_neg.emb")
        if load_neg_results:
            return (load_results(target_name=target_name,
                                 node_names=graph.nodes()),
                    load_results(target_name=target_name_neg,
                                 node_names=graph.nodes()))
        else:
            return load_results(target_name=target_name,
                                node_names=graph.nodes())
Example 8
    def load_list_of_training_data(self,
                                   removed_node: int,
                                   feature_type: ft.FeatureType,
                                   num_of_bins: int,
                                   graph: gc.Graph,
                                   tr_node_list: List[int] = None,
                                   all_data_available: bool = False,
                                   limit_num: int = None) -> pd.DataFrame:
        training_data = pd.DataFrame()

        if tr_node_list is not None:
            available_graph_data = tr_node_list
            if limit_num is not None and len(
                    available_graph_data) != limit_num:
                raise ValueError(
                    f"The given training data does not match the required "
                    f"number of training graphs. \n "
                    f"Given tr nodes {available_graph_data}, "
                    f"should be {limit_num} but are {len(available_graph_data)}"
                )
        elif all_data_available:
            available_graph_data = graph.nodes()
        else:
            available_graph_data = self.get_list_of_available_training_data(
                feature_type=feature_type,
                num_of_bins=num_of_bins,
                graph=graph,
                removed_first_node=removed_node)
            if limit_num is not None:
                if len(available_graph_data) < limit_num:
                    raise ValueError(
                        f"number of available graph data is smaller than the limit. \n "
                        f"Num available graphs {len(available_graph_data)}, "
                        f"limit_num {limit_num}")
                available_graph_data = np.random.choice(available_graph_data,
                                                        limit_num,
                                                        replace=False)

        for other_node in available_graph_data:
            if other_node != removed_node:
                data = self.load_training_data(
                    removed_nodes=[removed_node, other_node],
                    feature_type=feature_type,
                    num_of_bins=num_of_bins)
                utils.assert_df_no_nan(
                    data, text=f"removed nodes [{removed_node}, {other_node}]")

                # DataFrame.append was removed in pandas 2.0; use concat
                training_data = pd.concat([training_data, data])
                utils.assert_df_no_nan(
                    training_data,
                    text=
                    f"aggregated training data after appending removed nodes"
                    f" [{removed_node}, {other_node}]")

        utils.assert_df_no_nan(
            training_data,
            text=f"aggregated training data for removed node {removed_node}")
        return training_data
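
The aggregation loop above in isolation: one training frame is built by stacking per-node frames (pd.concat replaces the removed DataFrame.append). The frames here are made up.

# Standalone sketch of stacking per-node feature frames into one.
import pandas as pd

frames = [pd.DataFrame({"f0": [0.1, 0.2]}, index=["a", "b"]),
          pd.DataFrame({"f0": [0.3]}, index=["c"])]

training_data = pd.DataFrame()
for data in frames:
    training_data = pd.concat([training_data, data])
print(training_data)  # rows a, b, c stacked in order
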
Example 9
    def access_vocab(self,
                     graph: gc.Graph,
                     removed_nodes: List[int] = None,
                     graph_description: str = None):
        if removed_nodes is None:
            removed_nodes = []

        assert (all(node not in graph.nodes() for node in removed_nodes))
        file_name = self.__get_graph_name(removed_nodes,
                                          graph_description) + ".vocab"

        if not os.path.exists(file_name):
            # create edge list file
            nodes = "\n".join(map(lambda node: str(node) + " 0",
                                  graph.nodes()))
            with open(file_name, "w+") as file:
                file.write(nodes)

        return file_name
def test_all_sampling_strats(save_info: sl.MemoryAccess, graph: gc.Graph,
                             feature_type: ft.FeatureType, num_of_bins: int):
    # test(save_info=save_info, graph=graph, feature_type=feature_type, num_of_bins=num_of_bins)
    for strat in SamplingStrategy:
        test(save_info=save_info,
             graph=graph,
             feature_type=feature_type,
             num_of_bins=num_of_bins,
             list_nodes_to_predict=graph.nodes(),
             sampling_strategy=strat)
def compute_degrees(graph: gc.Graph, labels: List[int]):
    degrees = pd.DataFrame(0, labels, ["degree"])
    for label in labels:
        degrees.loc[label] = graph.degree(label)

    # normalise degrees
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(degrees.astype(float))

    return pd.DataFrame(x_scaled, index=labels, columns=["degree"])
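
The normalisation step above in isolation: MinMaxScaler maps the degree column linearly onto [0, 1]. A minimal sketch with made-up degrees:

# Standalone illustration of the min-max degree normalisation.
import pandas as pd
from sklearn import preprocessing

degrees = pd.DataFrame({"degree": [1, 4, 7]}, index=[10, 11, 12])
scaled = preprocessing.MinMaxScaler().fit_transform(degrees.astype(float))
print(pd.DataFrame(scaled, index=degrees.index, columns=["degree"]))
# degrees 1, 4, 7 become 0.0, 0.5, 1.0
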
def calculate_avg_distance_to_positive_predicted_nodes(graph: gc.Graph,
                                                       removed_node: int,
                                                       labels: List[int],
                                                       predicted: List[int]):
    pos_labels = labels[predicted == 1]
    if len(pos_labels) > 0:
        return float(
            sum(map(lambda x: graph.distance(removed_node, x),
                    pos_labels))) / len(pos_labels)
    else:
        print(f"no node was predicted to be connected to {removed_node}")
        return 0
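
The same computation on a plain networkx graph, as a sketch; shortest-path length stands in for graph.distance, and the labels/predictions are made up:

# Average shortest-path distance from the removed node to the nodes
# predicted as its neighbours (networkx stands in for gc.Graph).
import networkx as nx
import numpy as np

g = nx.path_graph(5)          # nodes 0-1-2-3-4
removed_node = 0
labels = np.array([1, 2, 3, 4])
predicted = np.array([1, 0, 1, 0])

pos_labels = labels[predicted == 1]  # nodes predicted as neighbours: 1 and 3
avg = sum(nx.shortest_path_length(g, removed_node, n)
          for n in pos_labels) / len(pos_labels)
print(avg)  # (1 + 3) / 2 = 2.0
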
Example 13
    def get_list_of_available_training_data(self,
                                            feature_type: ft.FeatureType,
                                            num_of_bins: int,
                                            graph: gc.Graph,
                                            removed_first_node: int = None):
        files = []

        if removed_first_node is not None:
            removed_nodes = [removed_first_node]
        else:
            removed_nodes = []

        for node in graph.nodes():
            if self.has_training_data(removed_nodes=removed_nodes + [node],
                                      feature_type=feature_type,
                                      num_of_bins=num_of_bins):
                files.append(node)

        assert (all([node in graph.nodes() for node in files]))

        return files
def __create_degree_column_for_feature(graph: gc.Graph, row_labels: List[int]):
    degrees = pd.DataFrame(0, row_labels, ["degree"])
    for label in row_labels:
        degrees.loc[label] = graph.degree(label)

    # normalise degrees
    # min_max_scaler = preprocessing.MinMaxScaler()
    # x_scaled = min_max_scaler.fit_transform(degrees)

    degrees = degrees / degrees["degree"].max()

    return pd.DataFrame(degrees, index=row_labels, columns=["degree"])
Example 15
    def access_line_edge_list(self,
                              graph: gc.Graph,
                              removed_nodes: List[int] = None):
        if removed_nodes is None:
            removed_nodes = []

        assert (graph.name() == self.graph)

        file_name = self.__get_graph_name(
            removed_nodes) + ".directedweihtededgelist"

        if not os.path.exists(file_name):
            # create edge list
            edges = "\n".join(
                list(
                    map(
                        lambda edge:
                        f"{str(edge[0])} {str(edge[1])} 1\n {str(edge[1])} {str(edge[0])} 1",
                        graph.edges())))
            with open(file_name, "w+") as file:
                file.write(edges)
        return file_name
Example 16
    def get_list_of_available_difference_matrices(
            self, graph: gc.Graph, removed_first_node: int = None):
        files = []

        if removed_first_node is not None:
            removed_nodes = [removed_first_node]
        else:
            removed_nodes = []

        for node in graph.nodes():
            if self.has_diff_matrix(removed_nodes=removed_nodes + [node]):
                files.append(node)

        return files
Example 17
def sample_low_avg_high_degree_nodes(graph: gc.Graph, quantity: int, init_range: int = 2, pref_list=None):
    if pref_list is None:
        pref_list = []
    degrees = graph.all_degrees()

    min_val: int = min(degrees)
    max_val: int = max(degrees)
    avg_val: int = int(round(((max_val - min_val) / 2) + min_val))  # int(round(np.array(degrees).mean()))

    max_sample = __get_sample(graph=graph, degrees=degrees, center=max_val, init_range=init_range, quantity=quantity,
                              pref_list=pref_list, neg_list=[])
    min_sample = __get_sample(graph=graph, degrees=degrees, center=min_val, init_range=init_range, quantity=quantity,
                              pref_list=pref_list, neg_list=list(max_sample))
    avg_sample = __get_sample(graph=graph, degrees=degrees, center=avg_val, init_range=init_range, quantity=quantity,
                              pref_list=pref_list, neg_list=list(max_sample) + list(min_sample))

    # print(f"samles: \n    max {max_sample}\n    min: {min_sample}\n    avg: {avg_sample}")
    samples = np.concatenate((max_sample, avg_sample, min_sample))

    assert (len(set(samples)) == len(samples))

    return samples
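
A quick check of how the three sampling centres are derived from the degree sequence; note that the "avg" centre is the midpoint of the degree range, not the arithmetic mean (the commented-out alternative in the code). The degree sequence is made up:

# The three centres: minimum, midpoint of the range, and maximum degree.
degrees = [1, 1, 2, 3, 10]
min_val, max_val = min(degrees), max(degrees)
avg_val = int(round(((max_val - min_val) / 2) + min_val))
print(min_val, avg_val, max_val)  # 1 6 10 (the mean would be 3.4)
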
Example 18
def __get_sample(graph: gc.Graph, degrees: List[int], center, init_range: int, quantity, pref_list: List[int],
                 neg_list: List[int]) -> np.ndarray:
    assert (set(pref_list).issubset(set(graph.nodes())))

    degrees = np.array(degrees)
    candidates = __get_candidates_with_offset(degrees=degrees, graph=graph, candidate_degree=center, neg_list=neg_list)
    offset = 1
    while (offset < init_range) or (len(candidates) < quantity):
        new_candidates = __get_candidates_with_offset(degrees=degrees, graph=graph, candidate_degree=center + offset,
                                                      neg_list=neg_list)
        new_candidates += __get_candidates_with_offset(degrees=degrees, graph=graph, candidate_degree=center - offset,
                                                       neg_list=neg_list)
        candidates += new_candidates
        offset += 1

    # prioritise candidates from pref_list
    pref_candidates = list(set(candidates).intersection(set(pref_list)))
    return sample_randomly_with_preferred_list(pref_list=pref_candidates, all_list=candidates, quantity=quantity)
Example 19
def get_min_avg_max_sample_from_available_list(
    graph: gc.Graph,
    quantity: int,
    available_list: List[int],
    init_range: int = 2,
):
    degrees = graph.all_degrees()

    min_val: int = min(degrees)
    max_val: int = max(degrees)
    avg_val: int = int(round(((max_val - min_val) / 2) +
                             min_val))  # int(round(np.array(degrees).mean()))

    max_sample = __get_available_sample(graph=graph,
                                        degrees=degrees,
                                        center=max_val,
                                        init_range=init_range,
                                        quantity=quantity,
                                        available_list=available_list,
                                        neg_list=[])
    min_sample = __get_available_sample(graph=graph,
                                        degrees=degrees,
                                        center=min_val,
                                        init_range=init_range,
                                        quantity=quantity,
                                        available_list=available_list,
                                        neg_list=list(max_sample))
    avg_sample = __get_available_sample(graph=graph,
                                        degrees=degrees,
                                        center=avg_val,
                                        init_range=init_range,
                                        quantity=quantity,
                                        available_list=available_list,
                                        neg_list=list(max_sample) +
                                        list(min_sample))

    # print(f"samles: \n    max {max_sample}\n    min: {min_sample}\n    avg: {avg_sample}")
    samples = np.concatenate((max_sample, avg_sample, min_sample))

    assert (len(set(samples)) == len(samples))

    return samples
def create_target_vector(row_labels: list, graph: gc.Graph,
                         node_to_predict: int) -> pd.DataFrame:
    """
    Creates the target vector for the classifier.
    :param row_labels: labels for which the target vector should be created
    :param graph: graph including the removed node
    :param node_to_predict: the node that is removed in the second embedding
    :return: boolean DataFrame with a single column "y"
    """

    neighbours_of_removed_node = graph.neighbours(node_to_predict)

    target = pd.DataFrame(False, row_labels, ["y"])

    for neighbour in neighbours_of_removed_node:

        # guard against the case where the original graph is used while two
        # nodes are removed in the labels and those two nodes are connected
        if neighbour in row_labels:
            target.loc[neighbour] = True

    return target
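
A standalone sketch of the target vector: a boolean column that is True exactly for the removed node's neighbours. networkx stands in for gc.Graph and the labels are made up:

# Build the "y" target column for a tiny path graph.
import networkx as nx
import pandas as pd

g = nx.path_graph(4)               # edges 0-1, 1-2, 2-3
node_to_predict = 1
row_labels = [0, 2, 3]             # remaining nodes

target = pd.DataFrame(False, row_labels, ["y"])
for neighbour in g.neighbors(node_to_predict):
    if neighbour in row_labels:
        target.loc[neighbour] = True
print(target)  # True for 0 and 2, False for 3
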
Example 21
    def get_list_of_available_embeddings(self,
                                         graph: gc.Graph,
                                         removed_first_node: int = None,
                                         emb_description: str = None,
                                         find_started_trainings: bool = False):
        files = []
        if find_started_trainings:
            iteration = 0
        else:
            iteration = self.num_iterations - 1

        if removed_first_node is not None:
            removed_nodes = [removed_first_node]
        else:
            removed_nodes = []

        for node in graph.nodes():
            if self.has_embedding(removed_nodes=removed_nodes + [node],
                                  iteration=iteration,
                                  emb_description=emb_description):
                files.append(node)

        return files
Example 22
def get_available_graph_data(graph: gc.Graph, save_info: sl.MemoryAccess,
                             num_of_training_graphs: int):
    complete_data = {}

    te_nodes = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=False)

    for te_node in te_nodes:
        graph_removed_one = graph.delete_node(te_node)

        second_completed_embeddings = save_info.get_list_of_available_embeddings(
            graph=graph_removed_one,
            removed_first_node=te_node,
            find_started_trainings=False)
        second_completed_embeddings = filter_by_splitting_nodes(
            tr_nodes=second_completed_embeddings,
            graph_rem_one=graph_removed_one)

        if len(second_completed_embeddings) >= num_of_training_graphs:
            complete_data[
                te_node] = second_completed_embeddings[:num_of_training_graphs]
            # np.random.choice(a=second_completed_embeddings, size=num_of_training_graphs,replace=False)

    return complete_data
Example 23
def get_sample_with_degree(graph: gc.Graph, node_list: [int], degree: int, quantity: int):
    degrees = np.array([graph.degree(n) for n in node_list])
    samples_to_find = quantity

    candidates = np.where(degrees == degree)[0]
    if len(candidates) > samples_to_find:
        # np.random.choice returns a new array; the result must be assigned
        candidates = np.random.choice(candidates, size=samples_to_find, replace=False)
    elif len(candidates) < samples_to_find:
        raise ValueError(f'Not enough training samples: required {quantity}, got {len(candidates)}')
    sample = candidates
    samples_to_find -= len(candidates)

    offset = 1
    while samples_to_find > 0:
        # print(f"Sampling range {offset}")
        candidates = np.concatenate([np.where(degrees == (degree + offset))[0],
                                     np.where(degrees == (degree - offset))[0]])
        if len(candidates) > samples_to_find:
            candidates = np.random.choice(candidates, size=samples_to_find, replace=False)
        sample = np.concatenate([sample, candidates])
        samples_to_find -= len(candidates)
        offset += 1

    return [node_list[s] for s in sample]
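
A synthetic run of the widening degree search above: exact matches are taken first, then degree +/- 1, +/- 2, and so on until enough indices are collected. For determinism this sketch truncates the candidates instead of sampling them randomly; the degree array is made up:

# Deterministic illustration of the offset-expanding candidate search.
import numpy as np

degrees = np.array([2, 3, 3, 4, 5])
degree, samples_to_find = 3, 4

sample = np.where(degrees == degree)[0]          # indices 1, 2
samples_to_find -= len(sample)
offset = 1
while samples_to_find > 0:
    candidates = np.concatenate([np.where(degrees == degree + offset)[0],
                                 np.where(degrees == degree - offset)[0]])
    candidates = candidates[:samples_to_find]    # truncate instead of random choice
    sample = np.concatenate([sample, candidates])
    samples_to_find -= len(candidates)
    offset += 1

print(sample)  # [1 2 3 0]: nodes with degree 3 first, then degree 4 and 2
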
Example 24
def __filter_splitting_nodes(node_list: List[int], graph: gc.Graph):
    return list(filter(lambda x: not graph.splits_graph(x), node_list))
Example 25
def filter_by_splitting_nodes(tr_nodes: [], graph_rem_one: gc.Graph):
    return list(
        filter(lambda node: not graph_rem_one.splits_graph(node), tr_nodes))
Example 26
def __get_graph_degree_properties(graph: gc.Graph):
    degrees = graph.all_degrees()
    min_val = min(degrees)
    max_val = max(degrees)
    avg = np.mean(degrees)
    return min_val, max_val, avg
Example 27
def __compute_training_features_for_one_node(dm_original: pd.DataFrame,
                                             node_to_predict: int,
                                             save_info: sl.MemoryAccess,
                                             graph: gc.Graph, num_of_bins: int,
                                             feature_type: ft.FeatureType,
                                             nodes_to_train_on: List[int]) -> None:
    """
    :param dm_original: distance matrix of the original graph
    :param node_to_predict: node that is removed from the graph and should be predicted
    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins that should be used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: a list of nodes that are removed from the graph after removing
            node_to_predict to generate training data
    """

    # --- compute test features for node_to_predict ---
    # remove node_to_predict from the graph
    graph_reduced = graph.delete_node(node_to_predict)
    dm_reduced = calc_avg_distance_matrix(graph=graph_reduced,
                                          removed_nodes=[node_to_predict],
                                          save_info=save_info)

    # skip if the training data is already available
    if not save_info.has_training_data([node_to_predict],
                                       feature_type=feature_type,
                                       num_of_bins=num_of_bins):
        # print(f"Compute test features for node {node_to_predict}")
        diff = cf.create_difference_matrix(dm_original,
                                           dm_reduced,
                                           removed_nodes=[node_to_predict],
                                           save_info=save_info)

        # compute training data
        cf.create_features(diff=diff,
                           removed_nodes=[node_to_predict],
                           original_graph=graph,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)

        del diff  # free RAM
        # save_info.remove_diff_matrix(removed_nodes=[node_to_predict])  # free memory

    # --- compute training features for nodes_to_train_on ---
    # print(f"Create training features for removed node {node_to_predict} by by removing ", nodes_to_train_on)
    for node in nodes_to_train_on:

        # skip if the features already exist
        if save_info.has_training_data(removed_nodes=[node_to_predict, node],
                                       feature_type=feature_type,
                                       num_of_bins=num_of_bins):
            continue

        graph_reduced_2 = graph_reduced.delete_node(node)
        dm_reduced_2 = calc_avg_distance_matrix(
            graph=graph_reduced_2,
            removed_nodes=[node_to_predict, node],
            save_info=save_info)
        diff_reduced = cf.create_difference_matrix(
            dm_reduced,
            dm_reduced_2,
            removed_nodes=[node_to_predict, node],
            save_info=save_info)
        del dm_reduced_2  # free RAM

        # compute training data
        cf.create_features(diff=diff_reduced,
                           removed_nodes=[node_to_predict, node],
                           original_graph=graph_reduced,
                           num_of_bins=num_of_bins,
                           save_info=save_info,
                           feature_type=feature_type)
Example 28
def train_embedding_per_graph(
        graph: gc.Graph,
        embedding: Embedding,
        save_info: sl.MemoryAccess,
        num_of_embeddings: int = 30,
        num_of_test_evaluations_per_degree_level: int = 5,
        num_of_training_graphs: int = 10,
        num_of_bins_for_tf: List[int] = None,
        run_experiments_on_embedding: bool = True,
        feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
    assert (num_of_embeddings == save_info.get_num_iterations())
    if num_of_bins_for_tf is None:
        num_of_bins_for_tf = [10]
    elif isinstance(num_of_bins_for_tf, int):
        num_of_bins_for_tf = [num_of_bins_for_tf]

    embedding.train_embedding(graph=graph,
                              save_info=save_info,
                              removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    first_started_embedding = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)

    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph,
        quantity=num_of_test_evaluations_per_degree_level,
        init_range=2,
        pref_list=first_started_embedding)
    print(f"\nTrain Embeddings for nodes {tested_nodes}")
    nodes_for_training_embedding = {}

    for index, first_node in enumerate(tested_nodes):
        # print(f"Start training embedding for {index}({first_node}). node.")
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one,
                                  save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if num_of_training_graphs:

            second_completed_diffs = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=False)

            second_started_embedding = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=True)

            second_tested_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one,
                pref_list=second_completed_diffs,
                secondary_pref_list=second_started_embedding,
                all_list=graph_removed_one.nodes(),
                quantity=num_of_training_graphs)
        else:
            second_tested_nodes = graph_removed_one.nodes()

        nodes_for_training_embedding[first_node] = second_tested_nodes

        # print(f"\nTrain embeddings for removed node {first_node} and {second_tested_nodes}")
        for index2, second_node in enumerate(second_tested_nodes):
            # print(f"Start train embedding {index2}({second_node}) for for {index}({first_node}). node.")
            graph_removed_two = graph_removed_one.delete_node(second_node)
            embedding.train_embedding(graph=graph_removed_two,
                                      save_info=save_info,
                                      removed_nodes=[first_node, second_node],
                                      num_of_embeddings=num_of_embeddings)

    # create features
    if run_experiments_on_embedding:

        for num_bins in num_of_bins_for_tf:
            # try:
            cf.compute_training_features(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_bins,
                list_nodes_to_predict=tested_nodes,
                nodes_to_train_on=nodes_for_training_embedding,
                feature_type=feature_type)
            te.test(save_info=save_info,
                    graph=graph,
                    feature_type=feature_type,
                    num_of_bins=num_bins,
                    limit_num_training_graphs=num_of_training_graphs,
                    list_nodes_to_predict=tested_nodes,
                    nodes_to_train_on=nodes_for_training_embedding)
            # except Exception as e:
            #  print(f"Failed to compute Training Features or Test. "
            #          f"graph {str(graph)}, "
            #          f"emb {str(embedding)}, "
            #          f"num_bins {num_bins}")
            #   traceback.print_exc()

    return tested_nodes, nodes_for_training_embedding
Example 29
def compute_training_features_for_one_node_pool(
        save_info: sl.MemoryAccess, graph: gc.Graph, num_of_bins: int,
        feature_type: ft.FeatureType, nodes_to_train_on: dict,
        o_dm_list: List[pd.DataFrame], node_to_predict: int):
    '''
    Compute features using the most similar embeddings. Multiple embeddings
    are only used for the second graph.
    :param save_info: data access object
    :param graph: graph the embeddings are trained on
    :param num_of_bins: number of bins used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: maps each node to predict to its training nodes
    :param o_dm_list: distance matrices of the original graph
    :param node_to_predict: node that is removed and should be predicted
    '''

    num_iter = save_info.get_num_iterations()

    quantity_dict = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS:
        [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE:
        [1, num_iter, num_iter]
    }

    quantity = quantity_dict[save_info.get_diff_type()]

    used_emb = save_info.get_diff_type().get_iter()

    # compute attack features
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict],
                                             save_info=save_info,
                                             quantity_first=quantity[0],
                                             quantity_second=quantity[1],
                                             used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff,
                       removed_nodes=[node_to_predict],
                       original_graph=graph,
                       num_of_bins=num_of_bins,
                       feature_type=feature_type,
                       save_info=save_info)

    # compute training features
    if save_info.is_diff_type(
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE):
        # this diff type uses the dm of G' used for diff(G,G') for diff(G',G'')
        o_dm_list_t = [min_r_dm]
        quantity[1] = 1
    else:
        o_dm_list_t = None

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]

        diff, i = dmm.compute_diff_matrix(removed_nodes=removed_nodes,
                                          save_info=save_info,
                                          quantity_first=quantity[1],
                                          quantity_second=quantity[2],
                                          used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff,
                           removed_nodes=removed_nodes,
                           original_graph=g_prime,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)
def _get_avg_degree_of_neighbours(graph: gc.Graph, node: int):
    neighbour_degrees = [graph.degree(n) for n in graph.neighbours(node)]
    return sum(neighbour_degrees) / len(neighbour_degrees)
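
For a plain networkx graph the same quantity is available out of the box as nx.average_neighbor_degree; a quick equivalence check on a tiny path graph:

# Average degree of a node's neighbours, computed by hand and via networkx.
import networkx as nx

g = nx.path_graph(4)                       # 0-1-2-3
manual = sum(g.degree(n) for n in g.neighbors(1)) / len(list(g.neighbors(1)))
print(manual)                              # (1 + 2) / 2 = 1.5
print(nx.average_neighbor_degree(g)[1])    # 1.5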