def test_per_node(nodes_to_train_on: List[int], graph: gc.Graph, save_info: sl.MemoryAccess,
                  feature_type: ft.FeatureType, num_of_bins: int,
                  limit_num_training_graphs: Optional[int], sampling_strategy: Optional,
                  c, removed_node: int):
    if nodes_to_train_on is not None:
        tr_node_list = nodes_to_train_on[removed_node]
    else:
        raise ValueError("Training node list is not given, but it is required")

    train_data = save_info.load_list_of_training_data(
        removed_node=removed_node,
        graph=graph.delete_node(removed_node),
        feature_type=feature_type,
        num_of_bins=num_of_bins,
        limit_num=limit_num_training_graphs,
        tr_node_list=tr_node_list)
    utils.assert_df_no_nan(
        train_data, text=f'Training data for removed node {removed_node}')

    test_data = save_info.load_test_data(removed_node=removed_node,
                                         feature_type=feature_type,
                                         num_of_bins=num_of_bins)
    utils.assert_df_no_nan(
        test_data, text=f'Test data for removed node {removed_node}')

    tr_labels, tr_predicted, tr_probabilities, te_labels, te_predicted, te_probabilities = \
        _train(c, train_data=train_data, test_data=test_data,
               sampling_strategy=sampling_strategy)

    train_results = evaluate(tr_labels, tr_predicted, tr_probabilities)
    test_results = evaluate(te_labels, te_predicted, te_probabilities)

    # add some additional information about the removed node and the training run
    test_results["degree"] = graph.degree(removed_node)
    test_results["avg_neighbour_degree"] = graph.average_neighbour_degree(removed_node)
    test_results["avg dist to pos pred"] = \
        calculate_avg_distance_to_positive_predicted_nodes(graph=graph,
                                                           removed_node=removed_node,
                                                           labels=test_data.index.values,
                                                           predicted=te_predicted)
    test_results["num training features"] = len(train_data)
    test_results["num test features"] = len(test_data)
    test_results["train false negative"] = train_results["false negative"]
    test_results["train true positive"] = train_results["true positive"]
    test_results["train accuracy"] = train_results["accuracy"]
    test_results["train precision"] = train_results["precision"]
    test_results["train recall"] = train_results["recall"]
    test_results["train auc"] = train_results["auc"]

    return pd.Series(test_results), removed_node
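# Hedged sketch (not this repository's actual `evaluate` implementation): a
# minimal version that is consistent with the result keys read in
# `test_per_node` above ("true positive", "false negative", "accuracy",
# "precision", "recall", "auc"). It assumes binary labels and that
# `probabilities` holds the positive-class scores, e.g. the second column of
# a scikit-learn `predict_proba` output.
from sklearn import metrics as _sk_metrics


def _evaluate_sketch(true_labels, predicted, probabilities) -> dict:
    tn, fp, fn, tp = _sk_metrics.confusion_matrix(true_labels, predicted, labels=[0, 1]).ravel()
    return {
        "true positive": tp,
        "false negative": fn,
        "accuracy": _sk_metrics.accuracy_score(true_labels, predicted),
        "precision": _sk_metrics.precision_score(true_labels, predicted, zero_division=0),
        "recall": _sk_metrics.recall_score(true_labels, predicted, zero_division=0),
        "auc": _sk_metrics.roc_auc_score(true_labels, probabilities),
    }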
def get_available_graph_data(graph: gc.Graph, save_info: sl.MemoryAccess,
                             num_of_training_graphs: int):
    complete_data = {}

    te_nodes = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=False)

    for te_node in te_nodes:
        graph_removed_one = graph.delete_node(te_node)
        second_completed_embeddings = save_info.get_list_of_available_embeddings(
            graph=graph_removed_one, removed_first_node=te_node,
            find_started_trainings=False)
        second_completed_embeddings = filter_by_splitting_nodes(
            tr_nodes=second_completed_embeddings, graph_rem_one=graph_removed_one)

        if len(second_completed_embeddings) >= num_of_training_graphs:
            # take the first num_of_training_graphs embeddings deterministically
            # (instead of e.g. np.random.choice without replacement)
            complete_data[te_node] = second_completed_embeddings[:num_of_training_graphs]

    return complete_data
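# Hedged sketch of the `filter_by_splitting_nodes` helper used above. The
# assumption (not verified against the real helper) is that it drops candidate
# training nodes whose removal would disconnect the already-reduced graph,
# i.e. its articulation points. The illustration below works on a plain
# networkx graph rather than the project's gc.Graph wrapper.
import networkx as _nx


def _filter_by_splitting_nodes_sketch(tr_nodes, graph_rem_one: "_nx.Graph"):
    articulation_points = set(_nx.articulation_points(graph_rem_one))
    return [node for node in tr_nodes if node not in articulation_points]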
def __compute_training_features_for_one_node(dm_original: pd.DataFrame, node_to_predict: int,
                                             save_info: sl.MemoryAccess, graph: gc.Graph,
                                             num_of_bins: int, feature_type: ft.FeatureType,
                                             nodes_to_train_on: [int]) -> None:
    """
    :param dm_original: distance matrix of the original graph
    :param node_to_predict: node that is removed from the graph and should be predicted
    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins that should be used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: a list of nodes that are removed from the graph after removing
           node_to_predict to generate training data
    """
    # --- compute test features for node_to_predict ---

    # remove node_to_predict from the graph
    graph_reduced = graph.delete_node(node_to_predict)
    dm_reduced = calc_avg_distance_matrix(graph=graph_reduced,
                                          removed_nodes=[node_to_predict],
                                          save_info=save_info)

    # skip if the test features for node_to_predict are already available
    if not save_info.has_training_data([node_to_predict], feature_type=feature_type,
                                       num_of_bins=num_of_bins):
        diff = cf.create_difference_matrix(dm_original, dm_reduced,
                                           removed_nodes=[node_to_predict],
                                           save_info=save_info)
        cf.create_features(diff=diff, removed_nodes=[node_to_predict], original_graph=graph,
                           num_of_bins=num_of_bins, feature_type=feature_type,
                           save_info=save_info)
        del diff  # free RAM

    # --- compute training features by removing each node in nodes_to_train_on ---
    for node in nodes_to_train_on:
        # skip if the training features for this node pair already exist
        if save_info.has_training_data(removed_nodes=[node_to_predict, node],
                                       feature_type=feature_type, num_of_bins=num_of_bins):
            continue

        graph_reduced_2 = graph_reduced.delete_node(node)
        dm_reduced_2 = calc_avg_distance_matrix(graph=graph_reduced_2,
                                                removed_nodes=[node_to_predict, node],
                                                save_info=save_info)
        diff_reduced = cf.create_difference_matrix(dm_reduced, dm_reduced_2,
                                                   removed_nodes=[node_to_predict, node],
                                                   save_info=save_info)
        del dm_reduced_2  # free RAM

        # compute training data
        cf.create_features(diff=diff_reduced, removed_nodes=[node_to_predict, node],
                           original_graph=graph_reduced, num_of_bins=num_of_bins,
                           save_info=save_info, feature_type=feature_type)
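# Hedged sketch of the idea behind `cf.create_difference_matrix` as it is used
# above: both inputs are assumed to be pandas DataFrames of (average) pairwise
# embedding distances indexed by node id, and the difference is taken over the
# nodes still present in the reduced matrix. The project's real implementation
# may align indices differently and persist the result via `save_info`.
import pandas as _pd


def _difference_matrix_sketch(dm_original: _pd.DataFrame,
                              dm_reduced: _pd.DataFrame) -> _pd.DataFrame:
    common_nodes = dm_reduced.index
    return dm_original.loc[common_nodes, common_nodes] - dm_reduced.loc[common_nodes, common_nodes]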
def compute_training_features_for_one_node_pool(
        save_info: sl.MemoryAccess, graph: gc.Graph, num_of_bins: int,
        feature_type: ft.FeatureType, nodes_to_train_on: {},
        o_dm_list: [pd.DataFrame], node_to_predict: int):
    """
    Compute features using the most similar embeddings. Only the second graph
    uses multiple embeddings.

    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins that should be used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: dict mapping each node to predict to the nodes removed
           afterwards to generate training data
    :param o_dm_list: distance matrices of the original graph
    :param node_to_predict: node that is removed from the graph and should be predicted
    :return: None
    """
    num_iter = save_info.get_num_iterations()
    quantity_dict = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS: [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE: [1, num_iter, num_iter]
    }
    quantity = quantity_dict[save_info.get_diff_type()]
    used_emb = save_info.get_diff_type().get_iter()

    # compute attack features
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict],
                                             save_info=save_info,
                                             quantity_first=quantity[0],
                                             quantity_second=quantity[1],
                                             used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff, removed_nodes=[node_to_predict], original_graph=graph,
                       num_of_bins=num_of_bins, feature_type=feature_type,
                       save_info=save_info)

    # compute training features
    if save_info.is_diff_type(dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE):
        # this diff type reuses the distance matrix of G' chosen for diff(G, G')
        # when computing diff(G', G'')
        o_dm_list_t = [min_r_dm]
        quantity[1] = 1
    else:
        o_dm_list_t = None

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]
        diff, _ = dmm.compute_diff_matrix(removed_nodes=removed_nodes,
                                          save_info=save_info,
                                          quantity_first=quantity[1],
                                          quantity_second=quantity[2],
                                          used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff, removed_nodes=removed_nodes, original_graph=g_prime,
                           num_of_bins=num_of_bins, feature_type=feature_type,
                           save_info=save_info)
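# Hedged sketch of the "most similar embeddings" idea behind
# `dmm.compute_diff_matrix`: given several candidate distance matrices for the
# larger graph and for the reduced graph (one per embedding run), pick the
# pair whose difference has the smallest Frobenius norm and return that
# difference together with the chosen reduced-graph matrix (cf. `min_r_dm`
# above). Index alignment and persistence details of the real helper are omitted.
import numpy as _np


def _most_similar_diff_sketch(o_dm_list, r_dm_list):
    best_norm, best_diff, best_r_dm = None, None, None
    for o_dm in o_dm_list:
        for r_dm in r_dm_list:
            common_nodes = r_dm.index
            diff = o_dm.loc[common_nodes, common_nodes] - r_dm.loc[common_nodes, common_nodes]
            norm = _np.linalg.norm(diff.values)
            if best_norm is None or norm < best_norm:
                best_norm, best_diff, best_r_dm = norm, diff, r_dm
    return best_diff, best_r_dm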
def train_embedding_per_graph(
        graph: gc.Graph, embedding: Embedding, save_info: sl.MemoryAccess,
        num_of_embeddings: int = 30,
        num_of_test_evaluations_per_degree_level: int = 5,
        num_of_training_graphs: int = 10,
        num_of_bins_for_tf: [int] = None,
        run_experiments_on_embedding: bool = True,
        feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
    assert num_of_embeddings == save_info.get_num_iterations()

    if num_of_bins_for_tf is None:
        num_of_bins_for_tf = [10]
    elif isinstance(num_of_bins_for_tf, int):
        num_of_bins_for_tf = [num_of_bins_for_tf]

    # train embeddings on the original graph
    embedding.train_embedding(graph=graph, save_info=save_info, removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    first_started_embedding = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)
    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph, quantity=num_of_test_evaluations_per_degree_level,
        init_range=2, pref_list=first_started_embedding)
    print(f"\nTrain Embeddings for nodes {tested_nodes}")

    nodes_for_training_embedding = {}

    for first_node in tested_nodes:
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one, save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if num_of_training_graphs:
            second_completed_diffs = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one, removed_first_node=first_node,
                find_started_trainings=False)
            second_started_embedding = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one, removed_first_node=first_node,
                find_started_trainings=True)
            second_tested_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one, pref_list=second_completed_diffs,
                secondary_pref_list=second_started_embedding,
                all_list=graph_removed_one.nodes(),
                quantity=num_of_training_graphs)
        else:
            second_tested_nodes = graph_removed_one.nodes()

        nodes_for_training_embedding[first_node] = second_tested_nodes

        for second_node in second_tested_nodes:
            graph_removed_two = graph_removed_one.delete_node(second_node)
            embedding.train_embedding(graph=graph_removed_two, save_info=save_info,
                                      removed_nodes=[first_node, second_node],
                                      num_of_embeddings=num_of_embeddings)

    # create features and run the attack experiments
    if run_experiments_on_embedding:
        for num_bins in num_of_bins_for_tf:
            cf.compute_training_features(
                save_info=save_info, graph=graph, num_of_bins=num_bins,
                list_nodes_to_predict=tested_nodes,
                nodes_to_train_on=nodes_for_training_embedding,
                feature_type=feature_type)
            te.test(save_info=save_info, graph=graph, feature_type=feature_type,
                    num_of_bins=num_bins,
                    limit_num_training_graphs=num_of_training_graphs,
                    list_nodes_to_predict=tested_nodes,
                    nodes_to_train_on=nodes_for_training_embedding)

    return tested_nodes, nodes_for_training_embedding
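# Hedged sketch of what `utils.sample_low_avg_high_degree_nodes` is used for
# above: drawing `quantity` test nodes from each of the low-, average- and
# high-degree regions of the degree distribution, preferring nodes from
# `pref_list` (nodes whose embedding training has already started). The bucket
# boundaries and the omission of `init_range` handling are assumptions; the
# project helper may differ.
import random as _random


def _sample_by_degree_level_sketch(node_degrees: dict, quantity: int, pref_list=None):
    preferred = set(pref_list or [])
    nodes_by_degree = sorted(node_degrees, key=node_degrees.get)
    third = max(1, len(nodes_by_degree) // 3)
    buckets = [nodes_by_degree[:third],
               nodes_by_degree[third:2 * third],
               nodes_by_degree[2 * third:]]
    sampled = []
    for bucket in buckets:
        preferred_in_bucket = [n for n in bucket if n in preferred]
        pool = preferred_in_bucket if len(preferred_in_bucket) >= quantity else bucket
        sampled.extend(_random.sample(pool, min(quantity, len(pool))))
    return sampled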