def calc_avg_distance_matrix(graph: gc.Graph, removed_nodes: [int], save_info: sl.MemoryAccess):
    if save_info.has_avg_distance_matrix(removed_nodes=removed_nodes):
        save_info.delete_distance_matrices(removed_nodes=removed_nodes)
        return save_info.load_avg_distance_matrix(removed_nodes)

    used_embeddings = range(save_info.get_num_iterations())
    avg_dm = pd.DataFrame(0.0, index=graph.nodes(), columns=graph.nodes())
    dm_calc_func = functools.partial(__calc_dm, graph, removed_nodes, save_info)

    for iteration in used_embeddings:
        i, dm = dm_calc_func(iteration)
        utils.assure_same_labels([avg_dm, dm],
                                 "Format of distance matrix iteration {} "
                                 "for removed nodes {} is not correct".format(i, removed_nodes))
        avg_dm += dm

    avg_dm = avg_dm.div(len(used_embeddings))

    # save avg distance matrix
    save_info.save_avg_distance_matrix(removed_nodes, avg_dm)
    # delete dms for memory space
    save_info.delete_distance_matrices(removed_nodes=removed_nodes)
    return avg_dm
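
# The sketch below is illustrative only and not part of the pipeline: the _demo_* name and the
# toy node labels and distances are made up. It shows the averaging step used above: element-wise
# summing per-iteration distance matrices that share the same labels, then dividing by the count.
def _demo_average_distance_matrices():
    import pandas as pd
    nodes = [0, 1, 2]
    dm_iter_0 = pd.DataFrame([[0.0, 1.0, 2.0], [1.0, 0.0, 1.5], [2.0, 1.5, 0.0]],
                             index=nodes, columns=nodes)
    dm_iter_1 = pd.DataFrame([[0.0, 1.2, 1.8], [1.2, 0.0, 1.7], [1.8, 1.7, 0.0]],
                             index=nodes, columns=nodes)
    avg_dm = pd.DataFrame(0.0, index=nodes, columns=nodes)
    for dm in (dm_iter_0, dm_iter_1):
        avg_dm += dm  # labels must match, otherwise pandas alignment introduces NaNs
    return avg_dm.div(2)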
def train_embedding(self, graph: gc.Graph, save_info: sl.MemoryAccess, removed_nodes: [int],
                    num_of_embeddings: int):
    super().train_embedding(graph=graph, save_info=save_info, removed_nodes=removed_nodes,
                            num_of_embeddings=num_of_embeddings)

    nx_g = graph.to_networkx()
    # to_directed() returns a directed copy, so the result has to be assigned back
    nx_g = nx_g.to_directed()
    np.testing.assert_array_equal(nx_g.nodes(), graph.nodes())
    nx_g = nx.convert_node_labels_to_integers(nx_g)

    for iteration in range(num_of_embeddings):
        if save_info.has_embedding(removed_nodes=removed_nodes, iteration=iteration):
            continue
        Y, t = self.__gem_embedding.learn_embedding(graph=nx_g, is_weighted=False, no_python=True)
        emb = pd.DataFrame(Y, index=graph.nodes())
        save_info.save_embedding(removed_nodes=removed_nodes, iteration=iteration, embedding=emb)
def test_per_node(nodes_to_train_on: List[int], graph: gc.Graph, save_info: sl.MemoryAccess,
                  feature_type: ft.FeatureType, num_of_bins: int,
                  limit_num_training_graphs: Optional[int], sampling_strategy: Optional, c,
                  removed_node: int):
    if nodes_to_train_on is not None:
        tr_node_list = nodes_to_train_on[removed_node]
    else:
        raise ValueError("Training node list is not given, but it is required")

    train_data = save_info.load_list_of_training_data(removed_node=removed_node,
                                                      graph=graph.delete_node(removed_node),
                                                      feature_type=feature_type,
                                                      num_of_bins=num_of_bins,
                                                      limit_num=limit_num_training_graphs,
                                                      tr_node_list=tr_node_list)
    utils.assert_df_no_nan(train_data, text=f'Training data for removed node {removed_node}')

    test_data = save_info.load_test_data(removed_node=removed_node, feature_type=feature_type,
                                         num_of_bins=num_of_bins)
    utils.assert_df_no_nan(test_data, text=f'Test data for removed node {removed_node}')

    tr_labels, tr_predicted, tr_probabilities, te_labels, te_predicted, te_probabilities = \
        _train(c, train_data=train_data, test_data=test_data, sampling_strategy=sampling_strategy)

    # train_results, test_results = evaluate(tr_labels, tr_predicted, te_labels, te_predicted, te_probabilities)
    train_results = evaluate(tr_labels, tr_predicted, tr_probabilities)
    test_results = evaluate(te_labels, te_predicted, te_probabilities)

    # add some additional information
    test_results["degree"] = graph.degree(removed_node)
    test_results["avg_neighbour_degree"] = graph.average_neighbour_degree(removed_node)
    test_results["avg dist to pos pred"] = \
        calculate_avg_distance_to_positive_predicted_nodes(graph=graph, removed_node=removed_node,
                                                           labels=test_data.index.values,
                                                           predicted=te_predicted)
    test_results["num training features"] = len(train_data)
    test_results["num test features"] = len(test_data)
    test_results["train false negative"] = train_results["false negative"]
    test_results["train true positive"] = train_results["true positive"]
    test_results["train accuracy"] = train_results["accuracy"]
    test_results["train precision"] = train_results["precision"]
    test_results["train recall"] = train_results["recall"]
    test_results["train auc"] = train_results["auc"]

    return pd.Series(test_results), removed_node
def create_node_raking_from_diff_matrix(diff: pd.DataFrame, removed_nodes: [int], graph: gc.Graph,
                                        save_info: sl.MemoryAccess, save: bool = True) -> []:
    utils.assure_same_labels([diff])
    labels = diff.index.values.tolist()
    dim = len(labels)

    # init sums
    node_pos_sums = {}
    node_neg_sums = {}
    for label in labels:
        node_pos_sums[label] = 0
        node_neg_sums[label] = 0

    # sum values up
    for i in range(dim):
        for j in range(i):
            label1 = labels[i]
            label2 = labels[j]
            value = diff.at[label1, label2]
            if value > 0:
                node_pos_sums[label1] += value
                node_pos_sums[label2] += value
            else:
                node_neg_sums[label1] += value
                node_neg_sums[label2] += value

    pos_list = list(map(lambda x: (x, node_pos_sums[x]), node_pos_sums))
    neg_list = list(map(lambda x: (x, node_neg_sums[x]), node_neg_sums))
    complete_list = list(map(lambda x: (x, node_pos_sums[x] - node_neg_sums[x]), node_pos_sums))

    pos_list.sort(key=lambda x: -x[1])
    neg_list.sort(key=lambda x: x[1])
    complete_list.sort(key=lambda x: -x[1])

    if save:
        save_info.save_node_raking(removed_nodes, pos_list,
                                   list(graph.neighbours(removed_nodes[-1])))

    neighbours = list(graph.neighbours(removed_nodes[0]))
    pos_list_labels = list(map(lambda x: x[0] in neighbours, pos_list))
    neg_list_labels = list(map(lambda x: x[0] in neighbours, neg_list))
    complete_list_labels = list(map(lambda x: x[0] in neighbours, complete_list))

    return pos_list, pos_list_labels, neg_list, neg_list_labels, complete_list, complete_list_labels
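
# Illustrative sketch only (the _demo_* name and toy values are made up, not project data):
# every off-diagonal entry of the symmetric difference matrix is attributed to both of its nodes,
# positive and negative contributions are accumulated separately, and nodes are ranked by the sums.
def _demo_rank_nodes_from_diff():
    import pandas as pd
    labels = ["a", "b", "c"]
    diff = pd.DataFrame([[0.0, 0.4, -0.1],
                         [0.4, 0.0, 0.2],
                         [-0.1, 0.2, 0.0]], index=labels, columns=labels)
    pos_sums = {label: 0.0 for label in labels}
    neg_sums = {label: 0.0 for label in labels}
    for i in range(len(labels)):
        for j in range(i):
            value = diff.at[labels[i], labels[j]]
            if value > 0:
                pos_sums[labels[i]] += value
                pos_sums[labels[j]] += value
            else:
                neg_sums[labels[i]] += value
                neg_sums[labels[j]] += value
    # ranking by positive sums: [('b', 0.6), ('a', 0.4), ('c', 0.2)]
    return sorted(pos_sums.items(), key=lambda x: -x[1])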
def print_results(row_labels, prediction, labels, graph: gc.Graph):
    print("connected")
    for i in range(len(labels)):
        if labels[i] == 1:
            print(row_labels[i], "Predicted: ", prediction[i], "actual:", labels[i],
                  "correct:", prediction[i] == labels[i],
                  "Degree of node:", graph.degree(row_labels[i]))

    print("not connected")
    for i in range(len(labels)):
        if labels[i] == 0:
            print(row_labels[i], "Predicted: ", prediction[i], "actual:", labels[i],
                  "correct:", prediction[i] == labels[i],
                  "Degree of node:", graph.degree(row_labels[i]))
def __get_available_sample(graph: gc.Graph, degrees: [int], center, init_range: int, quantity,
                           available_list: [int], neg_list: [int]) -> []:
    assert (set(available_list).issubset(set(graph.nodes())))
    degrees = np.array(degrees)

    candidates = utils.__get_candidates_with_offset(degrees=degrees, graph=graph,
                                                    candidate_degree=center, neg_list=neg_list)
    offset = 1
    while (offset < init_range) or (len(candidates) < quantity):
        new_candidates = utils.__get_candidates_with_offset(degrees=degrees, graph=graph,
                                                            candidate_degree=center + offset,
                                                            neg_list=neg_list)
        new_candidates += utils.__get_candidates_with_offset(degrees=degrees, graph=graph,
                                                             candidate_degree=center - offset,
                                                             neg_list=neg_list)
        candidates += new_candidates
        offset += 1

    # prioritise candidates from available_list
    pref_candidates = list(set(candidates).intersection(set(available_list)))
    if len(pref_candidates) < quantity:
        raise ValueError(f"Not all nodes are available for sampling nodes with about {center} degrees. "
                         f"Graph {str(graph)}")
    return pref_candidates[:quantity]
def load_embedding(self, graph: Graph, removed_nodes: [int], save_info: sl.MemoryAccess,
                   iteration: int, load_neg_results: bool = False):
    target = save_info.get_embedding_name(removed_nodes=removed_nodes, iteration=iteration)
    target_name = os.path.abspath(target + ".emb")
    target_name_neg = os.path.abspath(target + "_neg.emb")

    if load_neg_results:
        return (load_results(target_name=target_name, node_names=graph.nodes()),
                load_results(target_name=target_name_neg, node_names=graph.nodes()))
    else:
        return load_results(target_name=target_name, node_names=graph.nodes())
def load_list_of_training_data(self, removed_node: int, feature_type: ft.FeatureType,
                               num_of_bins: int, graph: gc.Graph, tr_node_list: [int] = None,
                               all_data_available: bool = False,
                               limit_num: int = None) -> pd.DataFrame:
    training_data = pd.DataFrame()

    if tr_node_list is not None:
        available_graph_data = tr_node_list
        if limit_num is not None and len(available_graph_data) != limit_num:
            raise ValueError(
                f"The given training data does not match the required number of training graphs. \n"
                f"Given tr nodes {available_graph_data}, "
                f"should be {limit_num} but are {len(available_graph_data)}")
    elif all_data_available:
        available_graph_data = graph.nodes()
    else:
        available_graph_data = self.get_list_of_available_training_data(
            feature_type=feature_type, num_of_bins=num_of_bins, graph=graph,
            removed_first_node=removed_node)

    if limit_num is not None:
        if len(available_graph_data) < limit_num:
            raise ValueError(
                f"The number of available graph data is smaller than the limit. \n"
                f"Num available graphs {len(available_graph_data)}, limit_num {limit_num}")
        available_graph_data = np.random.choice(available_graph_data, limit_num, replace=False)

    for other_node in available_graph_data:
        if other_node != removed_node:
            data = self.load_training_data(removed_nodes=[removed_node, other_node],
                                           feature_type=feature_type, num_of_bins=num_of_bins)
            utils.assert_df_no_nan(data, text=f"removed nodes [{removed_node}, {other_node}]")
            training_data = training_data.append(data)
            utils.assert_df_no_nan(training_data,
                                   text=f"aggregated training data after appending removed nodes"
                                        f" [{removed_node}, {other_node}]")

    utils.assert_df_no_nan(training_data,
                           text=f"aggregated training data for removed node {removed_node}")
    return training_data
def access_vocab(self, graph: gc.Graph, removed_nodes: [int] = None, graph_description: str = None):
    if removed_nodes is None:
        removed_nodes = []
    assert (all(node not in graph.nodes() for node in removed_nodes))

    file_name = self.__get_graph_name(removed_nodes, graph_description) + ".vocab"

    if not os.path.exists(file_name):
        # create vocab file: one line per node in the form "<node> 0"
        nodes = "\n".join(map(lambda node: str(node) + " 0", graph.nodes()))
        with open(file_name, "w+") as file:
            file.write(nodes)

    return file_name
def test_all_sampling_strats(save_info: sl.MemoryAccess, graph: gc.Graph,
                             feature_type: ft.FeatureType, num_of_bins: int):
    # test(save_info=save_info, graph=graph, feature_type=feature_type, num_of_bins=num_of_bins)
    for strat in SamplingStrategy:
        test(save_info=save_info, graph=graph, feature_type=feature_type, num_of_bins=num_of_bins,
             list_nodes_to_predict=graph.nodes(), sampling_strategy=strat)
def compute_degrees(graph: gc.Graph, labels: [int]):
    degrees = pd.DataFrame(0, labels, ["degree"])
    for label in labels:
        degrees.loc[label] = graph.degree(label)

    # normalise degrees to [0, 1]
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(degrees.astype(float))
    return pd.DataFrame(x_scaled, index=labels, columns=["degree"])
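
# Illustrative sketch only (toy degrees, the _demo_* name is made up): the min-max scaling used
# above maps each degree to (d - d_min) / (d_max - d_min), so the smallest degree becomes 0
# and the largest becomes 1.
def _demo_min_max_scale_degrees():
    import pandas as pd
    from sklearn import preprocessing
    degrees = pd.DataFrame({"degree": [2.0, 5.0, 10.0]}, index=[0, 1, 2])
    scaled = preprocessing.MinMaxScaler().fit_transform(degrees)
    return pd.DataFrame(scaled, index=degrees.index, columns=["degree"])  # 0.0, 0.375, 1.0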
def calculate_avg_distance_to_positive_predicted_nodes(graph: gc.Graph, removed_node: int,
                                                       labels: [int], predicted: [int]):
    pos_labels = labels[predicted == 1]
    if len(pos_labels) > 0:
        return float(sum(map(lambda x: graph.distance(removed_node, x), pos_labels))) / len(pos_labels)
    else:
        print(f"no node was predicted to be connected to {removed_node}")
        return 0
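
# Illustrative sketch only: graph distances are replaced by a hand-made lookup table
# (hypothetical values) so the metric above can be followed without a gc.Graph instance.
def _demo_avg_distance_to_positive_predictions():
    import numpy as np
    labels = np.array([10, 11, 12, 13])
    predicted = np.array([1, 0, 1, 0])
    distance_to_removed = {10: 1, 11: 3, 12: 2, 13: 4}  # stands in for graph.distance(removed_node, x)
    pos_labels = labels[predicted == 1]
    return sum(distance_to_removed[x] for x in pos_labels) / len(pos_labels)  # (1 + 2) / 2 = 1.5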
def get_list_of_available_training_data(self, feature_type: ft.FeatureType, num_of_bins: int,
                                        graph: gc.Graph, removed_first_node: int = None):
    files = []
    if removed_first_node is not None:
        removed_nodes = [removed_first_node]
    else:
        removed_nodes = []

    for node in graph.nodes():
        if self.has_training_data(removed_nodes=removed_nodes + [node],
                                  feature_type=feature_type, num_of_bins=num_of_bins):
            files.append(node)

    assert (all([node in graph.nodes() for node in files]))
    return files
def __create_degree_column_for_feature(graph: gc.Graph, row_labels: [int]):
    degrees = pd.DataFrame(0, row_labels, ["degree"])
    for label in row_labels:
        degrees.loc[label] = graph.degree(label)

    # normalise degrees
    # min_max_scaler = preprocessing.MinMaxScaler()
    # x_scaled = min_max_scaler.fit_transform(degrees)
    degrees = degrees / degrees.values.max()
    return pd.DataFrame(degrees, index=row_labels, columns=["degree"])
def access_line_edge_list(self, graph: gc.Graph, removed_nodes: [int] = None):
    if removed_nodes is None:
        removed_nodes = []
    assert (graph.name() == self.graph)

    file_name = self.__get_graph_name(removed_nodes) + ".directedweihtededgelist"

    if not os.path.exists(file_name):
        # create edge list: every undirected edge is written in both directions with weight 1
        edges = "\n".join(
            list(map(lambda edge: f"{str(edge[0])} {str(edge[1])} 1\n{str(edge[1])} {str(edge[0])} 1",
                     graph.edges())))
        with open(file_name, "w+") as file:
            file.write(edges)

    return file_name
def get_list_of_available_difference_matrices(self, graph: gc.Graph,
                                              removed_first_node: int = None):
    files = []
    if removed_first_node is not None:
        removed_nodes = [removed_first_node]
    else:
        removed_nodes = []

    for node in graph.nodes():
        if self.has_diff_matrix(removed_nodes=removed_nodes + [node]):
            files.append(node)

    return files
def sample_low_avg_high_degree_nodes(graph: gc.Graph, quantity: int, init_range: int = 2,
                                     pref_list=None):
    if pref_list is None:
        pref_list = []

    degrees = graph.all_degrees()
    min_val: int = min(degrees)
    max_val: int = max(degrees)
    avg_val: int = int(round(((max_val - min_val) / 2) + min_val))  # int(round(np.array(degrees).mean()))
    nodes = graph.nodes()

    max_sample = __get_sample(graph=graph, degrees=degrees, center=max_val, init_range=init_range,
                              quantity=quantity, pref_list=pref_list, neg_list=[])
    min_sample = __get_sample(graph=graph, degrees=degrees, center=min_val, init_range=init_range,
                              quantity=quantity, pref_list=pref_list, neg_list=list(max_sample))
    avg_sample = __get_sample(graph=graph, degrees=degrees, center=avg_val, init_range=init_range,
                              quantity=quantity, pref_list=pref_list,
                              neg_list=list(max_sample) + list(min_sample))

    # print(f"samples: \n max {max_sample}\n min: {min_sample}\n avg: {avg_sample}")
    samples = np.concatenate((max_sample, avg_sample, min_sample))
    assert (len(set(samples)) == len(samples))
    return samples
def __get_sample(graph: gc.Graph, degrees: [int], center, init_range: int, quantity,
                 pref_list: [int], neg_list: [int]) -> np.ndarray:
    assert (set(pref_list).issubset(set(graph.nodes())))
    degrees = np.array(degrees)

    candidates = __get_candidates_with_offset(degrees=degrees, graph=graph,
                                              candidate_degree=center, neg_list=neg_list)
    offset = 1
    while (offset < init_range) or (len(candidates) < quantity):
        new_candidates = __get_candidates_with_offset(degrees=degrees, graph=graph,
                                                      candidate_degree=center + offset,
                                                      neg_list=neg_list)
        new_candidates += __get_candidates_with_offset(degrees=degrees, graph=graph,
                                                       candidate_degree=center - offset,
                                                       neg_list=neg_list)
        candidates += new_candidates
        offset += 1

    # prioritise candidates from pref_list
    pref_candidates = list(set(candidates).intersection(set(pref_list)))
    return sample_randomly_with_preferred_list(pref_list=pref_candidates, all_list=candidates,
                                               quantity=quantity)
def get_min_avg_max_sample_from_available_list(graph: gc.Graph, quantity: int, available_list: [],
                                               init_range: int = 2):
    degrees = graph.all_degrees()
    min_val: int = min(degrees)
    max_val: int = max(degrees)
    avg_val: int = int(round(((max_val - min_val) / 2) + min_val))  # int(round(np.array(degrees).mean()))

    max_sample = __get_available_sample(graph=graph, degrees=degrees, center=max_val,
                                        init_range=init_range, quantity=quantity,
                                        available_list=available_list, neg_list=[])
    min_sample = __get_available_sample(graph=graph, degrees=degrees, center=min_val,
                                        init_range=init_range, quantity=quantity,
                                        available_list=available_list, neg_list=list(max_sample))
    avg_sample = __get_available_sample(graph=graph, degrees=degrees, center=avg_val,
                                        init_range=init_range, quantity=quantity,
                                        available_list=available_list,
                                        neg_list=list(max_sample) + list(min_sample))

    # print(f"samples: \n max {max_sample}\n min: {min_sample}\n avg: {avg_sample}")
    samples = np.concatenate((max_sample, avg_sample, min_sample))
    assert (len(set(samples)) == len(samples))
    return samples
def create_target_vector(row_labels: [], graph: gc.Graph, node_to_predict: int) -> pd.DataFrame:
    """
    Creates the target vector for the classifier.

    :param row_labels: labels for which the target vector should be created
    :param graph: graph including the removed node
    :param node_to_predict: the node that is removed in the second embedding
    :return: a DataFrame with one boolean column "y" that is True for neighbours of node_to_predict
    """
    neighbours_of_removed_node = graph.neighbours(node_to_predict)

    target = pd.DataFrame(False, row_labels, ["y"])
    for neighbour in neighbours_of_removed_node:
        # this prevents an error in case the original graph is used while 2 nodes are removed
        # in the labels and they are connected
        if neighbour in row_labels:
            target.loc[neighbour] = True
    return target
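
# Illustrative sketch only: a stand-in neighbour list replaces the gc.Graph lookup, and the
# _demo_* name and values are made up. Every row label that is a neighbour of the removed node
# is marked True; neighbours outside the row labels are ignored, as in create_target_vector.
def _demo_create_target_vector():
    import pandas as pd
    row_labels = [1, 2, 3, 4]
    neighbours_of_removed_node = [2, 4, 7]  # 7 is not among the row labels and is skipped
    target = pd.DataFrame(False, row_labels, ["y"])
    for neighbour in neighbours_of_removed_node:
        if neighbour in row_labels:
            target.loc[neighbour] = True
    return target  # "y" is True for 2 and 4, False for 1 and 3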
def get_list_of_available_embeddings(self, graph: gc.Graph, removed_first_node: int = None,
                                     emb_description: str = None,
                                     find_started_trainings: bool = False):
    files = []
    if find_started_trainings:
        iteration = 0
    else:
        iteration = self.num_iterations - 1

    if removed_first_node is not None:
        removed_nodes = [removed_first_node]
    else:
        removed_nodes = []

    for node in graph.nodes():
        if self.has_embedding(removed_nodes=removed_nodes + [node], iteration=iteration,
                              emb_description=emb_description):
            files.append(node)

    return files
def get_available_graph_data(graph: gc.Graph, save_info: sl.MemoryAccess,
                             num_of_training_graphs: int):
    complete_data = {}
    te_nodes = save_info.get_list_of_available_embeddings(graph=graph,
                                                          find_started_trainings=False)

    for te_node in te_nodes:
        graph_removed_one = graph.delete_node(te_node)
        second_completed_embeddings = save_info.get_list_of_available_embeddings(
            graph=graph_removed_one, removed_first_node=te_node, find_started_trainings=False)
        second_completed_embeddings = filter_by_splitting_nodes(
            tr_nodes=second_completed_embeddings, graph_rem_one=graph_removed_one)

        if len(second_completed_embeddings) >= num_of_training_graphs:
            complete_data[te_node] = second_completed_embeddings[:num_of_training_graphs]
            # np.random.choice(a=second_completed_embeddings, size=num_of_training_graphs, replace=False)

    return complete_data
def get_sample_with_degree(graph: gc.Graph, node_list: [int], degree: int, quantity: int):
    degrees = np.array([graph.degree(n) for n in node_list])
    samples_to_find = quantity

    candidates = np.where(degrees == degree)[0]
    if len(candidates) > samples_to_find:
        candidates = np.random.choice(candidates, size=samples_to_find, replace=False)
    elif len(candidates) < samples_to_find:
        raise ValueError(f'Not enough training samples: required {quantity}, got {len(candidates)}')

    sample = candidates
    samples_to_find -= len(candidates)

    offset = 1
    while samples_to_find > 0:
        # print(f"Sampling range {offset}")
        candidates = np.concatenate([np.where(degrees == (degree + offset))[0],
                                     np.where(degrees == (degree - offset))[0]])
        if len(candidates) > samples_to_find:
            candidates = np.random.choice(candidates, size=samples_to_find, replace=False)
        sample = np.concatenate([sample, candidates])
        samples_to_find -= len(candidates)
        offset += 1

    return [node_list[s] for s in sample]
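
# Illustrative sketch only (toy node list and degrees, the _demo_* name is made up) of the
# widening degree window used above: start with nodes of exactly the target degree and widen
# by +/- offset until enough samples are found. The toy data is chosen so the loop terminates.
def _demo_sample_by_degree_window(target_degree=3, quantity=3):
    import numpy as np
    node_list = [0, 1, 2, 3, 4, 5]
    degrees = np.array([1, 3, 3, 4, 2, 6])
    sample = list(np.where(degrees == target_degree)[0])
    offset = 1
    while len(sample) < quantity:
        window = np.concatenate([np.where(degrees == target_degree + offset)[0],
                                 np.where(degrees == target_degree - offset)[0]])
        sample.extend(window[:quantity - len(sample)])
        offset += 1
    return [node_list[s] for s in sample]  # e.g. [1, 2, 3]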
def __filter_splitting_nodes(node_list: List[int], graph: gc.Graph):
    return list(filter(lambda x: not graph.splits_graph(x), node_list))
def filter_by_splitting_nodes(tr_nodes: [], graph_rem_one: gc.Graph):
    return list(filter(lambda node: not graph_rem_one.splits_graph(node), tr_nodes))
def __get_graph_degree_properties(graph: gc.Graph):
    degrees = graph.all_degrees()
    min_val = min(degrees)
    max_val = max(degrees)
    avg = np.mean(degrees)
    return min_val, max_val, avg
def __compute_training_features_for_one_node(dm_original: pd.DataFrame, node_to_predict: int,
                                             save_info: sl.MemoryAccess, graph: gc.Graph,
                                             num_of_bins: int, feature_type: ft.FeatureType,
                                             nodes_to_train_on: [int]) -> None:
    """
    :param dm_original: distance matrix of the original graph
    :param node_to_predict: node that is removed from the graph and should be predicted
    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins that should be used to generate training features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: a list of nodes that are removed from the graph after removing
                              node_to_predict to generate training data
    """
    # --- compute test features for node_to_predict ---

    # remove node_to_predict from the graph
    graph_reduced = graph.delete_node(node_to_predict)
    dm_reduced = calc_avg_distance_matrix(graph=graph_reduced, removed_nodes=[node_to_predict],
                                          save_info=save_info)

    # test if training data is already available
    if save_info.has_training_data([node_to_predict], feature_type=feature_type,
                                   num_of_bins=num_of_bins):
        # print("Training Feature for removed nodes ", [node_to_predict], " and feature type ",
        #       "diff_bins_num:" + str(num_of_bins) + "and_norm_dim", "is already trained")
        pass
    else:
        # print(f"Compute test features for node {node_to_predict}")
        diff = cf.create_difference_matrix(dm_original, dm_reduced,
                                           removed_nodes=[node_to_predict], save_info=save_info)

        # compute training data
        # cf.create_feature_from_diff_bins_with_dim(diff=diff, removed_nodes=[node_to_predict],
        #                                           original_graph=graph, num_of_bins=num_of_bins,
        #                                           save_info=save_info)
        cf.create_features(diff=diff, removed_nodes=[node_to_predict], original_graph=graph,
                           num_of_bins=num_of_bins, feature_type=feature_type,
                           save_info=save_info)
        del diff  # free RAM
        # save_info.remove_diff_matrix(removed_nodes=[node_to_predict])  # free memory

    # --- compute training features for nodes_to_train_on ---
    # print(f"Create training features for removed node {node_to_predict} by removing ", nodes_to_train_on)
    for node in nodes_to_train_on:
        # check if features already exist
        if save_info.has_training_data(removed_nodes=[node_to_predict, node],
                                       feature_type=feature_type, num_of_bins=num_of_bins):
            # print("Training Feature for removed nodes ", [node_to_predict, node], " and feature type ",
            #       "diff_bins_num:" + str(num_of_bins) + "and_norm_dim", "is already trained")
            pass
        else:
            graph_reduced_2 = graph_reduced.delete_node(node)
            dm_reduced_2 = calc_avg_distance_matrix(graph=graph_reduced_2,
                                                    removed_nodes=[node_to_predict, node],
                                                    save_info=save_info)
            print("odm", type(dm_reduced), "rdm", type(dm_reduced_2))
            diff_reduced = cf.create_difference_matrix(dm_reduced, dm_reduced_2,
                                                       removed_nodes=[node_to_predict, node],
                                                       save_info=save_info)
            print("rdiff", type(diff_reduced), "odm", type(dm_reduced), "rdm", type(dm_reduced_2))
            del dm_reduced_2

            # compute training data
            cf.create_features(diff=diff_reduced, removed_nodes=[node_to_predict, node],
                               original_graph=graph_reduced, num_of_bins=num_of_bins,
                               save_info=save_info, feature_type=feature_type)
def train_embedding_per_graph(graph: gc.Graph, embedding: Embedding, save_info: sl.MemoryAccess,
                              num_of_embeddings: int = 30,
                              num_of_test_evaluations_per_degree_level: int = 5,
                              num_of_training_graphs: int = 10,
                              num_of_bins_for_tf: [int] = None,
                              run_experiments_on_embedding: bool = True,
                              feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
    assert (num_of_embeddings == save_info.get_num_iterations())

    if num_of_bins_for_tf is None:
        num_of_bins_for_tf = [10]
    elif isinstance(num_of_bins_for_tf, int):
        num_of_bins_for_tf = [num_of_bins_for_tf]

    embedding.train_embedding(graph=graph, save_info=save_info, removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    first_started_embedding = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)
    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph, quantity=num_of_test_evaluations_per_degree_level, init_range=2,
        pref_list=first_started_embedding)

    print(f"\nTrain Embeddings for nodes {tested_nodes}")

    nodes_for_training_embedding = {}

    for index, first_node in enumerate(tested_nodes):
        # print(f"Start training embedding for {index}({first_node}). node.")
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one, save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if num_of_training_graphs:
            second_completed_diffs = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one, removed_first_node=first_node,
                find_started_trainings=False)
            second_started_embedding = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one, removed_first_node=first_node,
                find_started_trainings=True)
            second_tested_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one, pref_list=second_completed_diffs,
                secondary_pref_list=second_started_embedding,
                all_list=graph_removed_one.nodes(), quantity=num_of_training_graphs)
        else:
            second_tested_nodes = graph_removed_one.nodes()

        nodes_for_training_embedding[first_node] = second_tested_nodes

        # print(f"\nTrain embeddings for removed node {first_node} and {second_tested_nodes}")
        for index2, second_node in enumerate(second_tested_nodes):
            # print(f"Start train embedding {index2}({second_node}) for {index}({first_node}). node.")
            graph_removed_two = graph_removed_one.delete_node(second_node)
            embedding.train_embedding(graph=graph_removed_two, save_info=save_info,
                                      removed_nodes=[first_node, second_node],
                                      num_of_embeddings=num_of_embeddings)

    # create features
    if run_experiments_on_embedding:
        for num_bins in num_of_bins_for_tf:
            # try:
            cf.compute_training_features(save_info=save_info, graph=graph, num_of_bins=num_bins,
                                         list_nodes_to_predict=tested_nodes,
                                         nodes_to_train_on=nodes_for_training_embedding,
                                         feature_type=feature_type)
            te.test(save_info=save_info, graph=graph, feature_type=feature_type,
                    num_of_bins=num_bins, limit_num_training_graphs=num_of_training_graphs,
                    list_nodes_to_predict=tested_nodes,
                    nodes_to_train_on=nodes_for_training_embedding)
            # except Exception as e:
            #     print(f"Failed to compute Training Features or Test. "
            #           f"graph {str(graph)}, "
            #           f"emb {str(embedding)}, "
            #           f"num_bins {num_bins}")
            #     traceback.print_exc()

    return tested_nodes, nodes_for_training_embedding
def compute_training_features_for_one_node_pool(save_info: sl.MemoryAccess, graph: gc.Graph,
                                                num_of_bins: int, feature_type: ft.FeatureType,
                                                nodes_to_train_on: {},
                                                o_dm_list: [pd.DataFrame], node_to_predict: int):
    '''
    Compute features using the most similar embeddings. Multiple embeddings are only used for
    the second graph.

    :param save_info:
    :param graph:
    :param num_of_bins:
    :param feature_type:
    :param nodes_to_train_on:
    :param node_to_predict:
    :return:
    '''
    num_iter = save_info.get_num_iterations()
    quantity_dict = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS: [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE: [1, num_iter, num_iter]
    }
    quantity = quantity_dict[save_info.get_diff_type()]
    used_emb = save_info.get_diff_type().get_iter()

    # compute attack features
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict], save_info=save_info,
                                             quantity_first=quantity[0],
                                             quantity_second=quantity[1], used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff, removed_nodes=[node_to_predict], original_graph=graph,
                       num_of_bins=num_of_bins, feature_type=feature_type, save_info=save_info)

    # compute training features
    if save_info.is_diff_type(dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE):
        # this diff type reuses the dm of G' used for diff(G, G') for diff(G', G'')
        o_dm_list_t = [min_r_dm]
        quantity[1] = 1
    else:
        o_dm_list_t = None

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]
        diff, i = dmm.compute_diff_matrix(removed_nodes=removed_nodes, save_info=save_info,
                                          quantity_first=quantity[1],
                                          quantity_second=quantity[2], used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff, removed_nodes=removed_nodes, original_graph=g_prime,
                           num_of_bins=num_of_bins, feature_type=feature_type,
                           save_info=save_info)
def _get_avg_degree_of_neighbours(graph: gc.Graph, node: int):
    neighbour_degrees = list(map(lambda n: graph.degree(n), list(graph.neighbours(node))))
    return sum(neighbour_degrees) / len(neighbour_degrees)