def compute_diff_matrix_helper(o_dm_list: List[pd.DataFrame],
                               r_dm_list: List[pd.DataFrame],
                               removed_nodes: List[int],
                               save_info: sl.MemoryAccess,
                               save: bool = True,
                               check_for_existing: bool = True):
    """
    Find the pair (original dm, reduced dm) with the smallest difference matrix.

    Every matrix in ``o_dm_list`` is reduced by the last removed node and
    compared against every matrix in ``r_dm_list``. The difference matrix
    with the smallest ``compute_diff_size`` wins.

    :param o_dm_list: distance matrices before removing ``removed_nodes[-1]``
    :param r_dm_list: distance matrices after removing ``removed_nodes[-1]``;
        may be a generator, it is materialised internally
    :param removed_nodes: nodes removed so far, the last one is the node
        removed between the two matrix lists
    :param save_info: memory access object used to look up / store the result
    :param save: store the selected difference matrix via ``save_info``
    :param check_for_existing: return the stored difference matrix if present
    :return: ``(min_diff, min_r_dm)`` when computed; when an existing matrix
        is found the result of ``save_info.load_diff_matrix`` is returned
        unchanged (NOTE(review): confirm it has the same tuple shape)
    :raises ValueError: if no difference matrix could be selected, e.g. one
        of the lists is empty or no diff size is smaller than infinity
    """
    if check_for_existing and save_info.has_diff_matrix(
            removed_nodes=removed_nodes):
        return save_info.load_diff_matrix(removed_nodes)

    # r_dm_list is traversed once per original matrix, hence a generator
    # cannot be used directly.
    r_dm_list = list(r_dm_list)

    min_diff_size = np.inf
    min_diff = None
    min_r_dm = None
    min_r_dm_index = None

    for o_dm in o_dm_list:
        o_dm_reduced = cf.reduce_dm(dm_original=o_dm,
                                    rem_node=removed_nodes[-1])
        for index, r_dm in enumerate(r_dm_list):
            diff = cf.create_difference_matrix_difference(
                dm_original=o_dm_reduced,
                dm_reduced=r_dm,
                removed_nodes=removed_nodes,
                save_info=save_info,
                save=False,
                check_for_existing=False,
                reduce_o_dm=False)
            diff_size = compute_diff_size(diff)
            if diff_size < min_diff_size:
                min_diff = diff
                min_diff_size = diff_size
                min_r_dm = r_dm
                min_r_dm_index = index

    # Previously this case surfaced as an UnboundLocalError further down.
    if min_r_dm_index is None:
        raise ValueError(
            f"No difference matrix could be selected for removed nodes "
            f"{removed_nodes}: the distance matrix lists are empty or no "
            f"candidate has a diff size smaller than infinity.")

    if save:
        save_info.save_diff_matrix(removed_nodes=removed_nodes,
                                   diff=min_diff,
                                   diff_type=save_info.diff_type,
                                   r_dm_index=min_r_dm_index)
    return min_diff, min_r_dm
def create_features(diff: pd.DataFrame,
                    removed_nodes: [int],
                    original_graph: gc.Graph,
                    num_of_bins,
                    feature_type: ft.FeatureType,
                    save_info: sl.MemoryAccess,
                    save: bool = True,
                    output_feature: bool = False,
                    check_for_existing: bool = True):
    """
    Dispatch feature creation to the implementation for ``feature_type``.

    :param diff: difference matrix the features are derived from
    :param removed_nodes: nodes removed from the graph, last one is the
        node to predict
    :param original_graph: graph before the last node was removed
    :param num_of_bins: number of bins of the feature vector
    :param feature_type: selects the feature implementation
    :param save_info: memory access object
    :param save: store the resulting training data
    :param output_feature: forwarded to the DIFF_BIN_WITH_DIM implementation
    :param check_for_existing: forwarded to the DIFF_BIN_WITH_DIM
        implementation
    :return: result of ``create_feature_from_diff_bins_with_dim`` for
        DIFF_BIN_WITH_DIM, ``(features, target)`` for DIFF_BIN_WITH_DIM_2_HOP
    :raises NotImplementedError: for EVEN_DIST
    :raises ValueError: for any other feature type
    """
    if feature_type == ft.FeatureType.DIFF_BIN_WITH_DIM:
        return create_feature_from_diff_bins_with_dim(
            diff=diff,
            removed_nodes=removed_nodes,
            original_graph=original_graph,
            num_of_bins=num_of_bins,
            save_info=save_info,
            save=save,
            output_feature=output_feature,
            check_for_existing=check_for_existing)

    if feature_type == ft.FeatureType.DIFF_BIN_WITH_DIM_2_HOP:
        # Only the features of the one-hop variant are reused; its target
        # vector is replaced by the two-hop neighbourhood below.
        features, _ = create_feature_from_diff_bins_with_dim(
            diff=diff,
            removed_nodes=removed_nodes,
            original_graph=original_graph,
            num_of_bins=num_of_bins,
            save_info=save_info,
            save=False,
            output_feature=True)
        target = two_hop_neighbours(nodes=utils.get_row_labels(features),
                                    graph=original_graph,
                                    node_to_predict=removed_nodes[-1])
        if save:
            training_data = utils.pd_append_column(features, target)
            save_info.save_training_data(
                removed_nodes=removed_nodes,
                feature_type=ft.FeatureType.DIFF_BIN_WITH_DIM_2_HOP,
                num_of_bins=num_of_bins,
                training_data=training_data)
        return features, target

    if feature_type == ft.FeatureType.EVEN_DIST:
        raise NotImplementedError()

    raise ValueError(f"Feature type {feature_type} is not known!")
def create_feature_from_diff_bins(diff: pd.DataFrame,
                                  removed_nodes: [int],
                                  original_graph: gc.Graph,
                                  num_of_bins: int,
                                  save_info: sl.MemoryAccess,
                                  save: bool = True):
    """
    Build binned features and the target vector for a difference matrix.

    :param diff: difference matrix
    :param removed_nodes: removed nodes, the last one is the predicted node
    :param original_graph: graph used to create the target vector
    :param num_of_bins: number of bins for the feature vector
    :param save_info: memory access object
    :param save: store features and target as training data
    :return: ``(features, target)``
    """
    row_labels = utils.get_row_labels(diff)
    target = create_target_vector(row_labels, original_graph,
                                  removed_nodes[-1])

    # bin distribution for all labels
    features = get_features_from_bins(diff=diff, num_of_bins=num_of_bins)

    if save:
        training_data = utils.pd_append_column(features, target)
        save_info.save_training_data(
            removed_nodes,
            feature_type=ft.FeatureType.DIFF_BIN_WITH_DIM,
            num_of_bins=num_of_bins,
            training_data=training_data)
    return features, target
def load_dms(removed_nodes: List[int],
             save_info: sl.MemoryAccess,
             num_iterations: int,
             use_specific_iter: int = None):
    """
    Lazily yield the distance matrices for ``removed_nodes``.

    With ``num_iterations == 1`` exactly the matrix of iteration
    ``use_specific_iter`` is yielded (which must then be given); otherwise
    one matrix per iteration ``0 .. num_iterations - 1``. A stored matrix is
    loaded; a missing one is computed from the stored embedding via
    ``cdm.calc_distances``.

    :param removed_nodes: nodes removed from the graph
    :param save_info: memory access object
    :param num_iterations: number of iterations to yield matrices for
    :param use_specific_iter: iteration to use when ``num_iterations`` is 1
    """
    if num_iterations == 1:
        assert (use_specific_iter is not None)
        iterations = [use_specific_iter]
    else:
        iterations = range(num_iterations)

    for iteration in iterations:
        if save_info.has_distance_matrix(removed_nodes=removed_nodes,
                                         iteration=iteration):
            yield save_info.load_distance_matrix(
                removed_nodes=removed_nodes, iteration=iteration)
            continue

        embedding = save_info.load_embedding(removed_nodes=removed_nodes,
                                             iteration=iteration)
        yield cdm.calc_distances(model=embedding,
                                 save_info=save_info,
                                 removed_nodes=removed_nodes,
                                 iteration=iteration)
def __train_embedding(self,
                      dense_edge_list: [int],
                      save_info: sl.MemoryAccess,
                      removed_nodes: [int],
                      iteration: int,
                      check_for_existing: bool = True):
    """
    Train a LINE embedding with the external binaries in ``LINE_FOLDER``.

    First- and second-order embeddings are trained with ``self.dim / 2``
    dimensions each, normalised and concatenated into ``<target>.emb``.
    The intermediate files are deleted afterwards.

    :param dense_edge_list: passed to ``./line -train`` and checked with
        ``os.path.exists``, i.e. it is used as a file path
    :param save_info: memory access object providing the embedding name
    :param removed_nodes: nodes removed from the graph
    :param iteration: iteration of the embedding
    :param check_for_existing: skip training if ``<target>.emb`` exists
    """
    target_name = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                               iteration=iteration)
    target_emb = target_name + ".emb"

    if check_for_existing and os.path.exists(target_emb):
        return

    first_order_emb = target_name + "_order_1.emb"
    second_order_emb = target_name + "_order_2.emb"
    norm_first_order_emb = target_name + "_order_1_normalised.emb"
    norm_second_order_emb = target_name + "_order_2_normalised.emb"

    half_dim = str(self.dim / 2)
    commands = [
        f'./line -train "{dense_edge_list}" -output "{first_order_emb}" '
        f'-size {half_dim} -order 1 -binary 1 -threads {config.NUM_CORES}',
        f'./line -train "{dense_edge_list}" -output "{second_order_emb}" '
        f'-size {half_dim} -order 2 -binary 1 -threads {config.NUM_CORES}',
        f'./normalize -input "{first_order_emb}" '
        f'-output "{norm_first_order_emb}" -binary 1',
        f'./normalize -input "{second_order_emb}" '
        f'-output "{norm_second_order_emb}" -binary 1',
        f'./concatenate -input1 "{norm_first_order_emb}" '
        f'-input2 "{norm_second_order_emb}" -output "{target_emb}" -binary 1',
    ]

    # The binaries are invoked relative to LINE_FOLDER. Restore the working
    # directory even if the assertion or a call raises, so that a failure
    # does not leave the whole process in LINE_FOLDER.
    wd = os.getcwd()
    os.chdir(LINE_FOLDER)
    try:
        assert (os.path.exists(dense_edge_list))
        print("dense_edge_list", dense_edge_list)
        print("first_order_emb", first_order_emb)
        print("num cores", config.NUM_CORES)
        print("size", half_dim)
        for command in commands:
            subprocess.call(command, shell=True)
    finally:
        os.chdir(wd)

    # remove intermediate files to save disk space
    for intermediate in (first_order_emb, second_order_emb,
                         norm_first_order_emb, norm_second_order_emb):
        os.remove(intermediate)

    assert (os.path.exists(target_emb))
def filter_by_already_trained_nodes(p_node_list: [int],
                                    t_node_dict: {int: [int]},
                                    graph: gc.Graph,
                                    save_info: sl.MemoryAccess,
                                    feature_type: ft.FeatureType,
                                    num_bins: int):
    '''
    Drop all test and training nodes whose features already exist.

    :param p_node_list: nodes for which test features should be computed;
        must equal the keys of t_node_dict
    :param t_node_dict: mapping from a first removed node to the list of
        second removed nodes training features should be computed for
    :param graph: not used by this function
    :param save_info: memory management class to access files
    :param feature_type: type of the training features to look for
    :param num_bins: number of bins the features contain
    :return: (remaining nodes to predict, dict from each remaining node to
        its training nodes that still lack features)
    '''
    np.testing.assert_array_equal(p_node_list, list(t_node_dict.keys()))

    remaining_p_nodes = []
    remaining_t_nodes = {}

    for p_node in p_node_list:
        missing = [
            t_node for t_node in t_node_dict[p_node]
            if not save_info.has_training_data(removed_nodes=[p_node, t_node],
                                               feature_type=feature_type,
                                               num_of_bins=num_bins)
        ]
        # A node stays in the result if any training node lacks features
        # or if its own test features are missing.
        if missing or not save_info.has_training_data(
                removed_nodes=[p_node],
                feature_type=feature_type,
                num_of_bins=num_bins):
            remaining_p_nodes.append(p_node)
            remaining_t_nodes[p_node] = missing

    return remaining_p_nodes, remaining_t_nodes
def create_difference_matrix_ratio(
        dm_original: pd.DataFrame,
        dm_reduced: pd.DataFrame,
        removed_nodes: [int],
        save_info: sl.MemoryAccess,
        save: bool = True,
        check_for_existing: bool = True) -> pd.DataFrame:
    """
    Compute the element-wise ratio of original and reduced distance matrix.

    :param dm_original: distance matrix before removing the last node
    :param dm_reduced: distance matrix after removing the last node
    :param removed_nodes: removed nodes, the last one must not be part of
        the reduced original matrix
    :param save_info: memory access object; must be of diff type RATIO
    :param save: store the ratio matrix via ``save_info``
    :param check_for_existing: load and return a stored matrix if present
    :return: the ratio matrix
    :raises ValueError: if ``save_info`` is not of diff type RATIO
    """
    if not save_info.is_diff_type(dt.DiffType.RATIO):
        # The message used to name DiffType.DIFFERENCE and lacked a space
        # between the two concatenated sentences.
        raise ValueError(
            f"MemoryAccess object does not specify the ratio difference "
            f"type. To run this function the diff type must be diff type "
            f"'{dt.DiffType.RATIO}'")

    if check_for_existing and save_info.has_diff_matrix(removed_nodes):
        print("difference matrix for removed nodes {} and "
              "num iterations {} and type {} already exists!".format(
                  removed_nodes, save_info.num_iterations,
                  dt.DiffType.RATIO))
        return save_info.load_diff_matrix(removed_nodes)

    # reduce original dm to match the reduced dm
    # NOTE(review): rem_node receives the labels that remain in dm_reduced,
    # not the removed node - confirm this is what reduce_dm expects.
    dm_o = reduce_dm(dm_original=dm_original,
                     rem_node=list(dm_reduced.index))
    assert (removed_nodes[-1] not in list(dm_o.index))

    ratio = dm_o / dm_reduced

    if save:
        save_info.save_diff_matrix(removed_nodes,
                                   ratio,
                                   diff_type=dt.DiffType.RATIO)
    return ratio
def get_available_graph_data(graph: gc.Graph, save_info: sl.MemoryAccess,
                             num_of_training_graphs: int):
    """
    Collect the test nodes that have enough completed training embeddings.

    :param graph: original graph
    :param save_info: memory access object used to list embeddings
    :param num_of_training_graphs: minimum (and returned) number of
        training nodes per test node
    :return: dict from test node to its first ``num_of_training_graphs``
        usable training nodes
    """
    available = {}

    for te_node in save_info.get_list_of_available_embeddings(
            graph=graph, find_started_trainings=False):
        reduced_graph = graph.delete_node(te_node)
        finished = save_info.get_list_of_available_embeddings(
            graph=reduced_graph,
            removed_first_node=te_node,
            find_started_trainings=False)
        usable = filter_by_splitting_nodes(tr_nodes=finished,
                                           graph_rem_one=reduced_graph)

        if len(usable) < num_of_training_graphs:
            continue
        available[te_node] = usable[:num_of_training_graphs]

    return available
def calc_distances(model,
                   save_info: sl.MemoryAccess,
                   removed_nodes: [int],
                   iteration: int,
                   graph: gc.Graph = None,
                   save: bool = True,
                   check_for_existing: bool = True):
    """
    Return the distance matrix of an embedding, loading it if available.

    :param model: embedding passed to ``calc_distances_based_on_gensim_fast``
    :param save_info: memory access object
    :param removed_nodes: nodes removed from the graph
    :param iteration: iteration of the embedding
    :param graph: not used by this function
    :param save: store a newly computed distance matrix
    :param check_for_existing: load a stored distance matrix if present
    :return: the distance matrix
    """
    already_stored = check_for_existing and save_info.has_distance_matrix(
        removed_nodes, iteration)
    if already_stored:
        return save_info.load_distance_matrix(removed_nodes, iteration)

    distance_matrix = calc_distances_based_on_gensim_fast(model=model)
    if save:
        save_info.save_distance_matrix(removed_nodes, iteration,
                                       distance_matrix)
    return distance_matrix
def load_embedding(self,
                   graph: Graph,
                   removed_nodes: [int],
                   save_info: sl.MemoryAccess,
                   iteration: int,
                   load_neg_results: bool = False):
    """
    Load the stored embedding for ``removed_nodes`` and ``iteration``.

    :param graph: graph whose nodes are passed to ``load_results``
    :param removed_nodes: nodes removed from the graph
    :param save_info: memory access object providing the embedding name
    :param iteration: iteration of the embedding
    :param load_neg_results: additionally load ``<name>_neg.emb``
    :return: the loaded embedding, or a tuple ``(embedding, neg_embedding)``
        if ``load_neg_results`` is set
    """
    base = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                        iteration=iteration)
    emb_path = os.path.abspath(base + ".emb")
    neg_emb_path = os.path.abspath(base + "_neg.emb")

    if not load_neg_results:
        return load_results(target_name=emb_path, node_names=graph.nodes())

    embedding = load_results(target_name=emb_path, node_names=graph.nodes())
    neg_embedding = load_results(target_name=neg_emb_path,
                                 node_names=graph.nodes())
    return embedding, neg_embedding
def create_difference_matrix(dm_original: pd.DataFrame,
                             dm_reduced: pd.DataFrame,
                             removed_nodes: [int],
                             save_info: sl.MemoryAccess,
                             save: bool = True,
                             check_for_existing: bool = True) -> pd.DataFrame:
    """
    Create the difference matrix for the diff type of ``save_info``.

    DIFFERENCE and DIFFERENCE_ONE_INIT use
    ``create_difference_matrix_difference``, RATIO uses
    ``create_difference_matrix_ratio``. Any other diff type raises a
    ``KeyError`` from the lookup.

    :param dm_original: distance matrix before removing the last node
    :param dm_reduced: distance matrix after removing the last node
    :param removed_nodes: nodes removed from the graph
    :param save_info: memory access object that determines the diff type
    :param save: forwarded to the selected implementation
    :param check_for_existing: forwarded to the selected implementation
    :return: result of the selected implementation
    """
    handlers = dict.fromkeys(
        (dt.DiffType.DIFFERENCE, dt.DiffType.DIFFERENCE_ONE_INIT),
        create_difference_matrix_difference)
    handlers[dt.DiffType.RATIO] = create_difference_matrix_ratio

    handler = handlers[save_info.get_diff_type()]
    return handler(dm_original=dm_original,
                   dm_reduced=dm_reduced,
                   removed_nodes=removed_nodes,
                   save_info=save_info,
                   save=save,
                   check_for_existing=check_for_existing)
def train_embedding_per_graph(
        graph: gc.Graph,
        embedding: Embedding,
        save_info: sl.MemoryAccess,
        num_of_embeddings: int = 30,
        num_of_test_evaluations_per_degree_level: int = 5,
        num_of_training_graphs: int = 10,
        num_of_bins_for_tf: [int] = None,
        run_experiments_on_embedding: bool = True,
        feature_type: ft.FeatureType = ft.FeatureType.DIFF_BIN_WITH_DIM):
    """
    Train all embeddings needed for one graph and optionally evaluate them.

    Embeddings are trained for the full graph, for the graph with each
    sampled test node removed, and for every (test node, training node)
    pair removed. Afterwards training features are computed and the test
    is run once per bin count.

    :param graph: original graph
    :param embedding: embedding object providing ``train_embedding``
    :param save_info: memory access object; its number of iterations must
        equal ``num_of_embeddings``
    :param num_of_embeddings: number of embeddings trained per graph
    :param num_of_test_evaluations_per_degree_level: quantity passed to the
        test node sampling
    :param num_of_training_graphs: number of training nodes per test node;
        a falsy value uses all nodes of the reduced graph
    :param num_of_bins_for_tf: bin counts to evaluate, an int or a list of
        ints; defaults to ``[10]``
    :param run_experiments_on_embedding: compute features and run the test
    :param feature_type: feature type used for features and test
    :return: ``(tested_nodes, nodes_for_training_embedding)``
    """
    assert (num_of_embeddings == save_info.get_num_iterations())

    if num_of_bins_for_tf is None:
        num_of_bins_for_tf = [10]
    elif isinstance(num_of_bins_for_tf, int):
        num_of_bins_for_tf = [num_of_bins_for_tf]

    # embeddings of the unmodified graph
    embedding.train_embedding(graph=graph,
                              save_info=save_info,
                              removed_nodes=[],
                              num_of_embeddings=num_of_embeddings)

    started_first = save_info.get_list_of_available_embeddings(
        graph=graph, find_started_trainings=True)
    tested_nodes = utils.sample_low_avg_high_degree_nodes(
        graph=graph,
        quantity=num_of_test_evaluations_per_degree_level,
        init_range=2,
        pref_list=started_first)
    print(f"\nTrain Embeddings for nodes {tested_nodes}")

    nodes_for_training_embedding = {}
    for first_node in tested_nodes:
        graph_removed_one = graph.delete_node(first_node)
        embedding.train_embedding(graph=graph_removed_one,
                                  save_info=save_info,
                                  removed_nodes=[first_node],
                                  num_of_embeddings=num_of_embeddings)

        if num_of_training_graphs:
            completed_second = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=False)
            started_second = save_info.get_list_of_available_embeddings(
                graph=graph_removed_one,
                removed_first_node=first_node,
                find_started_trainings=True)
            second_nodes = utils.sample_randomly_with_pref_list_without_splitting_nodes(
                graph=graph_removed_one,
                pref_list=completed_second,
                secondary_pref_list=started_second,
                all_list=graph_removed_one.nodes(),
                quantity=num_of_training_graphs)
        else:
            second_nodes = graph_removed_one.nodes()

        nodes_for_training_embedding[first_node] = second_nodes

        for second_node in second_nodes:
            graph_removed_two = graph_removed_one.delete_node(second_node)
            embedding.train_embedding(
                graph=graph_removed_two,
                save_info=save_info,
                removed_nodes=[first_node, second_node],
                num_of_embeddings=num_of_embeddings)

    if not run_experiments_on_embedding:
        return tested_nodes, nodes_for_training_embedding

    # create features and evaluate them for every requested bin count
    for num_bins in num_of_bins_for_tf:
        cf.compute_training_features(
            save_info=save_info,
            graph=graph,
            num_of_bins=num_bins,
            list_nodes_to_predict=tested_nodes,
            nodes_to_train_on=nodes_for_training_embedding,
            feature_type=feature_type)
        te.test(save_info=save_info,
                graph=graph,
                feature_type=feature_type,
                num_of_bins=num_bins,
                limit_num_training_graphs=num_of_training_graphs,
                list_nodes_to_predict=tested_nodes,
                nodes_to_train_on=nodes_for_training_embedding)

    return tested_nodes, nodes_for_training_embedding
def train_node2vec_embedding(edge_list_path: str,
                             graph: Graph,
                             save_info: sl.MemoryAccess,
                             removed_nodes: [int],
                             iteration: int,
                             epochs: int,
                             dim: int,
                             walk_length: int,
                             num_of_walks_per_node: int,
                             window_size: int,
                             alpha: float,
                             return_embedding: bool = False,
                             check_for_existing: bool = True):
    """
    Train a node2vec embedding: SNAP random walks followed by Word2Vec.

    The ``./node2vec`` binary in ``config.NODE2VEC_SNAP_DIR`` writes the
    random walks to a temporary ``<target>_path.emb`` file, which is fed
    to gensim's Word2Vec and deleted afterwards. The trained model is
    stored via ``save_info.save_embedding``.

    :param edge_list_path: edge list file passed to node2vec
    :param graph: not used by this function
    :param save_info: memory access object
    :param removed_nodes: nodes removed from the graph
    :param iteration: iteration of the embedding
    :param epochs: passed to node2vec (``-e``) and Word2Vec (``iter``)
    :param dim: embedding dimension
    :param walk_length: length of each random walk
    :param num_of_walks_per_node: number of walks started per node
    :param window_size: context window size
    :param alpha: Word2Vec learning rate
    :param return_embedding: return the (loaded or trained) embedding
    :param check_for_existing: skip training if ``<target>.emb`` exists
    :return: the embedding if ``return_embedding`` is set, otherwise None
    """
    target = save_info.get_embedding_name(removed_nodes=removed_nodes,
                                          iteration=iteration)

    if check_for_existing and os.path.exists(target + ".emb"):
        if return_embedding:
            return save_info.load_embedding(removed_nodes=removed_nodes,
                                            iteration=iteration)
        return None

    target_path = os.path.abspath(target + "_path.emb")

    # -ow: output random walks only
    command = (f'./node2vec -i:"{edge_list_path}" -o:"{target_path}" '
               f'-e:{epochs} -d:{dim} -l:{walk_length} '
               f'-r:{num_of_walks_per_node} -k:{window_size} -ow')

    # The binary is invoked relative to its directory. Restore the working
    # directory even if the call raises.
    wd = os.getcwd()
    os.chdir(config.NODE2VEC_SNAP_DIR)
    try:
        subprocess.call(command, shell=True)
    finally:
        os.chdir(wd)

    class Walks:
        """Re-iterable view on the walk file, one token list per line."""

        def __init__(self, file):
            self.file = file

        def __iter__(self):
            # Read from the file this object was created for instead of
            # the enclosing function's variable.
            with open(self.file, "r") as f:
                for line in f:
                    yield line.strip("\n").split(" ")

    walks = Walks(target_path)

    # train word2vec on the walks
    emb_result = gensim.models.Word2Vec(walks,
                                        size=dim,
                                        iter=epochs,
                                        window=window_size,
                                        min_count=1,
                                        sg=1,
                                        workers=config.NUM_CORES,
                                        alpha=alpha)
    os.remove(target_path)
    save_info.save_embedding(removed_nodes, iteration, emb_result)

    if return_embedding:
        return emb_result
    return None
class DNCCell(tf.nn.rnn_cell.RNNCell):
    """RNN cell that couples a controller cell with a MemoryAccess module."""

    def __init__(self,
                 controller_cell,
                 memory_size=256,
                 word_size=64,
                 num_reads=4,
                 num_writes=1,
                 clip_value=None):
        """
        :param controller_cell: Tensorflow RNN cell used as controller
        :param memory_size: forwarded to MemoryAccess
        :param word_size: forwarded to MemoryAccess
        :param num_reads: forwarded to MemoryAccess
        :param num_writes: forwarded to MemoryAccess
        :param clip_value: symmetric clipping bound for controller output
            and read vectors; a falsy value is stored as 0 (no clipping)
        """
        self.memory = MemoryAccess(memory_size, word_size, num_reads,
                                   num_writes)
        self.controller = controller_cell
        self._clip_value = clip_value if clip_value else 0

    @property
    def state_size(self):
        """State sizes of controller and memory plus the read vector size."""
        return DNCStateTuple(controller_state=self.controller.state_size,
                             access_state=self.memory.state_size,
                             read_vectors=self.memory.output_size)

    @property
    def output_size(self):
        """Controller output size plus memory output size."""
        return self.controller.output_size + self.memory.output_size

    def zero_state(self, batch_size, dtype):
        """Initial state; the read vectors are float32 zeros."""
        read_shape = [batch_size, self.memory.output_size]
        return DNCStateTuple(
            controller_state=self.controller.zero_state(batch_size, dtype),
            access_state=self.memory.zero_state(batch_size, dtype),
            read_vectors=tf.zeros(read_shape, tf.float32))

    def _clip_if_enabled(self, x):
        """Clip x to [-clip_value, clip_value] unless clip_value <= 0."""
        return x if self._clip_value <= 0 else tf.clip_by_value(
            x, -self._clip_value, self._clip_value)

    def __call__(self, inputs, state, scope=None):
        """
        Run one step.

        :param inputs: input of the current step
        :param state: DNCStateTuple of the previous step
        :param scope: variable scope name; defaults to the class name
        :return: (controller output concatenated with the new read vectors,
            new DNCStateTuple)
        """
        with tf.variable_scope(scope or type(self).__name__):
            prev_controller_state, prev_access_state, prev_reads = state

            # the controller sees the input together with the last reads
            controller_input = tf.concat([inputs, prev_reads], -1)
            controller_output, next_controller_state = self.controller(
                controller_input, prev_controller_state)
            controller_output = self._clip_if_enabled(controller_output)

            # the memory module is driven by the controller output
            next_reads, next_access_state = self.memory(
                controller_output, prev_access_state)
            next_reads = self._clip_if_enabled(next_reads)

            # step output takes the recent memory reads into account
            step_out = tf.concat([controller_output, next_reads], -1)

            next_state = DNCStateTuple(
                controller_state=next_controller_state,
                access_state=next_access_state,
                read_vectors=next_reads)
            return step_out, next_state
def __compute_training_features_for_one_node(dm_original: pd.DataFrame,
                                             node_to_predict: int,
                                             save_info: sl.MemoryAccess,
                                             graph: gc.Graph,
                                             num_of_bins: int,
                                             feature_type: ft.FeatureType,
                                             nodes_to_train_on: [int]) -> None:
    """
    Create test features for one node and training features for its
    training nodes, skipping everything that is already stored.

    :param dm_original: distance matrix of the original graph
    :param node_to_predict: node that is removed from the graph and should
        be predicted
    :param save_info: data access object
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins used to generate the features
    :param feature_type: type of the feature vector that is used
    :param nodes_to_train_on: nodes that are removed from the graph after
        removing node_to_predict to generate training data
    """
    # --- test features for node_to_predict ---
    graph_reduced = graph.delete_node(node_to_predict)
    dm_reduced = calc_avg_distance_matrix(graph=graph_reduced,
                                          removed_nodes=[node_to_predict],
                                          save_info=save_info)

    test_features_exist = save_info.has_training_data(
        [node_to_predict], feature_type=feature_type, num_of_bins=num_of_bins)
    if not test_features_exist:
        diff = cf.create_difference_matrix(dm_original,
                                           dm_reduced,
                                           removed_nodes=[node_to_predict],
                                           save_info=save_info)
        cf.create_features(diff=diff,
                           removed_nodes=[node_to_predict],
                           original_graph=graph,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)
        del diff  # free RAM

    # --- training features for nodes_to_train_on ---
    for node in nodes_to_train_on:
        if save_info.has_training_data(removed_nodes=[node_to_predict, node],
                                       feature_type=feature_type,
                                       num_of_bins=num_of_bins):
            continue

        graph_reduced_2 = graph_reduced.delete_node(node)
        dm_reduced_2 = calc_avg_distance_matrix(
            graph=graph_reduced_2,
            removed_nodes=[node_to_predict, node],
            save_info=save_info)
        print("odm", type(dm_reduced), "rdm", type(dm_reduced_2))

        diff_reduced = cf.create_difference_matrix(
            dm_reduced,
            dm_reduced_2,
            removed_nodes=[node_to_predict, node],
            save_info=save_info)
        print("rdiff", type(diff_reduced), "odm", type(dm_reduced), "rdm",
              type(dm_reduced_2))
        del dm_reduced_2  # free RAM

        cf.create_features(diff=diff_reduced,
                           removed_nodes=[node_to_predict, node],
                           original_graph=graph_reduced,
                           num_of_bins=num_of_bins,
                           save_info=save_info,
                           feature_type=feature_type)
def compute_training_features(save_info: sl.MemoryAccess,
                              graph: gc.Graph,
                              list_nodes_to_predict: [int],
                              nodes_to_train_on: {},
                              num_of_bins: int,
                              feature_type: ft.FeatureType = None,
                              num_eval_iterations: int = None):
    """
    Compute test and training features for all given nodes.

    For diff types with one initial graph the computation is repeated per
    iteration, setting the iteration on the diff type each time. The
    similarity based diff types are delegated to
    ``exp_sim_diff.compute_training_features_from_similarity_diff``; all
    others are processed here in a multiprocessing pool.

    :param save_info: memory access obj
    :param graph: graph the embedding is trained on
    :param list_nodes_to_predict: nodes that are used as test cases
    :param nodes_to_train_on: dict from each node to predict to the list of
        training nodes for that node
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute; defaults to
        DIFF_BIN_WITH_DIM
    :param num_eval_iterations: number of iterations to evaluate for diff
        types with one initial graph; defaults to all iterations
    :raises ValueError: if DIFFERENCE_ONE_INIT has no iteration set, if the
        diff type is not supported, or if no nodes were given at all
    """
    print(
        f"Compute training features on diff type {save_info.get_diff_type()} and graph {str(graph)} "
        f"on nodes {list_nodes_to_predict} "
        f" graph embedding {str(save_info.embedding_type)}")

    if not save_info.get_diff_type().has_one_init_graph():
        iteration_values = [-1]  # -1 marks "no specific iteration"
    elif num_eval_iterations is None:
        iteration_values = list(range(save_info.get_num_iterations()))
    else:
        iteration_values = list(range(num_eval_iterations))

    if feature_type is None:
        feature_type = ft.FeatureType.DIFF_BIN_WITH_DIM

    for diff_iter in iteration_values:
        if diff_iter != -1:
            save_info.get_diff_type().set_iter(diff_iter)

        p_nodes = list_nodes_to_predict
        t_nodes = nodes_to_train_on

        similarity_diff_types = [
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF,
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS,
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT,
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE
        ]
        if save_info.get_diff_type() in similarity_diff_types:
            exp_sim_diff.compute_training_features_from_similarity_diff(
                save_info=save_info,
                graph=graph,
                num_of_bins=num_of_bins,
                feature_type=feature_type,
                p_nodes=p_nodes,
                t_nodes=t_nodes)
            continue

        num_features = len(p_nodes)
        p_nodes, t_nodes = exp_utils.filter_by_already_trained_nodes(
            p_node_list=p_nodes,
            t_node_dict=t_nodes,
            graph=graph,
            save_info=save_info,
            feature_type=feature_type,
            num_bins=num_of_bins)

        if not len(p_nodes) > 0:
            if num_features == 0:
                raise ValueError(
                    "no embeddings found to create training features for")
            print(
                f"All features are already trained. Number of training features {num_features}"
            )
            continue

        # distance matrix of the original graph
        if save_info.get_diff_type() == dt.DiffType.DIFFERENCE_ONE_INIT:
            emb_number = save_info.get_diff_type().get_iter()
            if emb_number == -1 or emb_number is None:
                raise ValueError(
                    "The selected Difference Type requires an iteration number. "
                    "E.g. dt.DiffType.DIFFERENCE_ONE_INIT.set_iter(0).")
            _, dm_original = __calc_dm(graph=graph,
                                       removed_nodes=[],
                                       save_info=save_info,
                                       i=emb_number)
        elif save_info.get_diff_type() == dt.DiffType.DIFFERENCE:
            dm_original = calc_avg_distance_matrix(graph=graph,
                                                   removed_nodes=[],
                                                   save_info=save_info)
        else:
            raise ValueError(
                f"Invalid Difference Type: {save_info.get_diff_type()}")

        worker = functools.partial(
            __compute_training_features_for_one_node_pool, dm_original,
            save_info, graph, num_of_bins, feature_type, t_nodes)

        pool_size = min(config.NUM_CORES, len(p_nodes))
        with multiprocessing.Pool(pool_size) as pool:
            # consume the iterator so that every node is processed
            for _ in pool.imap(worker, p_nodes):
                pass
def compute_training_features_from_similarity_diff(
        save_info: sl.MemoryAccess,
        graph: gc.Graph,
        num_of_bins: int,
        feature_type: ft.FeatureType = None,
        p_nodes: [int] = None,
        t_nodes: {} = None):
    """
    Compute training and test features from similarity based difference
    matrices.

    Nodes whose features already exist are filtered out first. The original
    distance matrices are loaded once and handed to every pool worker.

    :param save_info: memory access obj
    :param graph: graph the embedding is trained on
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param p_nodes: nodes that are used as test cases; ``len()`` is taken
        of it, so it has to be given despite the ``None`` default
    :param t_nodes: dict from each node to predict to the list of training
        nodes for that node
    """
    print('start sequence part')
    sequence_start = time.time()

    num_features = len(p_nodes)
    p_nodes, t_nodes = exp_utils.filter_by_already_trained_nodes(
        p_node_list=p_nodes,
        t_node_dict=t_nodes,
        graph=graph,
        save_info=save_info,
        feature_type=feature_type,
        num_bins=num_of_bins)

    if not len(p_nodes) > 0:
        if num_features == 0:
            print("no embeddings found to create training features for")
        else:
            print(
                f"All features are already trained. Number of training features {num_features}"
            )
        return

    if save_info.get_diff_type().has_iteration():
        assert (save_info.get_diff_type().has_one_init_graph())
        iteration = save_info.get_diff_type().get_iter()
        o_dm_list = dmm.load_dms(removed_nodes=[],
                                 save_info=save_info,
                                 num_iterations=1,
                                 use_specific_iter=iteration)
    else:
        o_dm_list = dmm.load_dms(
            removed_nodes=[],
            save_info=save_info,
            num_iterations=save_info.get_num_iterations())

    # materialise the matrices so they can be shipped to the workers
    worker = functools.partial(compute_training_features_for_one_node_pool,
                               save_info, graph, num_of_bins, feature_type,
                               t_nodes, list(o_dm_list))

    sequence_end = time.time()
    print(f'Sequence duration {sequence_end - sequence_start}')

    pool_start = time.time()
    pool_size = min(config.NUM_CORES, len(p_nodes))
    with multiprocessing.Pool(pool_size) as pool:
        # consume the iterator so that every node is processed
        for _ in pool.imap(worker, p_nodes):
            pass
    pool_end = time.time()
    print(f'Pool duration {pool_end - pool_start}')
def compute_training_features_for_one_node_pool(
        save_info: sl.MemoryAccess, graph: gc.Graph, num_of_bins: int,
        feature_type: ft.FeatureType, nodes_to_train_on: {},
        o_dm_list: [pd.DataFrame], node_to_predict: int):
    '''
    Compute features for one node using the most similar embeddings.

    First the features for removing node_to_predict are created, then the
    features for removing each of its training nodes in addition.

    :param save_info: memory access obj; its diff type must be one of the
        MOST_SIMILAR_EMBS_DIFF* types, otherwise the lookup raises KeyError
    :param graph: original graph
    :param num_of_bins: number of bins the feature vector should use
    :param feature_type: type of the features to compute
    :param nodes_to_train_on: dict from node_to_predict to its training nodes
    :param o_dm_list: distance matrices of the original graph
    :param node_to_predict: node removed first
    :return: None
    '''
    num_iter = save_info.get_num_iterations()
    # number of embeddings used for G, G' and G'' per diff type
    quantities_per_type = {
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF: [1, num_iter, 1],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ALL_EMBS:
        [num_iter, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT: [1, num_iter, num_iter],
        dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE:
        [1, num_iter, num_iter]
    }
    qty_g, qty_g_prime, qty_g_prime_2 = quantities_per_type[
        save_info.get_diff_type()]
    used_emb = save_info.get_diff_type().get_iter()

    # attack features: diff(G, G')
    diff, min_r_dm = dmm.compute_diff_matrix(removed_nodes=[node_to_predict],
                                             save_info=save_info,
                                             quantity_first=qty_g,
                                             quantity_second=qty_g_prime,
                                             used_emb=used_emb,
                                             o_dm_list=o_dm_list)
    cf.create_features(diff=diff,
                       removed_nodes=[node_to_predict],
                       original_graph=graph,
                       num_of_bins=num_of_bins,
                       feature_type=feature_type,
                       save_info=save_info)

    # training features: diff(G', G'')
    o_dm_list_t = None
    if save_info.is_diff_type(
            dt.DiffType.MOST_SIMILAR_EMBS_DIFF_ONE_INIT_CONTINUE):
        # this diff type reuses the dm of G' selected for diff(G,G')
        o_dm_list_t = [min_r_dm]
        qty_g_prime = 1

    g_prime = graph.delete_node(removed_node=node_to_predict)
    for tr_node in nodes_to_train_on[node_to_predict]:
        removed_nodes = [node_to_predict, tr_node]
        diff, _ = dmm.compute_diff_matrix(removed_nodes=removed_nodes,
                                          save_info=save_info,
                                          quantity_first=qty_g_prime,
                                          quantity_second=qty_g_prime_2,
                                          used_emb=used_emb,
                                          o_dm_list=o_dm_list_t)
        cf.create_features(diff=diff,
                           removed_nodes=removed_nodes,
                           original_graph=g_prime,
                           num_of_bins=num_of_bins,
                           feature_type=feature_type,
                           save_info=save_info)
def test(save_info: sl.MemoryAccess,
         graph: gc.Graph,
         feature_type: ft.FeatureType,
         num_of_bins: int,
         list_nodes_to_predict: List[int],
         nodes_to_train_on: Dict[int, List[int]],
         classifier: [] = None,
         sampling_strategy=None,
         save: bool = True,
         limit_num_training_graphs: int = 10,
         check_for_existing: bool = True,
         num_eval_iterations: int = None):
    """
    Evaluate every classifier on every node to predict and collect results.

    For diff types with one initial graph the evaluation is repeated per
    iteration, setting the iteration on the diff type each time. Per
    classifier ``test_per_node`` is run in a multiprocessing pool.

    :param save_info: memory access obj
    :param graph: graph the embedding is trained on
    :param feature_type: type of the features to evaluate
    :param num_of_bins: number of bins of the feature vector
    :param list_nodes_to_predict: test nodes; must not be empty
    :param nodes_to_train_on: dict from test node to its training nodes
    :param classifier: list of classifiers; defaults to a fixed set of
        scikit-learn classifiers
    :param sampling_strategy: optional sampling strategy
    :param save: store per-classifier and overall results
    :param limit_num_training_graphs: limit forwarded to ``test_per_node``
        and encoded in the result names
    :param check_for_existing: skip an iteration whose overall results are
        already available
    :param num_eval_iterations: number of iterations to evaluate for diff
        types with one initial graph; defaults to all iterations
    """
    if not save_info.get_diff_type().has_one_init_graph():
        diff_iter = [-1]  # -1 marks "no specific iteration"
    elif num_eval_iterations is None:
        diff_iter = range(save_info.get_num_iterations())
    else:
        diff_iter = range(num_eval_iterations)

    for i in diff_iter:
        if i >= 0:
            save_info.get_diff_type().set_iter(i)

        if classifier is None:
            classifier = [
                KNeighborsClassifier(),
                SVC(kernel="linear", probability=True),
                DecisionTreeClassifier(),
                RandomForestClassifier(),
                AdaBoostClassifier(),
                GaussianNB()
            ]

        target_overall_file_name = get_overall_results_name(
            feature_type=feature_type.to_str(num_of_bins),
            sampling_strategy=sampling_strategy,
            diff_type=save_info.diff_type,
            num_iterations=save_info.num_iterations,
            num_tr_graphs_limit=limit_num_training_graphs)

        # nothing to do if the overall results already exist
        if check_for_existing and full_test_results_available(
                target_overall_file_name=target_overall_file_name,
                save_info=save_info,
                classifier=classifier):
            continue

        assert (len(list_nodes_to_predict) > 0)
        print(f"data is available for nodes: {list_nodes_to_predict}")

        overall_results = pd.DataFrame()
        for c in classifier:
            results_per_node = pd.DataFrame()

            exp_per_node = functools.partial(test_per_node, nodes_to_train_on,
                                             graph, save_info, feature_type,
                                             num_of_bins,
                                             limit_num_training_graphs,
                                             sampling_strategy, c)
            pool_size = min(config.NUM_CORES, len(list_nodes_to_predict))
            with multiprocessing.Pool(pool_size) as pool:
                for res in pool.imap(exp_per_node, list_nodes_to_predict):
                    # each result is indexed as (results, column label)
                    results_per_node[res[1]] = res[0]

            sampling_str = (
                f"_sampling={sampling_strategy.to_str(num_of_bins)}"
                if sampling_strategy else "")
            str_limit = (f"_num_tr_graphs_{limit_num_training_graphs}"
                         if limit_num_training_graphs is not None else "")
            classifier_name = str(c).split("(")[0]

            if save:
                save_info.save_test_results(
                    results_per_node.T,
                    f"TestResults_ft={feature_type.to_str(num_of_bins)}_Cassifier="
                    + str(save_info.diff_type) + classifier_name +
                    sampling_str +
                    f"_num_iterations_{save_info.num_iterations}" + str_limit)

            results_per_classifier = _create_test_results_over_all_experiments(
                results_per_node)
            overall_results[classifier_name] = pd.Series(
                results_per_classifier.T)

        if save:
            save_info.save_test_results(results=overall_results,
                                        name=target_overall_file_name)

        print(f"Graph {save_info.graph}, "
              f"emb {save_info.embedding_type}, "
              f"dt {save_info.get_diff_type().to_str()}, "
              f"ft {feature_type.to_str(num_of_bins)}, "
              f"limit_tr_graphs {limit_num_training_graphs}")
        print(overall_results)