def get_max_nodes_count(graph_meta_data_of_num):
    node_counts = []
    for graph_path, class_lbl in graph_meta_data_of_num.itervalues():
        G = pz.load(graph_path)
        node_counts.append(G.number_of_nodes())

    return max(node_counts)
def compute_kernel_mat(graph_meta_data_of_num, param_range=[None]):
    kernel_mat_comp_start_time = time.time()

    kernel_mat_comp_time_of_param = {}
    kernel_mat_of_param = {}

    num_graphs = len(graph_meta_data_of_num)

    kernel_mat = np.zeros((num_graphs, num_graphs), dtype=np.float64)

    # decaying factor LAMBDA for down-weighting longer walks
    # (alternatively: LAMBDA = get_lambda(graph_meta_data_of_num))
    LAMBDA = -4

    # =========================================================================
    # 1) precompute the (sparse) adjacency matrices of the graphs in the
    #    dataset
    # =========================================================================
    adj_mats = []

    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
        # load graph
        G = pz.load(graph_path)
        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight=None)
        adj_mats.append(A)

    # =========================================================================
    # 2) compute kernel matrix over all graphs in the dataset
    # =========================================================================
    for i in xrange(num_graphs):
        A_i = adj_mats[i].todense()

        for j in xrange(i, num_graphs):
            A_j = adj_mats[j].todense()

            # apply the preconditioned conjugate gradient method
            b = np.ones((A_i.shape[0] * A_j.shape[0], 1))
            x, flag, relres, iter_, resvec \
                = pcg.pcg(lambda x: smtfilter(x, A_i, A_j, LAMBDA), b,
                          1e-6, 20)

            kernel_mat[i, j] = np.sum(x)
            if i != j:
                kernel_mat[j, i] = kernel_mat[i, j]

            print 'i =', i, 'j =', j, kernel_mat[i, j]

    kernel_mat_of_param[None] = kernel_mat

    kernel_mat_comp_end_time = time.time()
    kernel_mat_comp_time_of_param[None] = kernel_mat_comp_end_time \
        - kernel_mat_comp_start_time

    return kernel_mat_of_param, kernel_mat_comp_time_of_param
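
# The helper smtfilter used above is defined elsewhere in the repo. Judging
# from its use as the operator handed to pcg.pcg, it has to apply the system
# matrix I - LAMBDA * (A_i kron A_j) to a vector without materializing the
# Kronecker product. A minimal sketch under that assumption; the name
# smtfilter_sketch and the column-major vec convention are assumptions, not
# the repo's actual implementation:
import numpy as np

def smtfilter_sketch(x, A_i, A_j, lambda_):
    # vec trick: (A_i kron A_j) vec(X) = vec(A_j X A_i^T)
    A_i, A_j = np.asarray(A_i), np.asarray(A_j)
    x = np.asarray(x).reshape(-1, 1)
    n_i, n_j = A_i.shape[0], A_j.shape[0]
    X = x.reshape((n_j, n_i), order='F')        # undo column-major vec
    Y = A_j.dot(X).dot(A_i.T)                   # (A_i kron A_j) x, as matrix
    return x - lambda_ * Y.reshape((-1, 1), order='F')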
def extract_features(graph_meta_data_of_num, node_del_fracs):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    time_to_subtract_of_param = defaultdict(int)
    mat_constr_times = []

    num_graphs = len(graph_meta_data_of_num)
    avg_nodes_count = get_avg_nodes_count(graph_meta_data_of_num)
    # alternatively: avg_nodes_count = get_max_nodes_count(graph_meta_data_of_num)

    feature_mat = np.zeros((num_graphs, int(avg_nodes_count)),
                           dtype=np.float64)

    submat_col_count_of_node_del_frac = {}
    for node_del_frac in node_del_fracs:
        submat_col_count_of_node_del_frac[node_del_frac] \
            = int(node_del_frac * avg_nodes_count)

    conv_count = 0
    no_conv_count = 0

    # =========================================================================
    # 1) extract features iterating over all graphs in the dataset
    # =========================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
        # load graph
        G = pz.load(graph_path)

        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight=None).astype('d')

        # calculate the adjacency matrix of the undirected version of G
        if nx.is_directed(G):
            A = A + A.T

        nodes_count = len(G.node)
        upd_row_idx_of_orig_row_idx = dict(izip(xrange(nodes_count),
                                                xrange(nodes_count)))

        # get pairs (node_num, degree) sorted by degree in ascending order
        node_num_degree_pairs = get_node_num_degree_pairs(G)

        j = 0
        last_j = -1
        speed = 1

        while j < min(nodes_count, int(avg_nodes_count)):
            sys.stdout.write('i = ' + str(i) + ' (|V| = ' + str(nodes_count)
                             + '), j = ' + str(j) + ': ')

            inner_loop_start_time = time.time()

            # store the largest eigenvalue of A in the feature matrix
            try:
                feature_mat[i, j] = eigsh(A, which='LA', k=1,
                                          maxiter=20 * A.shape[0],
                                          return_eigenvectors=False)
                # algorithm converged
                print(str(feature_mat[i, j]))

                if j == 0:
                    last_j = 0
                conv_count += 1
            except (ArpackError, ArpackNoConvergence):
                if j > 0:
                    feature_mat[i, j] = feature_mat[i, j - 1]
                    print(str(feature_mat[i, j - 1]) + ' [NO CONVERGENCE]')
                no_conv_count += 1

            if last_j < 0:
                # no iteration with convergence so far
                if j > 0:
                    speed *= 2
            else:
                # "interpolate" at the skipped dimensions of the i-th feature
                # vector
                feature_mat[i, last_j + 1:j] = feature_mat[i, j]

                if abs(feature_mat[i, j] - feature_mat[i, last_j]) > 1e-5:
                    last_j = j
                    speed = 1
                else:
                    if j > 0:
                        speed *= 2

            # determine the node number, which corresponds to the node with
            # the smallest degree, and remove the corresponding row and
            # column of the (original) adjacency matrix of G
            for k in xrange(j, min(j + speed, nodes_count,
                                   int(avg_nodes_count))):
                if A.shape[0] <= 2:
                    break

                node_num_smallest_deg = node_num_degree_pairs[k][0]
                del_idx = upd_row_idx_of_orig_row_idx[node_num_smallest_deg]
                A = del_row_and_col_at_idx(A, del_idx)
                upd_row_idx_of_orig_row_idx = update_row_idxs(
                    upd_row_idx_of_orig_row_idx, node_num_smallest_deg)

                inner_loop_end_time = time.time()
                inner_loop_time = inner_loop_end_time - inner_loop_start_time

                # ascending order, so that we can break at the first fraction
                # whose column budget k has not yet reached
                for node_del_frac in sorted(node_del_fracs):
                    if k >= submat_col_count_of_node_del_frac[node_del_frac]:
                        time_to_subtract_of_param[node_del_frac] \
                            += inner_loop_time
                    else:
                        break

            if A.shape[0] <= 2:
                break

            if (j < min(nodes_count, int(avg_nodes_count)) - 1) \
                    and (j + speed) >= min(nodes_count,
                                           int(avg_nodes_count)):
                feature_mat[i, j + 1:] = feature_mat[i, j]

            j += speed

    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time

    for node_del_frac in node_del_fracs:
        mat_constr_start_time = time.time()

        submat_col_count = submat_col_count_of_node_del_frac[node_del_frac]
        feature_mat_of_param[node_del_frac] \
            = feature_mat[:, 0:submat_col_count]

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time

        extr_time_of_param[node_del_frac] = extr_time + mat_constr_time \
            - time_to_subtract_of_param[node_del_frac] - sum(mat_constr_times)

        mat_constr_times.append(mat_constr_time)

    print('\nConvergence ratio: %.3f\n'
          % (conv_count / (conv_count + no_conv_count)))

    return feature_mat_of_param, extr_time_of_param
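
# The helpers get_node_num_degree_pairs, del_row_and_col_at_idx and
# update_row_idxs are defined elsewhere in the module. Plausible minimal
# re-implementations, shown only to make the control flow above easier to
# follow; the _sketch names are hypothetical, not the repo's actual code:
import numpy as np

def get_node_num_degree_pairs_sketch(G):
    # (node_num, degree) pairs sorted by degree in ascending order
    return sorted(G.degree().items(), key=lambda pair: pair[1])

def del_row_and_col_at_idx_sketch(A, idx):
    # drop row idx and column idx from a (dense) adjacency matrix
    A = np.asarray(A)
    keep = [i for i in xrange(A.shape[0]) if i != idx]
    return A[np.ix_(keep, keep)]

def update_row_idxs_sketch(upd_row_idx_of_orig_row_idx, deleted_node_num):
    # after deleting one row/column, shift the matrix indices of all
    # original node numbers whose row lay behind the deleted one
    del_idx = upd_row_idx_of_orig_row_idx.pop(deleted_node_num)
    return dict((orig, idx - 1 if idx > del_idx else idx)
                for orig, idx in upd_row_idx_of_orig_row_idx.iteritems())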
class_folders = utils.list_sub_dirs(SOURCE_CLASSES_PATH)

compressed_graphs_count = 0

with open(join(SOURCE_CLASSES_PATH, 'hash_num_map.txt'), 'w') as f:
    for class_folder in class_folders:
        source_class_path = join(SOURCE_CLASSES_PATH, class_folder)
        target_class_path = join('pz', class_folder)
        os.makedirs(target_class_path)

        graph_file_names = utils.list_files(source_class_path)

        for graph_file_name in graph_file_names:
            G_uncompr = pz.load(join(source_class_path, graph_file_name))

            if G_uncompr.number_of_nodes() == 0:
                print 'Warning! Graph ' + graph_file_name + ' has no nodes!'
            if G_uncompr.number_of_edges() == 0:
                print 'Warning! Graph ' + graph_file_name + ' has no edges!'

            G_compr = nx.DiGraph()
            id_to_num_mapper = utils.Id_to_num_mapper()

            # process nodes
            for node_id_tuple, lbl_dict in G_uncompr.node.iteritems():
                node_id = '\n'.join(node_id_tuple)
                node_num = id_to_num_mapper.map_id_to_num(node_id)
graphs_of_class = dataset_loader.get_graphs_of_class_dict(
    graph_meta_data_of_num)
classes = graphs_of_class.keys()

# calculate statistics
node_counts = []
edge_counts = []
degrees = []          # mean degree of each graph
min_deg = float('inf')
max_deg = 0
number_of_isolated_nodes = 0

for graph_path, class_lbl in graph_meta_data_of_num.itervalues():
    G = pz.load(graph_path)

    node_counts.append(G.number_of_nodes())
    edge_counts.append(G.number_of_edges())
    degrees.append(np.mean(G.degree().values()))

    if min(G.degree().values()) < min_deg:
        min_deg = min(G.degree().values())
    if max(G.degree().values()) > max_deg:
        max_deg = max(G.degree().values())

    for degree in G.degree().values():
        if degree == 0:
            number_of_isolated_nodes += 1

avg_v = np.mean(node_counts)
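
# The excerpt breaks off right after avg_v; a plausible continuation that
# just reports the collected statistics (the variable names come from the
# code above, the print format is illustrative):
avg_e = np.mean(edge_counts)
avg_deg = np.mean(degrees)

print('graphs: %d, classes: %d' % (len(node_counts), len(classes)))
print('avg |V| = %.2f, avg |E| = %.2f' % (avg_v, avg_e))
print('avg degree = %.2f (min %d, max %d)' % (avg_deg, min_deg, max_deg))
print('isolated nodes: %d' % number_of_isolated_nodes)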
def extract_features(graph_meta_data_of_num, h_range):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []

    h_max = max(h_range)

    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)

    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)

    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)

    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)

    # keys are the node labels which are stored in the dataset and the values
    # are new compressed labels
    compr_func = {}

    # next_compr_lbl is used for assigning new compressed labels to the nodes.
    # These build the features (= columns in feature_mat) used for the
    # explicit graph embedding.
    next_compr_lbl = 0

    # =========================================================================
    # 1) extract features iterating over all graphs in the dataset
    # =========================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in \
                graph_meta_data_of_num.iteritems():
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)

            # load graph
            G = pz.load(graph_path)

            for v in G.nodes_iter():
                if h == 0:
                    uncompr_lbl = G.node[v]['label']
                    if isinstance(uncompr_lbl, np.ndarray):
                        uncompr_lbl = utils.calc_hash_of_array(uncompr_lbl)
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] \
                            = upd_lbls_dict[graph_num][v]
                        continue

                    # determine the list of labels of the nodes adjacent to v
                    nbrs_lbls = []
                    for v_nbr in nbrs_iter:
                        nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])

                    # sort nbrs_lbls in ascending order
                    if len(nbrs_lbls) > 1:
                        nbrs_lbls.sort()

                    # concatenate the neighboring labels to the label of v
                    uncompr_lbl = str(upd_lbls_dict[graph_num][v])
                    if len(nbrs_lbls) == 1:
                        uncompr_lbl += ',' + str(nbrs_lbls[0])
                    elif len(nbrs_lbls) > 1:
                        uncompr_lbl += ',' + ','.join(map(str, nbrs_lbls))

                if uncompr_lbl not in compr_func:
                    # assign a compressed label new_compr_lbl to uncompr_lbl
                    new_compr_lbl = next_compr_lbl
                    compr_func[uncompr_lbl] = new_compr_lbl
                    next_compr_lbl += 1
                else:
                    # determine the compressed label new_compr_lbl assigned
                    # to uncompr_lbl
                    new_compr_lbl = compr_func[uncompr_lbl]

                if new_compr_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])

                    idx_of_lbl_dict[graph_num][new_compr_lbl] = idx

                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    features_dict[graph_num].append(new_compr_lbl)

                    # set the number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_compr_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    idx = idx_of_lbl_dict[graph_num][new_compr_lbl]

                    # increase the number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                    feature_counts_dict[graph_num][idx] += 1

                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == compr_func[lbl]
                    # == new_compr_lbl
                    next_upd_lbls_dict[graph_num][v] = new_compr_lbl

        # =====================================================================
        # 2) construct the data matrix whose i-th row equals the i-th feature
        #    vector, which comprises the features of the first h iterations
        # =====================================================================
        mat_constr_start_time = time.time()

        # list containing the features of all graphs
        features = []
        # list containing the corresponding feature counts of all graphs
        feature_counts = []
        # list indicating to which graph (= row in feature_mat) the features
        # in the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified
        # entries for row i. Consequently, the number of rows of feature_mat
        # equals len(feature_ptr) - 1.
        feature_ptr = [0]

        for graph_num in graph_meta_data_of_num.iterkeys():
            features += features_dict[graph_num]
            feature_counts += feature_counts_dict[graph_num]
            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))

        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #  ...
        #  feature vector of the last graph]
        feature_mat = csr_matrix((np.array(feature_counts),
                                  np.array(features),
                                  np.array(feature_ptr)),
                                 shape=(len(graph_meta_data_of_num),
                                        len(compr_func)),
                                 dtype=np.float64)
        feature_mat_of_param[h] = feature_mat

        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)

        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time

        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)

    return feature_mat_of_param, extr_time_of_param
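
# Usage sketch: the Weisfeiler-Lehman subtree kernel matrix for a given h is
# simply the Gram matrix F * F^T of the explicit embedding returned above.
# h_range and the choice h = 2 below are illustrative:
feature_mat_of_param, extr_time_of_param \
    = extract_features(graph_meta_data_of_num, range(3))
F = feature_mat_of_param[2]            # csr_matrix, one row per graph
wl_kernel_mat = (F * F.T).todense()    # Gram matrix of the feature vectors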
def extract_features(graph_meta_data_of_num, param_range=[None]):
    extr_start_time = time.time()

    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)

    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)

    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)

    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    upd_lbls_dict = defaultdict(dict)

    # keys are the node labels which are stored in the dataset and the values
    # are new compressed labels
    compr_func = {}

    # next_compr_lbl is used for assigning new compressed labels to the nodes.
    # These build the features (= columns in feature_mat) used for the
    # explicit graph embedding.
    next_compr_lbl = 0

    # iterate over all graphs in the dataset (h == 0)
    for graph_num, (graph_path, class_lbl) in \
            graph_meta_data_of_num.iteritems():
        G = pz.load(graph_path)

        for v in G:
            uncompr_lbl = G.node[v]['label']

            if uncompr_lbl not in compr_func:
                # assign a compressed label new_compr_lbl to uncompr_lbl
                new_compr_lbl = next_compr_lbl
                compr_func[uncompr_lbl] = new_compr_lbl
                next_compr_lbl += 1
            else:
                # determine the compressed label new_compr_lbl assigned to
                # uncompr_lbl
                new_compr_lbl = compr_func[uncompr_lbl]

            if new_compr_lbl not in idx_of_lbl_dict[graph_num]:
                # len(feature_counts_dict[graph_num])
                # == len(features_dict[graph_num])
                idx = len(feature_counts_dict[graph_num])

                idx_of_lbl_dict[graph_num][new_compr_lbl] = idx

                # features_dict[graph_num][idx]
                # == feature upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                features_dict[graph_num].append(new_compr_lbl)

                # set the number of occurrences of the feature
                # upd_lbls_dict[graph_num][v] (== new_compr_lbl) to 1
                feature_counts_dict[graph_num].append(1)
            else:
                idx = idx_of_lbl_dict[graph_num][new_compr_lbl]

                # increase the number of occurrences of the feature
                # upd_lbls_dict[graph_num][v] (== new_compr_lbl)
                feature_counts_dict[graph_num][idx] += 1

            # upd_lbls_dict[graph_num][v] == compr_func[lbl] == new_compr_lbl
            upd_lbls_dict[graph_num][v] = new_compr_lbl

    # list containing the features of all graphs
    features = []
    # list containing the corresponding feature counts of all graphs
    feature_counts = []
    # list indicating to which graph (= row in feature_mat) the features in
    # the list features belong. The difference feature_ptr[i+1]-feature_ptr[i]
    # equals the number of specified entries for row i. Consequently, the
    # number of rows of feature_mat equals len(feature_ptr) - 1.
    feature_ptr = [0]

    for graph_num in graph_meta_data_of_num.iterkeys():
        features += features_dict[graph_num]
        feature_counts += feature_counts_dict[graph_num]
        feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))

    # feature_mat is of type csr_matrix and has the following form:
    # [feature vector of the first graph,
    #  feature vector of the second graph,
    #  ...
    #  feature vector of the last graph]
    feature_mat = csr_matrix((np.array(feature_counts), np.array(features),
                              np.array(feature_ptr)),
                             shape=(len(graph_meta_data_of_num),
                                    len(compr_func)),
                             dtype=np.float64)

    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time

    return {None: feature_mat}, {None: extr_time}
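
# The (data, indices, indptr) CSR constructor used above is easy to get
# wrong; a tiny self-contained example of the same layout with made-up
# numbers (graph 0 has feature 0 three times and feature 2 once, graph 1
# has feature 1 twice):
import numpy as np
from scipy.sparse import csr_matrix

feature_counts = [3, 1, 2]   # data
features = [0, 2, 1]         # column indices
feature_ptr = [0, 2, 3]      # row i owns entries feature_ptr[i]:feature_ptr[i+1]

F = csr_matrix((np.array(feature_counts), np.array(features),
                np.array(feature_ptr)), shape=(2, 3), dtype=np.float64)
print(F.todense())
# [[ 3.  0.  1.]
#  [ 0.  2.  0.]]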
def extract_features(graph_meta_data_of_num, h_range, count_sensitive=True,
                     all_iter=False):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []

    h_max = max(h_range)

    BIT_LBL_LEN = 16

    # rotate left
    rot_left = lambda val, r_bits: \
        (val << r_bits % BIT_LBL_LEN) & (2**BIT_LBL_LEN - 1) | \
        ((val & (2**BIT_LBL_LEN - 1))
         >> (BIT_LBL_LEN - (r_bits % BIT_LBL_LEN)))

    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)

    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)

    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)

    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)

    # keys are the node labels which are stored in the dataset and the values
    # are random bit labels of length BIT_LBL_LEN
    label_map = {}

    # =========================================================================
    # 1) extract features iterating over all graphs in the dataset
    # =========================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in \
                graph_meta_data_of_num.iteritems():
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)

            # load graph
            G = pz.load(graph_path)

            for v in G.nodes_iter():
                if h == 0:
                    orig_lbl = G.node[v]['label']
                    if isinstance(orig_lbl, np.ndarray):
                        orig_lbl = utils.calc_hash_of_array(orig_lbl)

                    if orig_lbl not in label_map:
                        # assign a random bit label new_bit_lbl to orig_lbl
                        new_bit_lbl = randint(1, 2**BIT_LBL_LEN - 1)
                        label_map[orig_lbl] = new_bit_lbl
                    else:
                        # determine the bit label new_bit_lbl assigned to
                        # orig_lbl
                        new_bit_lbl = label_map[orig_lbl]
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] \
                            = upd_lbls_dict[graph_num][v]
                        continue

                    if not count_sensitive:
                        # apply the simple neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for v_nbr in nbrs_iter:
                            new_bit_lbl ^= upd_lbls_dict[graph_num][v_nbr]
                    else:
                        # determine the list of labels of the nodes adjacent
                        # to v
                        nbrs_lbls = []
                        for v_nbr in nbrs_iter:
                            nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])

                        # determine the number of occurrences of each
                        # neighbor label
                        num_of_nbr_lbl = {}
                        if len(nbrs_lbls) == 1:
                            nbr_lbl = nbrs_lbls[0]
                            num_of_nbr_lbl[nbr_lbl] = 1
                        else:
                            # len(nbrs_lbls) > 1
                            # sort nbrs_lbls in ascending order
                            nbrs_lbls.sort()

                            prev_nbr_lbl = nbrs_lbls[0]
                            c = 1
                            for nbr_lbl in nbrs_lbls[1:]:
                                if nbr_lbl == prev_nbr_lbl:
                                    c += 1
                                else:
                                    num_of_nbr_lbl[prev_nbr_lbl] = c
                                    prev_nbr_lbl = nbr_lbl
                                    c = 1
                            num_of_nbr_lbl[nbr_lbl] = c

                        # apply the count sensitive neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for nbr_lbl, num in num_of_nbr_lbl.iteritems():
                            new_bit_lbl ^= rot_left(nbr_lbl ^ num, num)

                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == label_map[lbl]
                    # == new_bit_lbl
                    next_upd_lbls_dict[graph_num][v] = new_bit_lbl

                if new_bit_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])

                    idx_of_lbl_dict[graph_num][new_bit_lbl] = idx

                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    features_dict[graph_num].append(new_bit_lbl)

                    # set the number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    idx = idx_of_lbl_dict[graph_num][new_bit_lbl]

                    # increase the number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    feature_counts_dict[graph_num][idx] += 1

        # =====================================================================
        # 2) compress bit labels and construct the data matrix whose i-th row
        #    equals the i-th feature vector
        # =====================================================================
        mat_constr_start_time = time.time()

        # list containing the features of all graphs
        features = []
        # list containing the corresponding feature counts of all graphs
        feature_counts = []
        # list indicating to which graph (= row in feature_mat) the features
        # in the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified
        # entries for row i. Consequently, the number of rows of feature_mat
        # equals len(feature_ptr) - 1.
        feature_ptr = [0]

        # keys are the bit labels and the values are new compressed labels
        compr_func = {}
        # next_compr_lbl is used for assigning new compressed labels to the
        # nodes. These build the features (= columns in feature_mat), which
        # are used for the explicit graph embedding.
        next_compr_lbl = 0

        for graph_num in graph_meta_data_of_num.iterkeys():
            for bit_lbl, bit_lbl_count in \
                    itools.izip(features_dict[graph_num],
                                feature_counts_dict[graph_num]):
                if bit_lbl not in compr_func:
                    compr_func[bit_lbl] = next_compr_lbl
                    compr_lbl = next_compr_lbl
                    next_compr_lbl += 1
                else:
                    compr_lbl = compr_func[bit_lbl]

                features.append(compr_lbl)
                feature_counts.append(bit_lbl_count)

            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))

        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #  ...
        #  feature vector of the last graph]
        feature_mat = csr_matrix((np.array(feature_counts),
                                  np.array(features),
                                  np.array(feature_ptr)),
                                 dtype=np.float64)
        feature_mat_of_param[h] = feature_mat

        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)

        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time

        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)

            if not all_iter:
                features_dict = defaultdict(list)
                feature_counts_dict = defaultdict(list)
                idx_of_lbl_dict = defaultdict(dict)

    return feature_mat_of_param, extr_time_of_param
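
# A quick check of the rot_left lambda above (16-bit labels): bits shifted
# off the top wrap around to the bottom, and rotations compose modulo the
# label length. The values below are illustrative only:
BIT_LBL_LEN = 16
rot_left = lambda val, r_bits: \
    (val << r_bits % BIT_LBL_LEN) & (2**BIT_LBL_LEN - 1) | \
    ((val & (2**BIT_LBL_LEN - 1)) >> (BIT_LBL_LEN - (r_bits % BIT_LBL_LEN)))

print(bin(rot_left(0b1000000000000001, 1)))        # 0b11
print(rot_left(rot_left(0xABCD, 7), 9) == 0xABCD)  # True (7 + 9 = 16)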
def extract_features(graph_meta_data_of_num, node_del_fracs):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    time_to_subtract_of_param = defaultdict(int)
    mat_constr_times = []

    num_graphs = len(graph_meta_data_of_num)
    avg_nodes_count = get_avg_nodes_count(graph_meta_data_of_num)

    feature_mat = np.zeros((num_graphs, int(avg_nodes_count)),
                           dtype=np.float64)

    submat_col_count_of_node_del_frac = {}
    for node_del_frac in node_del_fracs:
        submat_col_count_of_node_del_frac[node_del_frac] \
            = int(node_del_frac * avg_nodes_count)

    conv_count = 0
    no_conv_count = 0

    # =========================================================================
    # 1) extract features iterating over all graphs in the dataset
    # =========================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
        # load graph
        G = pz.load(graph_path)

        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight=None).astype('d')

        # calculate the adjacency matrix of the undirected version of G
        if nx.is_directed(G):
            A = A + A.T
            A[A > 1] = 1

        nodes_count = len(G.node)
        upd_row_idx_of_orig_row_idx = dict(izip(xrange(nodes_count),
                                                xrange(nodes_count)))

        # get pairs (node_num, degree) sorted by degree in ascending order
        node_num_degree_pairs = get_node_num_degree_pairs(G)

        j = 0
        last_j = -1
        speed = 1

        while j < min(nodes_count, int(avg_nodes_count)):
            sys.stdout.write('i = ' + str(i) + ' (|V| = ' + str(nodes_count)
                             + '), j = ' + str(j) + ': ')

            inner_loop_start_time = time.time()

            # add the largest eigenvalue of A to the i-th feature vector
            try:
                feature_mat[i, j] = eigsh(A, which='LA', k=1,
                                          maxiter=20 * A.shape[0],
                                          return_eigenvectors=False)
                # algorithm converged
                print(str(feature_mat[i, j]))

                if j == 0:
                    last_j = 0
                conv_count += 1
            except (ArpackError, ArpackNoConvergence):
                if j > 0:
                    feature_mat[i, j] = feature_mat[i, j - 1]
                    print(str(feature_mat[i, j - 1]) + ' [NO CONVERGENCE]')
                no_conv_count += 1

            if last_j < 0:
                # no iteration with convergence so far
                if j > 0:
                    speed *= 2
            else:
                # "interpolate" at the skipped dimensions of the i-th feature
                # vector
                feature_mat[i, last_j + 1:j] = feature_mat[i, j]

                if abs(feature_mat[i, j] - feature_mat[i, last_j]) > 1e-5:
                    last_j = j
                    speed = 1
                else:
                    # abs(feature_mat[i, j] - feature_mat[i, last_j]) <= 1e-5
                    if j > 0:
                        # double the speed in order to avoid unnecessary
                        # eigenvalue computations
                        speed *= 2

            inner_loop_end_time = time.time()
            inner_loop_time = inner_loop_end_time - inner_loop_start_time

            for node_del_frac in sorted(node_del_fracs):
                if j >= submat_col_count_of_node_del_frac[node_del_frac]:
                    time_to_subtract_of_param[node_del_frac] \
                        += inner_loop_time
                else:
                    break

            # determine the node number, which corresponds to the node with
            # the smallest degree, and remove the corresponding row and
            # column of the (original) adjacency matrix of G
            for k in xrange(j, min(j + speed, nodes_count,
                                   int(avg_nodes_count))):
                if A.shape[0] <= 2:
                    break

                inner_loop_start_time = time.time()

                node_num_smallest_deg = node_num_degree_pairs[k][0]
                del_idx = upd_row_idx_of_orig_row_idx[node_num_smallest_deg]
                A = del_row_and_col_at_idx(A, del_idx)
                upd_row_idx_of_orig_row_idx = update_row_idxs(
                    upd_row_idx_of_orig_row_idx, node_num_smallest_deg)

                inner_loop_end_time = time.time()
                inner_loop_time = inner_loop_end_time - inner_loop_start_time

                for node_del_frac in sorted(node_del_fracs):
                    if k >= submat_col_count_of_node_del_frac[node_del_frac]:
                        time_to_subtract_of_param[node_del_frac] \
                            += inner_loop_time
                    else:
                        break

            if A.shape[0] <= 2:
                break

            if (j < min(nodes_count, int(avg_nodes_count)) - 1) \
                    and (j + speed) >= min(nodes_count,
                                           int(avg_nodes_count)):
                # "interpolate" at the last dimensions of the i-th feature
                # vector
                feature_mat[i, j + 1:] = feature_mat[i, j]

            j += speed

    extr_end_time = time.time()
    extr_time = extr_end_time - extr_start_time

    for node_del_frac in node_del_fracs:
        mat_constr_start_time = time.time()

        submat_col_count = submat_col_count_of_node_del_frac[node_del_frac]
        feature_mat_of_param[node_del_frac] \
            = feature_mat[:, :submat_col_count]

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time

        extr_time_of_param[node_del_frac] = extr_time + mat_constr_time \
            - time_to_subtract_of_param[node_del_frac] - sum(mat_constr_times)

        mat_constr_times.append(mat_constr_time)

    print('\nConvergence ratio: %.3f\n'
          % (conv_count / (conv_count + no_conv_count)))

    return feature_mat_of_param, extr_time_of_param
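
# Hypothetical driver for the spectral feature extraction above, assuming
# graph_meta_data_of_num was obtained via dataset_loader as in the other
# scripts of the repo (the fractions are illustrative):
node_del_fracs = [0.1, 0.5, 1.0]
feature_mat_of_param, extr_time_of_param \
    = extract_features(graph_meta_data_of_num, node_del_fracs)

for frac in node_del_fracs:
    F = feature_mat_of_param[frac]
    print('frac = %.1f: feature matrix %s, extraction took %.2f s'
          % (frac, str(F.shape), extr_time_of_param[frac]))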
import inspect
import sys

from os.path import abspath, dirname, join

# determine script path
SCRIPT_PATH = inspect.getframeinfo(inspect.currentframe()).filename
SCRIPT_FOLDER_PATH = dirname(abspath(SCRIPT_PATH))
# modify the search path for modules in order to access modules in subfolders
# of the script's parent directory
sys.path.append(join(SCRIPT_FOLDER_PATH, '..', '..'))

from misc import dataset_loader, pz

DATASETS_PATH = join(SCRIPT_FOLDER_PATH, '..', '..', '..', 'datasets')

# dataset = 'MUTAG'
# dataset = 'DD'
dataset = 'ENZYMES'
# dataset = 'NCI1'
# dataset = 'NCI109'

graph_meta_data_of_num, class_lbls \
    = dataset_loader.get_graph_meta_data_and_class_lbls(dataset,
                                                        DATASETS_PATH)

with open('python_edges_count_of_each_graph.csv', 'w') as f:
    for graph_num, (graph_path, class_lbl) in \
            graph_meta_data_of_num.iteritems():
        G = pz.load(graph_path)
        f.write(str(graph_num) + '; ' + str(2 * G.number_of_edges()) + '\n')
import matplotlib.pyplot as plt
import networkx as nx

from misc import pz

ORANGE = '#FF6600'
DARK_BLUE = '#3F3D99'

# GRAPH_NAME = "android_fcg_7ab"     # This graph has 32635 nodes.
# GRAPH_NAME = "dd_class1_1"         # This graph has 327 nodes and 899 edges.
# GRAPH_NAME = "enzymes_class1_201"  # This graph has 29 nodes and 53 edges.
GRAPH_NAME = "mutag_class1_1"        # This graph has 23 nodes and 27 edges.
# GRAPH_NAME = "nc1_class0_1"        # This graph has 21 nodes and 21 edges.
# GRAPH_NAME = "nci109_class0_1"     # This graph has 21 nodes and 21 edges.

G = pz.load(GRAPH_NAME + ".pz")

print('number of nodes: ' + str(G.number_of_nodes()))
print('number of edges: ' + str(G.number_of_edges()))

ax = plt.axes(frameon=True)
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)

# nc1_class0_1 and nci109_class0_1 (21 nodes and 21 edges) ====================
# k controls the distance between the nodes and varies between 0 and 1;
# iterations is the number of times simulated annealing is run
# (defaults: k = 0.1, iterations = 50)
# pos = nx.spring_layout(G, k = 0.1, iterations = 10000)
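
# The excerpt ends before the graph is actually drawn; a plausible
# completion using the color constants defined above (layout parameters
# are illustrative):
pos = nx.spring_layout(G, k=0.1, iterations=50)
nx.draw_networkx_edges(G, pos, ax=ax, edge_color=DARK_BLUE)
nx.draw_networkx_nodes(G, pos, ax=ax, node_color=ORANGE, node_size=100)
plt.savefig(GRAPH_NAME + '.pdf', bbox_inches='tight')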
def extract_features(graph_meta_data_of_num, graphlet_size=4):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}

    graphlets_count = 0
    if graphlet_size == 3:
        graphlets_count = 4
    elif graphlet_size == 4:
        graphlets_count = 11

    # initialize feature matrix
    graphs_count = len(graph_meta_data_of_num)
    feature_mat = np.zeros((graphs_count, graphlets_count), dtype=np.float64)

    # =========================================================================
    # extract features iterating over all graphs in the dataset
    # =========================================================================
    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
        if i % 10 == 0:
            print 'i =', i

        # load graph
        G = pz.load(graph_path)

        nodes_count = len(G.node)

        if graphlet_size == 3:
            # count 3-graphlets; the array counts finally holds the counts of
            # the respective graphlets of size 3
            counts = np.zeros(4, np.float64)
            weights = np.array([6, 4, 2], np.float64)

            for v1 in G.nodes_iter():
                has_elem, nbr_iter = utils.has_elem(G.neighbors_iter(v1))
                if not has_elem:
                    # node v1 has no neighbors
                    continue

                v1_nbrs = set(G.neighbors(v1))

                for v2 in v1_nbrs:
                    v2_nbrs = set(G.neighbors(v2))

                    counts[0] += len(v1_nbrs & v2_nbrs)
                    counts[1] += len(v1_nbrs - (v2_nbrs | {v2}))
                    counts[1] += len(v2_nbrs - (v1_nbrs | {v1}))
                    counts[2] += nodes_count - len(v1_nbrs | v2_nbrs)

            counts[:3] /= weights
            counts[3] = comb(nodes_count, 3) - sum(counts)

            feature_mat[i] = counts

        elif graphlet_size == 4:
            # count 4-graphlets; the array counts finally holds the counts of
            # the respective graphlets of size 4
            counts = np.zeros(11, np.float64)
            weights = np.array([1/12, 1/10, 1/8, 1/6, 1/8, 1/6, 1/6, 1/4,
                                1/4, 1/2, 0], np.float64)

            # each undirected edge is only counted once
            edges_count = G.number_of_edges()

            for v1 in G.nodes_iter():
                has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v1))
                if not has_elem:
                    # node v1 has no neighbors
                    continue

                v1_nbrs = set(G.neighbors(v1))

                for v2 in v1_nbrs:
                    K = 0
                    tmp_counts = np.zeros(11, np.float64)

                    v2_nbrs = set(G.neighbors(v2))

                    v1_nbrs_inter_v2_nbrs = v1_nbrs & v2_nbrs
                    v1_nbrs_minus_v2_nbrs = v1_nbrs - v2_nbrs
                    v2_nbrs_minus_v1_nbrs = v2_nbrs - v1_nbrs

                    for v3 in v1_nbrs_inter_v2_nbrs:
                        v3_nbrs = set(G.neighbors(v3))
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)

                        tmp_counts[0] += 1/2*cards[6]
                        tmp_counts[1] += 1/2*(cards[3] - 1)
                        tmp_counts[1] += 1/2*(cards[4] - 1)
                        tmp_counts[1] += 1/2*(cards[5] - 1)
                        tmp_counts[2] += 1/2*cards[0]
                        tmp_counts[2] += 1/2*cards[1]
                        tmp_counts[2] += cards[2]
                        tmp_counts[6] += nodes_count - sum(cards)

                        K += 1/2*cards[6] + 1/2*(cards[4] - 1) \
                            + 1/2*(cards[5] - 1) + cards[2]

                    for v3 in v1_nbrs_minus_v2_nbrs - {v2}:
                        v3_nbrs = set(G.neighbors(v3))
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)

                        tmp_counts[1] += 1/2*cards[6]
                        tmp_counts[2] += 1/2*cards[3]
                        tmp_counts[2] += 1/2*cards[4]
                        tmp_counts[4] += 1/2*(cards[5] - 1)
                        tmp_counts[3] += 1/2*(cards[0] - 2)
                        tmp_counts[5] += 1/2*cards[1]
                        tmp_counts[5] += cards[2]
                        tmp_counts[7] += nodes_count - sum(cards)

                        K += 1/2*cards[6] + 1/2*cards[4] \
                            + 1/2*(cards[5] - 1) + cards[2]

                    for v3 in v2_nbrs_minus_v1_nbrs - {v1}:
                        v3_nbrs = set(G.neighbors(v3))
                        cards = calc_cards(v1_nbrs, v2_nbrs, v3_nbrs)

                        tmp_counts[1] += 1/2*cards[6]
                        tmp_counts[2] += 1/2*cards[3]
                        tmp_counts[4] += 1/2*(cards[4] - 1)
                        tmp_counts[2] += 1/2*cards[5]
                        tmp_counts[5] += 1/2*cards[0]
                        tmp_counts[3] += 1/2*(cards[1] - 2)
                        tmp_counts[5] += cards[2]
                        tmp_counts[7] += nodes_count - sum(cards)

                        K += 1/2*cards[6] + 1/2*(cards[4] - 1) \
                            + 1/2*cards[5] + cards[2]

                    # abbreviations for the two repeated subexpressions
                    rest_count = nodes_count - len(v1_nbrs_inter_v2_nbrs) \
                        - len(v1_nbrs_minus_v2_nbrs) \
                        - len(v2_nbrs_minus_v1_nbrs)
                    pairs_count = edges_count + 1 - len(v1_nbrs) \
                        - len(v2_nbrs) - K

                    tmp_counts[8] += pairs_count
                    tmp_counts[9] += rest_count*(rest_count - 1)/2 \
                        - pairs_count

                    counts += tmp_counts * weights

            counts[10] = comb(nodes_count, 4) - sum(counts[:10])

            feature_mat[i] = counts

    feature_mat_of_param[None] = feature_mat

    extr_end_time = time.time()
    extr_time_of_param[None] = extr_end_time - extr_start_time

    return feature_mat_of_param, extr_time_of_param
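
# calc_cards is defined elsewhere. Judging from its use above (seven
# entries, with nodes_count - sum(cards) counting the nodes outside all
# three neighborhoods), it returns the cardinalities of the seven regions
# of the Venn diagram of the three neighborhood sets. A sketch under that
# assumption; the exact ordering of the regions is a guess, not taken from
# the repo:
def calc_cards_sketch(n1, n2, n3):
    return [len(n1 - (n2 | n3)),   # cards[0]: only in n1
            len(n2 - (n1 | n3)),   # cards[1]: only in n2
            len(n3 - (n1 | n2)),   # cards[2]: only in n3
            len((n1 & n2) - n3),   # cards[3]: in n1 and n2 only
            len((n1 & n3) - n2),   # cards[4]: in n1 and n3 only
            len((n2 & n3) - n1),   # cards[5]: in n2 and n3 only
            len(n1 & n2 & n3)]     # cards[6]: in all three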
def extract_features(graph_meta_data_of_num, h_range, count_sensitive=True,
                     all_iter=False):
    extr_start_time = time.time()

    feature_mat_of_param = {}
    extr_time_of_param = {}
    mat_constr_times = []

    h_max = max(h_range)

    BIT_LBL_LEN = 24

    # rotate left
    rot_left = lambda val, r_bits: \
        (val << r_bits % BIT_LBL_LEN) & (2**BIT_LBL_LEN - 1) | \
        ((val & (2**BIT_LBL_LEN - 1))
         >> (BIT_LBL_LEN - (r_bits % BIT_LBL_LEN)))

    # the keys are graph numbers and the values are lists of features
    features_dict = defaultdict(list)

    # the keys are graph numbers and the values are lists which contain the
    # number of occurrences of the feature at the same index in the feature
    # list in features_dict, that is
    # feature_counts_dict[graph_number][i] == number of occurrences of feature
    # features_dict[graph_number][i]
    feature_counts_dict = defaultdict(list)

    # the keys are graph numbers and the values are dictionaries which map
    # features to their position in features_dict[graph_number] and
    # feature_counts_dict[graph_number], respectively
    idx_of_lbl_dict = defaultdict(dict)

    # the keys are graph numbers and the values are dictionaries which map
    # nodes to their updated label
    next_upd_lbls_dict = defaultdict(dict)
    upd_lbls_dict = defaultdict(dict)

    # keys are the node labels which are stored in the dataset and the values
    # are random bit labels of length BIT_LBL_LEN
    label_map = {}

    # =========================================================================
    # 1) extract features iterating over all graphs in the dataset
    # =========================================================================
    for h in h_range:
        for graph_num, (graph_path, class_lbl) in \
                graph_meta_data_of_num.iteritems():
            if graph_num % 100 == 0:
                print 'h = ' + str(h) + ', graph_num = ' + str(graph_num)

            # load graph
            G = pz.load(graph_path)

            for v in G.nodes_iter():
                if h == 0:
                    orig_lbl = G.node[v]['label']
                    if isinstance(orig_lbl, np.ndarray):
                        orig_lbl = utils.calc_hash_of_array(orig_lbl)

                    if orig_lbl not in label_map:
                        # assign a random bit label new_bit_lbl to orig_lbl
                        new_bit_lbl = randint(1, 2**BIT_LBL_LEN - 1)
                        label_map[orig_lbl] = new_bit_lbl
                    else:
                        # determine the bit label new_bit_lbl assigned to
                        # orig_lbl
                        new_bit_lbl = label_map[orig_lbl]
                else:
                    # h > 0
                    has_elem, nbrs_iter = utils.has_elem(G.neighbors_iter(v))
                    if not has_elem:
                        # node v has no neighbors
                        next_upd_lbls_dict[graph_num][v] \
                            = upd_lbls_dict[graph_num][v]
                        continue

                    if not count_sensitive:
                        # apply the simple neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for v_nbr in nbrs_iter:
                            new_bit_lbl ^= upd_lbls_dict[graph_num][v_nbr]
                    else:
                        # determine the list of labels of the nodes adjacent
                        # to v
                        nbrs_lbls = []
                        for v_nbr in nbrs_iter:
                            nbrs_lbls.append(upd_lbls_dict[graph_num][v_nbr])

                        # determine the number of occurrences of each
                        # neighbor label
                        num_of_nbr_lbl = {}
                        if len(nbrs_lbls) == 1:
                            nbr_lbl = nbrs_lbls[0]
                            num_of_nbr_lbl[nbr_lbl] = 1
                        else:
                            # len(nbrs_lbls) > 1
                            # sort nbrs_lbls in ascending order
                            nbrs_lbls.sort()

                            prev_nbr_lbl = nbrs_lbls[0]
                            c = 1
                            for nbr_lbl in nbrs_lbls[1:]:
                                if nbr_lbl == prev_nbr_lbl:
                                    c += 1
                                else:
                                    num_of_nbr_lbl[prev_nbr_lbl] = c
                                    prev_nbr_lbl = nbr_lbl
                                    c = 1
                            num_of_nbr_lbl[nbr_lbl] = c

                        # apply the count sensitive neighborhood hash
                        new_bit_lbl = rot_left(upd_lbls_dict[graph_num][v], 1)
                        for nbr_lbl, num in num_of_nbr_lbl.iteritems():
                            new_bit_lbl ^= rot_left(nbr_lbl ^ num, num)

                if h < h_max:
                    # next_upd_lbls_dict[graph_num][v] == label_map[lbl]
                    # == new_bit_lbl
                    next_upd_lbls_dict[graph_num][v] = new_bit_lbl

                if new_bit_lbl not in idx_of_lbl_dict[graph_num]:
                    # len(feature_counts_dict[graph_num])
                    # == len(features_dict[graph_num])
                    idx = len(feature_counts_dict[graph_num])

                    idx_of_lbl_dict[graph_num][new_bit_lbl] = idx

                    # features_dict[graph_num][idx]
                    # == feature upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    features_dict[graph_num].append(new_bit_lbl)

                    # set the number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl) to 1
                    feature_counts_dict[graph_num].append(1)
                else:
                    idx = idx_of_lbl_dict[graph_num][new_bit_lbl]

                    # increase the number of occurrences of the feature
                    # upd_lbls_dict[graph_num][v] (== new_bit_lbl)
                    feature_counts_dict[graph_num][idx] += 1

        # =====================================================================
        # 2) compress bit labels and construct the data matrix whose i-th row
        #    equals the i-th feature vector
        # =====================================================================
        mat_constr_start_time = time.time()

        # list containing the features of all graphs
        features = []
        # list containing the corresponding feature counts of all graphs
        feature_counts = []
        # list indicating to which graph (= row in feature_mat) the features
        # in the list features belong. The difference
        # feature_ptr[i+1] - feature_ptr[i] equals the number of specified
        # entries for row i. Consequently, the number of rows of feature_mat
        # equals len(feature_ptr) - 1.
        feature_ptr = [0]

        # keys are the bit labels and the values are new compressed labels
        compr_func = {}
        # next_compr_lbl is used for assigning new compressed labels to the
        # nodes. These build the features (= columns in feature_mat), which
        # are used for the explicit graph embedding.
        next_compr_lbl = 0

        for graph_num in graph_meta_data_of_num.iterkeys():
            for bit_lbl, bit_lbl_count in \
                    itools.izip(features_dict[graph_num],
                                feature_counts_dict[graph_num]):
                if bit_lbl not in compr_func:
                    compr_func[bit_lbl] = next_compr_lbl
                    compr_lbl = next_compr_lbl
                    next_compr_lbl += 1
                else:
                    compr_lbl = compr_func[bit_lbl]

                features.append(compr_lbl)
                feature_counts.append(bit_lbl_count)

            feature_ptr.append(feature_ptr[-1] + len(features_dict[graph_num]))

        # feature_mat is of type csr_matrix and has the following form:
        # [feature vector of the first graph,
        #  feature vector of the second graph,
        #  ...
        #  feature vector of the last graph]
        feature_mat = csr_matrix((np.array(feature_counts),
                                  np.array(features),
                                  np.array(feature_ptr)),
                                 dtype=np.float64)
        feature_mat_of_param[h] = feature_mat

        extr_end_time = time.time()
        extr_time = extr_end_time - extr_start_time - sum(mat_constr_times)

        mat_constr_end_time = time.time()
        mat_constr_time = mat_constr_end_time - mat_constr_start_time
        mat_constr_times.append(mat_constr_time)

        extr_time += mat_constr_time
        extr_time_of_param[h] = extr_time

        if h < h_max:
            upd_lbls_dict = next_upd_lbls_dict
            next_upd_lbls_dict = defaultdict(dict)

            if not all_iter:
                features_dict = defaultdict(list)
                feature_counts_dict = defaultdict(list)
                idx_of_lbl_dict = defaultdict(dict)

    return feature_mat_of_param, extr_time_of_param
def compute_kernel_mat(graph_meta_data_of_num, param_range=[None]):
    kernel_mat_comp_start_time = time.time()

    kernel_mat_comp_time_of_param = {}
    kernel_mat_of_param = {}

    num_graphs = len(graph_meta_data_of_num)

    kernel_mat = np.zeros((num_graphs, num_graphs), dtype=np.float64)

    # decaying factor LAMBDA for down-weighting longer walks
    LAMBDA = -4

    # =========================================================================
    # 1) precompute the (sparse) adjacency matrices of the graphs in the
    #    dataset
    # =========================================================================
    adj_mats = []

    for i, (graph_path, class_lbl) in \
            enumerate(graph_meta_data_of_num.itervalues()):
        # load graph
        G = pz.load(graph_path)
        # determine its adjacency matrix
        A = nx.adj_matrix(G, weight=None)
        adj_mats.append(A)

    # =========================================================================
    # 2) compute kernel matrix over all graphs in the dataset
    # =========================================================================
    for i in xrange(num_graphs):
        A_i = adj_mats[i].todense()

        for j in xrange(i, num_graphs):
            A_j = adj_mats[j].todense()

            # apply the preconditioned conjugate gradient method in order to
            # solve (I - LAMBDA*A_x) * x = 1_vec, where A_x is the adjacency
            # matrix of the direct product graph of G_i and G_j, I is the
            # identity matrix, and 1_vec is the vector with all entries set
            # to 1
            b = np.ones((A_i.shape[0] * A_j.shape[0], 1))
            x, flag, rel_res, iter_, res_vec \
                = pcg.pcg(lambda x: mat_vec_product(x, A_i, A_j, LAMBDA), b,
                          1e-6, 20)

            kernel_mat[i, j] = np.sum(x)
            if i != j:
                kernel_mat[j, i] = kernel_mat[i, j]

            print 'i =', i, 'j =', j, kernel_mat[i, j]

    kernel_mat_of_param[None] = kernel_mat

    kernel_mat_comp_end_time = time.time()
    kernel_mat_comp_time_of_param[None] = kernel_mat_comp_end_time \
        - kernel_mat_comp_start_time

    return kernel_mat_of_param, kernel_mat_comp_time_of_param
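
# For tiny graphs the PCG result can be cross-checked against a dense solve
# of the same linear system; a sketch, assuming mat_vec_product applies
# I - LAMBDA * (A_i kron A_j) as outlined after the first version of this
# function (the two example graphs are arbitrary):
import numpy as np
import networkx as nx

LAMBDA = -4
A_i = np.asarray(nx.adj_matrix(nx.path_graph(3), weight=None).todense(),
                 dtype=np.float64)
A_j = np.asarray(nx.adj_matrix(nx.cycle_graph(4), weight=None).todense(),
                 dtype=np.float64)

# dense reference: K(G_i, G_j) = 1^T (I - LAMBDA * (A_i kron A_j))^{-1} 1
A_x = np.kron(A_i, A_j)
n = A_x.shape[0]
x_ref = np.linalg.solve(np.eye(n) - LAMBDA * A_x, np.ones(n))
print(np.sum(x_ref))  # should match np.sum(x) returned by pcg.pcg above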