def __init__(self, base_path, origin_folder, embedding_folder, node_list, model, loss, max_time_num, model_folder="model"):
    """Initialize the trainer: resolve data/model paths, choose a device, and store model objects.

    :param base_path: root directory of the data set
    :param origin_folder: sub-folder holding the original graph snapshots
    :param embedding_folder: sub-folder where embeddings will be written
    :param node_list: full list of node identifiers
    :param model: model object to be trained
    :param loss: loss object used during training
    :param max_time_num: maximum number of timestamps to process
    :param model_folder: sub-folder where model checkpoints are stored
    """
    # file paths
    self.base_path = base_path
    self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    self.embedding_base_path = os.path.abspath(os.path.join(base_path, embedding_folder))
    self.model_base_path = os.path.abspath(os.path.join(base_path, model_folder))
    self.full_node_list = node_list
    self.node_num = len(self.full_node_list)  # node num
    # one timestamp per snapshot file, in chronological (lexicographic) order
    self.timestamp_list = sorted(os.listdir(self.origin_base_path))
    # cpu gpu
    if torch.cuda.is_available():
        print("GPU")
        # BUG FIX: "cuda: 0" (with a space) is not a valid device string and
        # makes torch.device() raise RuntimeError; the correct form is "cuda:0".
        device = torch.device("cuda:0")
    else:
        print("CPU")
        device = torch.device("cpu")
    self.set_thread()
    self.device = device
    self.model = model
    self.loss = loss
    self.max_time_num = max_time_num
    check_and_make_path(self.embedding_base_path)
    check_and_make_path(self.model_base_path)
def process_result(dataset, rep_num, method_list):
    """Aggregate node-classification accuracy records over repeated runs.

    For every method, read the per-repetition accuracy CSVs from the
    ``node_classification_res_<i>`` folders, join the accuracy columns, and
    write a summary (avg/max/min across repetitions) into
    ``node_classification_res``.
    """
    data_root = '../../data/' + dataset
    for method in method_list:
        record_name = method + '_acc_record.csv'
        # the first repetition seeds the merged frame (keeps the date column)
        first_path = os.path.join(data_root, 'node_classification_res_0', record_name)
        merged = pd.read_csv(first_path, sep=',', header=0, names=['date', 'acc0'])
        # append the accuracy column of each remaining repetition
        for rep in range(1, rep_num):
            rep_path = os.path.join(data_root, 'node_classification_res_' + str(rep), record_name)
            df_rep = pd.read_csv(rep_path, sep=',', header=0, names=['date', 'acc' + str(rep)])
            merged = pd.concat([merged, df_rep.iloc[:, [1]]], axis=1)
        out_dir = os.path.join(data_root, 'node_classification_res')
        check_and_make_path(out_dir)
        acc_cols = ['acc' + str(rep) for rep in range(rep_num)]
        merged['avg'] = merged.loc[:, acc_cols].mean(axis=1)
        merged['max'] = merged.loc[:, acc_cols].max(axis=1)
        merged['min'] = merged.loc[:, acc_cols].min(axis=1)
        merged.to_csv(os.path.join(out_dir, record_name), sep=',', index=False)
def __init__(self, base_path, input_folder, output_folder, node_file):
    """Resolve input/output directories, load the node list, and create the directories."""
    self.base_path = base_path
    self.input_base_path = os.path.join(base_path, input_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    node_df = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    self.node_num = len(self.full_node_list)
    for directory in (self.input_base_path, self.output_base_path):
        check_and_make_path(directory)
def __init__(self, base_path, origin_folder, core_folder, node_file):
    """Resolve origin/core directories, load the node list, and create the core directory."""
    self.base_path = base_path
    self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    self.core_base_path = os.path.abspath(os.path.join(base_path, core_folder))
    node_df = pd.read_csv(os.path.abspath(os.path.join(base_path, node_file)), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    self.node_num = len(self.full_node_list)
    check_and_make_path(self.core_base_path)
def get_graph_from_edges(file_path, node_file, output_node_dir, output_edge_dir, sep='\t'):
    """Sample random edge subsets of increasing sizes and write each as a subgraph.

    For each target size in ``edge_num_list``, a uniform random sample of edges
    is drawn; the sampled edges and their incident nodes are written to
    ``<i>.csv`` files in the output edge/node directories.

    :param file_path: edge-list CSV with 'from_id' and 'to_id' columns
    :param node_file: CSV containing the full node list (one node per line)
    :param output_node_dir: directory receiving the per-sample node files
    :param output_edge_dir: directory receiving the per-sample edge files
    :param sep: column separator of the input edge file
    """
    import random
    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    all_edge_num = df_edges.shape[0]
    nodes_set = pd.read_csv(node_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)
    edge_num_list = [50, 100, 500, 1000, 5000, 10000, 70000]
    edge_idxs = np.arange(all_edge_num).tolist()
    for i, edge_num in enumerate(edge_num_list):
        # ROBUSTNESS FIX: random.sample raises ValueError when asked for more
        # items than are available, so clamp to the graph's actual edge count.
        sample_edge_idxs = random.sample(edge_idxs, min(edge_num, all_edge_num))
        df_subgraph = df_edges.loc[sample_edge_idxs, :]
        # unique nodes touched by the sampled edges
        node_list = pd.unique(pd.concat([df_subgraph['from_id'], df_subgraph['to_id']], axis=0)).tolist()
        df_nodes = pd.DataFrame(node_list, columns=['node'])
        df_nodes.to_csv(os.path.join(output_node_dir, str(i) + '.csv'), sep='\t', index=False)
        df_subgraph.to_csv(os.path.join(output_edge_dir, str(i) + '.csv'), sep='\t', index=False)
def __init__(self, base_path, input_folder, output_folder, node_file, label_file, trans_label_file, sep=' ', test_ratio=0.1, val_ratio=0.2):
    """Load node labels, remap node names to indices, and write the translated label file.

    :param base_path: root directory of the data set
    :param input_folder: input sub-folder (created if missing)
    :param output_folder: output sub-folder (created if missing)
    :param node_file: CSV containing the full node list
    :param label_file: raw label file with 'node' and 'label' columns
    :param trans_label_file: output file with node indices instead of names
    :param sep: column separator of the raw label file
    :param test_ratio: fraction of data reserved for testing
    :param val_ratio: fraction of data reserved for validation
    """
    self.base_path = base_path
    self.input_base_path = os.path.join(base_path, input_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    nodes_set = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
    self.full_node_list = nodes_set['node'].tolist()
    self.node_num = len(self.full_node_list)
    node2idx_dict = dict(zip(self.full_node_list, np.arange(self.node_num).tolist()))
    df_label = pd.read_csv(os.path.join(base_path, label_file), sep=sep, header=0, names=['node', 'label'], dtype=str)
    # label-file node ids carry a 'U' prefix in the node list — TODO confirm against data
    df_label['node'] = df_label['node'].apply(lambda x: 'U' + x)
    # BUG FIX: np.int was removed in NumPy >= 1.24; the builtin int is equivalent here.
    df_label['label'] = df_label['label'].apply(int)
    df_label['node'] = df_label['node'].apply(lambda x: node2idx_dict[x])
    # reindex so that row i corresponds to node index i
    df_label.index = df_label['node'].tolist()
    df_label = df_label.loc[np.arange(self.node_num).tolist(), :]
    self.label_list = df_label['label'].tolist()
    df_label.to_csv(os.path.join(base_path, trans_label_file), sep='\t', index=False)
    assert test_ratio + val_ratio < 1.0
    self.test_ratio = test_ratio
    self.val_ratio = val_ratio
    check_and_make_path(self.input_base_path)
    check_and_make_path(self.output_base_path)
def __init__(self, base_path, origin_folder, walk_pair_folder, node_freq_folder, node_file, walk_time=100, walk_length=5):
    """Set up directories and parameters for random-walk pair / node-frequency generation.

    :param walk_time: number of walks started per node
    :param walk_length: number of steps taken per walk
    """
    self.base_path = base_path
    self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
    self.walk_pair_base_path = os.path.abspath(os.path.join(base_path, walk_pair_folder))
    self.node_freq_base_path = os.path.abspath(os.path.join(base_path, node_freq_folder))
    node_df = pd.read_csv(os.path.abspath(os.path.join(base_path, node_file)), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    self.walk_time = walk_time
    self.walk_length = walk_length
    for directory in (self.walk_pair_base_path, self.node_freq_base_path):
        check_and_make_path(directory)
def get_graph_from_nodes(file_path, node_file, output_node_dir, output_edge_dir, sep='\t'):
    """Sample connected subgraphs of increasing node counts via BFS and save them.

    For each target size, a BFS is started from a random node of the largest
    connected component and stops once roughly that many nodes are collected
    (it may slightly overshoot, since a whole neighbor list is appended at
    once). Each induced subgraph's nodes and weighted edges are written to
    per-size CSV files; the full graph is written under the next index.

    :param file_path: edge-list CSV with 'from_id', 'to_id' (and weight) columns
    :param node_file: CSV containing the full node list
    :param output_node_dir: directory receiving the per-sample node files
    :param output_edge_dir: directory receiving the per-sample edge files
    :param sep: column separator of the input edge file
    """
    import random
    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    nodes_set = pd.read_csv(node_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)
    nx_graph = get_nx_graph(file_path, full_node_list, sep=sep)
    node_num_list = [50, 100, 500, 1000, 5000, 10000]
    # restrict BFS start nodes to the largest connected component
    max_cc = max(nx.connected_components(nx_graph), key=len)
    node_list = list(max_cc)
    print(len(node_list))
    for i, node_num in enumerate(node_num_list):
        start_node = random.sample(node_list, 1)[0]
        adj = nx_graph.adj
        visited = {start_node}
        sample_list = [start_node]
        front, cnt = -1, 1
        # BUG FIX: the loop condition was `front < cnt`, which lets `front`
        # advance to `cnt` and index past the end of sample_list when the
        # component is exhausted before node_num nodes are found. The correct
        # bound is `front < cnt - 1` (an unprocessed queue entry remains).
        while front < cnt - 1 and cnt < node_num:
            front += 1
            cur = sample_list[front]
            for neighbor in adj[cur]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    cnt += 1
                    sample_list.append(neighbor)
        nx_subgraph = nx_graph.subgraph(sample_list)
        df_nodes = pd.DataFrame([full_node_list[idx] for idx in sample_list], columns=['node'])
        df_nodes.to_csv(os.path.join(output_node_dir, str(i) + '.csv'), sep='\t', index=False)
        edge_list = []
        for node, neighbors in nx_subgraph.adj.items():
            for neighbor, edge_attr in neighbors.items():
                edge_list.append([full_node_list[node], full_node_list[neighbor], edge_attr['weight']])
        edges_arr = np.array(edge_list)
        print('edges arr shape: ', edges_arr.shape[0])
        df_output = pd.DataFrame(edges_arr, columns=['from_id', 'to_id', 'weight'])
        df_output.to_csv(os.path.join(output_edge_dir, str(i) + '.csv'), sep='\t', index=False)
    # finally, write the full graph under the next index after the sampled sizes
    df_nodes = pd.DataFrame(np.array(full_node_list), columns=['node'])
    df_nodes.to_csv(os.path.join(output_node_dir, str(len(node_num_list)) + '.csv'), sep='\t', index=False)
    df_edges.to_csv(os.path.join(output_edge_dir, str(len(node_num_list)) + '.csv'), sep='\t', index=False)
def __init__(self, base_path, origin_folder, embedding_folder, lp_edge_folder, output_folder, node_file, train_ratio=1.0, test_ratio=1.0):
    """Store directories, split ratios, and the node list for link-prediction evaluation."""
    self.base_path = base_path
    self.origin_base_path = os.path.join(base_path, origin_folder)
    self.embedding_base_path = os.path.join(base_path, embedding_folder)
    self.lp_edge_base_path = os.path.join(base_path, lp_edge_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    self.train_ratio = train_ratio
    self.test_ratio = test_ratio
    node_df = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    for directory in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(directory)
def __init__(self, base_path, input_folder, output_folder, node_file, test_ratio=0.3, val_ratio=0.2):
    """Store directories, split ratios, and the node list; create missing directories."""
    self.base_path = base_path
    self.input_base_path = os.path.join(base_path, input_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    node_df = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    self.node_num = len(self.full_node_list)
    # the remaining fraction (training) must be non-empty
    assert test_ratio + val_ratio < 1.0
    self.test_ratio = test_ratio
    self.val_ratio = val_ratio
    for directory in (self.input_base_path, self.output_base_path):
        check_and_make_path(directory)
def get_kcore_graph(self, input_file, output_dir, core_list=None, degree_list=None):
    """Compute every k-core of a snapshot and save each as a sparse adjacency matrix.

    :param input_file: edge-list file of one graph snapshot
    :param output_dir: directory receiving one ``<k>.npz`` adjacency per core level
    :param core_list: unused; kept for interface compatibility
    :param degree_list: unused; kept for interface compatibility
    """
    graph = get_nx_graph(input_file, self.full_node_list, sep='\t')
    core_num_dict = nx.core_number(graph)
    max_core_num = max(list(core_num_dict.values()))
    print('max core num: ', max_core_num)
    check_and_make_path(output_dir)
    format_str = get_format_str(max_core_num)
    for i in range(1, max_core_num + 1):
        k_core_graph = nx.k_core(graph, k=i, core_number=core_num_dict)
        # pad with all node ids so every matrix has the full node dimension
        k_core_graph.add_nodes_from(np.arange(self.node_num))
        # BUG FIX: without an explicit nodelist the matrix follows the graph's
        # node-insertion order, which puts the padding nodes added above AFTER
        # the core nodes — so row j would not correspond to node index j.
        A = nx.to_scipy_sparse_matrix(k_core_graph, nodelist=np.arange(self.node_num).tolist())
        signature = format_str.format(i)
        sp.save_npz(os.path.join(output_dir, signature + ".npz"), A)
def __init__(self, base_path, origin_folder, embedding_folder, equ_folder, output_folder, node_file):
    """Store directories and the node list; create missing output directories."""
    self.base_path = base_path
    self.origin_base_path = os.path.join(base_path, origin_folder)
    self.embedding_base_path = os.path.join(base_path, embedding_folder)
    self.equ_base_path = os.path.join(base_path, equ_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    node_df = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    for directory in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(directory)
def __init__(self, base_path, origin_folder, embedding_folder, nodeclas_folder, output_folder, node_file, trans_label_file):
    """Store directories, the node list, and the translated labels for node classification."""
    self.base_path = base_path
    self.origin_base_path = os.path.join(base_path, origin_folder)
    self.embedding_base_path = os.path.join(base_path, embedding_folder)
    self.nodeclas_base_path = os.path.join(base_path, nodeclas_folder)
    self.output_base_path = os.path.join(base_path, output_folder)
    node_df = pd.read_csv(os.path.join(base_path, node_file), names=['node'])
    self.full_node_list = node_df['node'].tolist()
    # labels were already remapped to node indices in trans_label_file
    label_df = pd.read_csv(os.path.join(base_path, trans_label_file), sep='\t')
    self.label_list = label_df['label'].tolist()
    for directory in (self.embedding_base_path, self.origin_base_path, self.output_base_path):
        check_and_make_path(directory)
def _summarize_metric(df_metric, col_prefix, rep_num, output_base_path, method, metric_name):
    """Append avg/max/min over the repetition columns and write the metric's summary CSV."""
    cols = [col_prefix + str(i) for i in range(rep_num)]
    df_metric['avg'] = df_metric.loc[:, cols].mean(axis=1)
    df_metric['max'] = df_metric.loc[:, cols].max(axis=1)
    df_metric['min'] = df_metric.loc[:, cols].min(axis=1)
    output_path = os.path.join(output_base_path, method + '_' + metric_name + '_record.csv')
    df_metric.to_csv(output_path, sep=',', index=False)


def process_result(dataset, rep_num, method_list):
    """Aggregate link-prediction AUC records over repeated runs.

    For each method, the per-repetition AUC CSVs (one column per edge-feature
    operator: average, Hadamard, L1, L2) are merged column-wise and summarized
    (avg/max/min across repetitions) into one output CSV per operator.

    The four operators were previously handled by four copy-pasted stanzas;
    they are now table-driven via ``_summarize_metric``.
    """
    # (column prefix in the merged frame, metric name in the output file name)
    metrics = [('avg', 'avg'), ('had', 'had'), ('l1_', 'l1'), ('l2_', 'l2')]
    data_root = '../../data/' + dataset
    for method in method_list:
        record_name = method + '_auc_record.csv'
        res_path = os.path.join(data_root, 'link_prediction_res_0', record_name)
        df_first = pd.read_csv(res_path, sep=',', header=0, names=['date', 'avg0', 'had0', 'l1_0', 'l2_0'])
        # one frame per metric, each seeded with the date and repetition-0 column
        frames = {prefix: df_first.loc[:, ['date', prefix + '0']].copy() for prefix, _ in metrics}
        for i in range(1, rep_num):
            res_path = os.path.join(data_root, 'link_prediction_res_' + str(i), record_name)
            rep_cols = ['date'] + [prefix + str(i) for prefix, _ in metrics]
            df_rep = pd.read_csv(res_path, sep=',', header=0, names=rep_cols)
            for prefix, _ in metrics:
                frames[prefix] = pd.concat([frames[prefix], df_rep.loc[:, [prefix + str(i)]]], axis=1)
        output_base_path = os.path.join(data_root, 'link_prediction_res')
        check_and_make_path(output_base_path)
        for prefix, metric_name in metrics:
            _summarize_metric(frames[prefix], prefix, rep_num, output_base_path, method, metric_name)