def get_sp_delta_adj_mat(A_pre, file_path, node2idx_dict, sep='\t'):
    """Return the sparse difference A_cur - A_pre between the previous snapshot A_pre
    and the snapshot stored in file_path."""
    N = len(node2idx_dict)
    A_cur = get_sp_adj_mat(file_path, node2idx_dict, sep=sep)
    delta_A = lil_matrix((N, N))
    pre_row, pre_col, pre_value = find(A_pre)
    cur_row, cur_col, cur_value = find(A_cur)
    pre_edge_num = pre_row.size
    cur_edge_num = cur_row.size
    pre_dict = dict(zip(zip(pre_row, pre_col), pre_value))
    cur_dict = dict(zip(zip(cur_row, cur_col), cur_value))
    # edges that existed in the previous snapshot: weight change or removal
    for idx in range(pre_edge_num):
        edge = (pre_row[idx], pre_col[idx])
        if edge in cur_dict:
            delta_A[pre_row[idx], pre_col[idx]] = cur_dict[edge] - pre_dict[edge]
        else:
            delta_A[pre_row[idx], pre_col[idx]] = -pre_dict[edge]
    # edges that only exist in the current snapshot: newly added
    for idx in range(cur_edge_num):
        edge = (cur_row[idx], cur_col[idx])
        if edge in pre_dict:
            delta_A[cur_row[idx], cur_col[idx]] = cur_dict[edge] - pre_dict[edge]
        else:
            delta_A[cur_row[idx], cur_col[idx]] = cur_dict[edge]
    delta_A = delta_A.tocsr()
    return delta_A
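
# Illustrative sanity check (not part of the original module): the per-edge loops above
# compute the same result as a plain sparse subtraction A_cur - A_pre. The small demo
# below rebuilds that logic on two tiny hand-made snapshots (values are arbitrary)
# and checks it against the direct difference, without the file I/O.
def _demo_delta_adj_logic():
    import numpy as np
    from scipy.sparse import csr_matrix, lil_matrix, find

    A_pre = csr_matrix(np.array([[0., 1., 0.],
                                 [1., 0., 2.],
                                 [0., 2., 0.]]))
    A_cur = csr_matrix(np.array([[0., 3., 1.],
                                 [3., 0., 0.],
                                 [1., 0., 0.]]))
    delta = lil_matrix(A_pre.shape)
    pre_row, pre_col, pre_val = find(A_pre)
    cur_row, cur_col, cur_val = find(A_cur)
    pre_dict = dict(zip(zip(pre_row, pre_col), pre_val))
    cur_dict = dict(zip(zip(cur_row, cur_col), cur_val))
    for (r, c), v in pre_dict.items():      # removed or re-weighted edges
        delta[r, c] = cur_dict.get((r, c), 0.) - v
    for (r, c), v in cur_dict.items():      # newly added edges
        if (r, c) not in pre_dict:
            delta[r, c] = v
    assert np.allclose(delta.toarray(), (A_cur - A_pre).toarray())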
def get_degree_feature_list(self, origin_base_path, start_idx, duration, sep='\t', init_type='gaussian', std=1e-4):
    assert init_type in ['gaussian', 'adj', 'combine', 'one-hot']
    x_list = []
    max_degree = 0
    adj_list = []
    degree_list = []
    date_dir_list = sorted(os.listdir(origin_base_path))
    # find the maximal degree over the list of graphs
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        original_graph_path = os.path.join(origin_base_path, date_dir_list[i])
        adj = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
        adj_list.append(adj)
        degrees = np.asarray(adj.sum(axis=1)).flatten().astype(int)
        max_degree = max(max_degree, degrees.max())
        degree_list.append(degrees)
    # generate degree-based features
    input_dim = 0
    for i, degrees in enumerate(degree_list):
        # other structural feature initialization techniques can also be tried to improve performance
        if init_type == 'gaussian':
            # each node feature is a Gaussian vector centered at its degree
            fea_list = []
            for degree in degrees:
                fea_list.append(np.random.normal(degree, std, max_degree + 1))
            fea_arr = np.array(fea_list).astype(np.float32)
            input_dim = fea_arr.shape[1]
            fea_tensor = torch.from_numpy(fea_arr).float()
            x_list.append(fea_tensor.cuda() if self.has_cuda else fea_tensor)
        elif init_type == 'adj':
            # use the sparse adjacency matrix itself as the feature matrix
            input_dim = self.node_num
            feat_tensor = sparse_mx_to_torch_sparse_tensor(adj_list[i])
            x_list.append(feat_tensor.cuda() if self.has_cuda else feat_tensor)
        elif init_type == 'combine':
            # concatenate Gaussian degree features with the adjacency matrix
            fea_list = []
            for degree in degrees:
                fea_list.append(np.random.normal(degree, std, max_degree + 1))
            sp_feat = sp.coo_matrix(np.array(fea_list))
            sp_feat = sp.hstack((sp_feat, adj_list[i])).astype(np.float32)
            input_dim = sp_feat.shape[1]
            feat_tensor = sparse_mx_to_torch_sparse_tensor(sp_feat)
            x_list.append(feat_tensor.cuda() if self.has_cuda else feat_tensor)
        else:  # one-hot degree feature
            data = np.ones(degrees.shape[0], dtype=int)
            row = np.arange(degrees.shape[0])
            col = degrees
            spmat = sp.csr_matrix((data, (row, col)), shape=(degrees.shape[0], max_degree + 1))
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            x_list.append(sptensor.cuda() if self.has_cuda else sptensor)
            # print('max degree: ', max_degree + 1)
            input_dim = max_degree + 1
    return x_list, input_dim
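
# Illustrative sketch (not part of the original module): how the 'one-hot' branch above
# turns a degree vector into a sparse one-hot feature matrix. The degrees here are made up.
def _demo_one_hot_degree_features():
    import numpy as np
    import scipy.sparse as sp

    degrees = np.array([2, 0, 3, 2])    # degree of each node
    max_degree = degrees.max()
    data = np.ones(degrees.shape[0], dtype=int)
    row = np.arange(degrees.shape[0])
    spmat = sp.csr_matrix((data, (row, degrees)), shape=(degrees.shape[0], max_degree + 1))
    # row i has a single 1 in column degrees[i]
    print(spmat.toarray())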
def get_walk_info(self, f_name, original_graph_path, sep='\t', weighted=True):
    print('f_name = ', f_name)
    t1 = time.time()
    spadj = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
    rw.random_walk(spadj, self.walk_pair_base_path, self.node_freq_base_path, f_name, self.walk_length, self.walk_time, weighted)
    t2 = time.time()
    print('random walk total time: ', t2 - t1, ' seconds!')
def generate_node_similarity(self, file):
    """Implementation of vertex similarity in networks."""
    # Vertex similarity in networks (https://arxiv.org/abs/physics/0510143)
    print('file = ', file)
    file_path = os.path.join(self.input_base_path, file)
    date = file.split('.')[0]
    output_file_path = os.path.join(self.output_base_path, date + '_similarity.npz')

    A = get_sp_adj_mat(file_path, self.full_node_list, sep=self.file_sep)
    A = A.tocsr()
    lambda_1 = scipy.sparse.linalg.eigsh(A, k=1, which='LM', return_eigenvectors=False)[0]
    print('lambda 1: ', lambda_1)

    rows, cols = A.nonzero()
    edge_num = rows.shape[0]
    n = A.shape[0]
    d = np.array(A.sum(1)).flatten()
    d_inv = np.zeros(n)  # dtype is float
    indices = np.where(d > 0)[0]
    d_inv[indices] = 1. / d[indices]
    d_inv = np.diag(d_inv)

    # dsd = np.random.normal(0, 1 / np.sqrt(n), (n, n))
    dsd = np.zeros((n, n))
    I = np.eye(n)
    # fixed-point iteration: dsd accumulates I + (alpha*A/lambda_1) + (alpha*A/lambda_1)^2 + ...
    for i in range(self.iter_num):
        # if i == 0:
        #     dsd = self.alpha / lambda_1 * A
        # else:
        #     dsd = self.alpha / lambda_1 * A + self.alpha / lambda_1 * dsd.dot(A)
        dsd = self.alpha / lambda_1 * A.dot(dsd) + I
        if i % 10 == 0:
            print('VS', i, '/', self.iter_num)
    # coeff = 2 * edge_num * lambda_1
    # S = d_inv.dot(dsd).dot(d_inv)
    S = dsd
    S = (S + S.T) / 2
    S = S - I
    S = (S - S.min()) / (S.max() - S.min())
    print(type(S))
    print('S max: ', S.max(), ', min: ', S.min())
    eps = 1e-6
    S[S < eps] = 0
    # S[S > 1] = 1
    S = sp.coo_matrix(S)
    sp.save_npz(output_file_path, S)
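
# Illustrative sketch (not part of the original module): the fixed-point loop above,
# dsd <- (alpha / lambda_1) * A @ dsd + I, accumulates the truncated series
# I + (alpha*A/lambda_1) + (alpha*A/lambda_1)^2 + ..., i.e. a Neumann approximation of
# (I - alpha*A/lambda_1)^{-1}, which is the core quantity of the vertex-similarity measure
# referenced above. The tiny graph, alpha, and iteration count below are made up for the demo.
def _demo_similarity_series():
    import numpy as np

    A = np.array([[0., 1., 1.],
                  [1., 0., 0.],
                  [1., 0., 0.]])
    alpha = 0.5
    lambda_1 = np.max(np.abs(np.linalg.eigvalsh(A)))  # largest eigenvalue magnitude
    dsd = np.zeros_like(A)
    I = np.eye(A.shape[0])
    for _ in range(50):  # same recursion as generate_node_similarity
        dsd = alpha / lambda_1 * A.dot(dsd) + I
    closed_form = np.linalg.inv(I - alpha / lambda_1 * A)
    assert np.allclose(dsd, closed_form, atol=1e-6)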
def get_date_adj_list(self, origin_base_path, start_idx, duration, sep='\t', normalize=False, row_norm=False, add_eye=False, data_type='tensor'):
    assert data_type in ['tensor', 'matrix']
    date_dir_list = sorted(os.listdir(origin_base_path))
    # print('adj list: ', date_dir_list)
    date_adj_list = []
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        original_graph_path = os.path.join(origin_base_path, date_dir_list[i])
        spmat = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
        # spmat = sp.coo_matrix((np.exp(alpha * spmat.data), (spmat.row, spmat.col)), shape=(self.node_num, self.node_num))
        if add_eye:
            spmat = spmat + sp.eye(spmat.shape[0])
        if normalize:
            spmat = get_normalized_adj(spmat, row_norm=row_norm)
        # data type
        if data_type == 'tensor':
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            date_adj_list.append(sptensor.cuda() if self.has_cuda else sptensor)
        else:  # data_type == 'matrix'
            date_adj_list.append(spmat)
    # print(len(date_adj_list))
    return date_adj_list
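
# Rough illustration (an assumption for readability, not necessarily the repo's exact
# get_normalized_adj): row_norm=True is commonly the random-walk normalization D^{-1} A,
# otherwise the symmetric GCN-style normalization D^{-1/2} A D^{-1/2}. Sketched here for
# a generic scipy sparse matrix.
def _demo_normalize_adj(spmat, row_norm=False):
    import numpy as np
    import scipy.sparse as sp

    degrees = np.asarray(spmat.sum(axis=1)).flatten()
    if row_norm:
        # D^{-1} A, with zero-degree rows left at zero
        d_inv = np.divide(1.0, degrees, out=np.zeros_like(degrees, dtype=float), where=degrees > 0)
        return sp.diags(d_inv).dot(spmat)
    # D^{-1/2} A D^{-1/2}, with zero-degree rows/cols left at zero
    d_inv_sqrt = np.divide(1.0, np.sqrt(degrees), out=np.zeros_like(degrees, dtype=float), where=degrees > 0)
    D_inv_sqrt = sp.diags(d_inv_sqrt)
    return D_inv_sqrt.dot(spmat).dot(D_inv_sqrt)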
def timers(nodes_file, input_base_path, output_base_path, Theta=0.17, dim=128, sep='\t', Update=True):
    if not os.path.exists(output_base_path):
        os.makedirs(output_base_path)
    # nodes set
    nodes_set = pd.read_csv(nodes_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    N = len(full_node_list)
    print('node num: ', N)
    node2idx_dict = dict(zip(full_node_list, np.arange(N)))
    # base graph adjacency matrix
    f_list = sorted(os.listdir(input_base_path))
    f0 = os.path.join(input_base_path, f_list[0])
    A = get_sp_adj_mat(f0, node2idx_dict, sep=sep)

    # begin TIMERS
    K = dim
    time_slice = len(f_list) - 1
    # store all results
    U = [[] for i in range(time_slice + 10)]
    S = [[] for i in range(time_slice + 10)]
    V = [[] for i in range(time_slice + 10)]
    Loss_store = np.zeros(shape=time_slice + 10)  # store loss for each time stamp
    Loss_bound = np.zeros(shape=time_slice + 10)  # store loss bound for each time stamp
    run_times = 0                                  # store how many times a rerun happened
    Run_t = np.zeros(shape=time_slice + 10)        # store which time slices were re-run

    # calculate the static solution for the first snapshot
    u, s, vt = svds(A, K)
    s = np.diag(s)
    v = vt.transpose()
    U[0], S[0], V[0] = u, s, v
    U_cur = np.dot(U[0], np.sqrt(S[0]))
    V_cur = np.dot(V[0], np.sqrt(S[0]))
    Loss_store[0] = Obj(A, U_cur, V_cur)
    Loss_bound[0] = Loss_store[0]
    output_data = np.hstack((U_cur, V_cur))
    assert output_data.shape[0] == N
    result = pd.DataFrame(data=output_data, index=full_node_list, columns=range(2 * dim))
    result.to_csv(os.path.join(output_base_path, f_list[0]), sep=sep)
    print('time = 1, loss = ', Loss_store[0], ', loss_bound = ', Loss_bound[0])

    # store some useful variables
    Sim = A.copy()
    S_cum = A.copy()                 # store cumulated similarity matrix
    del A
    S_perturb = csr_matrix((N, N))   # store cumulated perturbation since the last rerun
    loss_rerun = Loss_store[0]       # store objective function value of the last rerun

    for i in range(1, time_slice + 1):
        # create the change in the adjacency matrix
        fn = os.path.join(input_base_path, f_list[i])
        S_add = get_sp_delta_adj_mat(S_cum, fn, node2idx_dict, sep=sep)
        S_perturb = S_perturb + S_add

        if Update:
            # Some updating function here. We use TRIP as an example, while other variants
            # are permitted (as discussed in the paper). Note that TRIP doesn't ensure a smaller loss value.
            [U[i], S[i], V[i]] = TRIP(U[i - 1], S[i - 1], V[i - 1], S_add)
            U_cur = np.dot(U[i], np.sqrt(S[i]))
            V_cur = np.dot(V[i], np.sqrt(S[i]))
            Loss_store[i] = Obj(S_cum + S_add, U_cur, V_cur)
        else:
            Loss_store[i] = Obj_SimChange(S_cum, S_add, U_cur, V_cur, Loss_store[i - 1])
        Loss_bound[i] = RefineBound(Sim, S_perturb, loss_rerun, K)
        S_cum = S_cum + S_add
        print('time = ', i + 1, ', loss = ', Loss_store[i], ', loss_bound = ', Loss_bound[i])

        # rerun SVD from scratch when the loss exceeds the bound by a factor of (1 + Theta)
        if Loss_store[i] >= (1 + Theta) * Loss_bound[i]:
            print('Begin rerun at time stamp:', str(i + 1))
            Sim = S_cum.copy()
            S_perturb = csr_matrix((N, N))
            run_times = run_times + 1
            Run_t[run_times] = i

            u, s, vt = svds(Sim, K)
            s = np.diag(s)
            v = vt.transpose()
            U[i], S[i], V[i] = u, s, v
            U_cur = np.dot(U[i], np.sqrt(S[i]))
            V_cur = np.dot(V[i], np.sqrt(S[i]))
            loss_rerun = Obj(Sim, U_cur, V_cur)
            Loss_store[i] = loss_rerun
            Loss_bound[i] = loss_rerun
            print('time = ', i + 1, ', loss = ', Loss_store[i], ', loss_bound = ', Loss_bound[i])

        assert U_cur.shape[0] == V_cur.shape[0]
        assert U_cur.shape[1] == V_cur.shape[1]
        output_data = np.hstack((U_cur, V_cur))
        assert output_data.shape[0] == N
        result = pd.DataFrame(data=output_data, index=full_node_list, columns=range(2 * dim))
        result.to_csv(os.path.join(output_base_path, f_list[i]), sep=sep)

    del S_cum, S_perturb, Sim
    del loss_rerun
    del U_cur, V_cur
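
# Hypothetical usage sketch (the paths below are placeholders, not taken from the repo):
# every file in input_base_path is one graph snapshot, and the stacked [U | V] embedding of
# each snapshot is written to output_base_path under the same file name.
def _demo_run_timers():
    timers(nodes_file='data/nodes_set/nodes.csv',
           input_base_path='data/edge_lists',
           output_base_path='data/timers_output',
           Theta=0.17, dim=128, sep='\t', Update=True)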