Example #1
from scipy.sparse import lil_matrix, find


def get_sp_delta_adj_mat(A_pre, file_path, node2idx_dict, sep='\t'):
    """Build the sparse delta matrix A_cur - A_pre between two graph snapshots."""
    N = len(node2idx_dict)
    A_cur = get_sp_adj_mat(file_path, node2idx_dict, sep=sep)
    delta_A = lil_matrix((N, N))
    pre_row, pre_col, pre_values = find(A_pre)
    cur_row, cur_col, cur_values = find(A_cur)
    pre_edge_num = pre_row.size
    cur_edge_num = cur_row.size

    pre_dict = dict(zip(zip(pre_row, pre_col), pre_values))
    cur_dict = dict(zip(zip(cur_row, cur_col), cur_values))
    # edges present in the previous snapshot: weight changed or edge removed
    for idx in range(pre_edge_num):
        edge = (pre_row[idx], pre_col[idx])
        if edge in cur_dict:
            delta_A[edge] = cur_dict[edge] - pre_dict[edge]
        else:
            delta_A[edge] = -pre_dict[edge]

    # edges only present in the current snapshot: newly added
    for idx in range(cur_edge_num):
        edge = (cur_row[idx], cur_col[idx])
        if edge in pre_dict:
            delta_A[edge] = cur_dict[edge] - pre_dict[edge]
        else:
            delta_A[edge] = cur_dict[edge]

    delta_A = delta_A.tocsr()
    return delta_A
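
For reference, the two loops above amount to an elementwise difference of the two sparse snapshots. Below is a minimal self-contained sketch of the same semantics on toy matrices (independent of get_sp_adj_mat, which loads a snapshot from disk):

import numpy as np
from scipy.sparse import csr_matrix, find

A_pre = csr_matrix(np.array([[0., 1.], [1., 0.]]))
A_cur = csr_matrix(np.array([[0., 2.], [0., 0.]]))

delta = A_cur - A_pre  # the same entries the loops above produce
rows, cols, vals = find(delta)
print(list(zip(rows, cols, vals)))  # edge (0, 1): +1.0, edge (1, 0): -1.0
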
Example #2
def get_degree_feature_list(self, origin_base_path, start_idx, duration, sep='\t', init_type='gaussian', std=1e-4):
    assert init_type in ['gaussian', 'adj', 'combine', 'one-hot']
    x_list = []
    max_degree = 0
    adj_list = []
    degree_list = []
    date_dir_list = sorted(os.listdir(origin_base_path))
    # find the maximal degree over the list of graphs
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        original_graph_path = os.path.join(origin_base_path, date_dir_list[i])
        adj = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
        adj_list.append(adj)
        # flatten the (N, 1) degree matrix into a 1-d integer array
        degrees = np.asarray(adj.sum(axis=1)).flatten().astype(int)
        max_degree = max(max_degree, degrees.max())
        degree_list.append(degrees)
    # generate degree-based features
    input_dim = 0
    for i, degrees in enumerate(degree_list):
        # other structural feature initialization techniques can also be tried to improve performance
        if init_type == 'gaussian':
            # one Gaussian feature vector per node, centered at its degree
            fea_list = []
            for degree in degrees:
                fea_list.append(np.random.normal(degree, std, max_degree + 1))
            fea_arr = np.array(fea_list).astype(np.float32)
            input_dim = fea_arr.shape[1]
            fea_tensor = torch.from_numpy(fea_arr).float()
            x_list.append(fea_tensor.cuda() if self.has_cuda else fea_tensor)
        elif init_type == 'adj':
            # use each node's (sparse) adjacency row as its feature
            input_dim = self.node_num
            feat_tensor = sparse_mx_to_torch_sparse_tensor(adj_list[i])
            x_list.append(feat_tensor.cuda() if self.has_cuda else feat_tensor)
        elif init_type == 'combine':
            # concatenate Gaussian degree features with adjacency rows
            fea_list = []
            for degree in degrees:
                fea_list.append(np.random.normal(degree, std, max_degree + 1))
            sp_feat = sp.coo_matrix(np.array(fea_list))
            sp_feat = sp.hstack((sp_feat, adj_list[i])).astype(np.float32)
            input_dim = sp_feat.shape[1]
            feat_tensor = sparse_mx_to_torch_sparse_tensor(sp_feat)
            x_list.append(feat_tensor.cuda() if self.has_cuda else feat_tensor)
        else:  # one-hot degree feature
            data = np.ones(degrees.shape[0], dtype=int)
            row = np.arange(degrees.shape[0])
            col = degrees  # degree values index the one-hot columns
            spmat = sp.csr_matrix((data, (row, col)), shape=(degrees.shape[0], max_degree + 1))
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            x_list.append(sptensor.cuda() if self.has_cuda else sptensor)
            input_dim = max_degree + 1
    return x_list, input_dim
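
To make the one-hot branch concrete, here is a small standalone illustration on a toy degree vector (the names mirror the method above; the values are made up for illustration):

import numpy as np
import scipy.sparse as sp

degrees = np.array([2, 0, 1, 2])  # toy per-node degrees
max_degree = degrees.max()
data = np.ones(degrees.shape[0], dtype=int)
row = np.arange(degrees.shape[0])
spmat = sp.csr_matrix((data, (row, degrees)), shape=(degrees.shape[0], max_degree + 1))
print(spmat.toarray())
# [[0 0 1]
#  [1 0 0]
#  [0 1 0]
#  [0 0 1]]
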
Example #3
def get_walk_info(self, f_name, original_graph_path, sep='\t', weighted=True):
    print('f_name = ', f_name)
    t1 = time.time()
    spadj = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
    rw.random_walk(spadj, self.walk_pair_base_path, self.node_freq_base_path,
                   f_name, self.walk_length, self.walk_time, weighted)
    t2 = time.time()
    print('random walk total time:', t2 - t1, 'seconds')

def generate_node_similarity(self, file):
    """Implementation of 'Vertex similarity in networks'
    (https://arxiv.org/abs/physics/0510143)."""
    print('file = ', file)
    file_path = os.path.join(self.input_base_path, file)
    date = file.split('.')[0]
    output_file_path = os.path.join(self.output_base_path, date + '_similarity.npz')
    A = get_sp_adj_mat(file_path, self.full_node_list, sep=self.file_sep)
    A = A.tocsr()
    # largest-magnitude eigenvalue of the adjacency matrix
    lambda_1 = scipy.sparse.linalg.eigsh(A, k=1, which='LM', return_eigenvectors=False)[0]
    print('lambda 1: ', lambda_1)
    rows, cols = A.nonzero()
    edge_num = rows.shape[0]
    n = A.shape[0]
    d = np.array(A.sum(1)).flatten()
    d_inv = np.zeros(n)  # dtype is float
    indices = np.where(d > 0)[0]
    d_inv[indices] = 1. / d[indices]
    d_inv = np.diag(d_inv)
    # dsd = np.random.normal(0, 1 / np.sqrt(n), (n, n))  # alternative random init
    dsd = np.zeros((n, n))
    I = np.eye(n)
    # fixed-point iteration for the Katz-style series sum_k (alpha / lambda_1 * A)^k
    for i in range(self.iter_num):
        dsd = self.alpha / lambda_1 * A.dot(dsd) + I
        if i % 10 == 0:
            print('VS', i, '/', self.iter_num)
    # the degree-normalized variant (kept for reference):
    # coeff = 2 * edge_num * lambda_1
    # S = d_inv.dot(dsd).dot(d_inv)
    S = dsd
    S = (S + S.T) / 2  # symmetrize
    S = S - I  # drop the k = 0 (identity) term of the series
    S = (S - S.min()) / (S.max() - S.min())  # min-max normalization
    print('S max: ', S.max(), ', min: ', S.min())
    eps = 1e-6
    S[S < eps] = 0  # sparsify by zeroing tiny similarities
    S = sp.coo_matrix(S)
    sp.save_npz(output_file_path, S)
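
As a sanity check on the iteration above: starting from zero, repeatedly applying S <- (alpha / lambda_1) * A @ S + I converges to (I - alpha / lambda_1 * A)^{-1}, i.e. the Katz-style series, whenever alpha < 1. A minimal numeric sketch on a toy 3-node graph (the matrix and alpha are assumed values, not from the original code):

import numpy as np

A = np.array([[0., 1., 1.],
              [1., 0., 0.],
              [1., 0., 0.]])
lambda_1 = np.abs(np.linalg.eigvalsh(A)).max()  # largest-magnitude eigenvalue
alpha = 0.5  # must be < 1 for convergence

S = np.zeros_like(A)
for _ in range(200):
    S = alpha / lambda_1 * A @ S + np.eye(3)

closed_form = np.linalg.inv(np.eye(3) - alpha / lambda_1 * A)
print(np.allclose(S, closed_form))  # True
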
Example #5
def get_date_adj_list(self, origin_base_path, start_idx, duration, sep='\t', normalize=False, row_norm=False, add_eye=False, data_type='tensor'):
    assert data_type in ['tensor', 'matrix']
    date_dir_list = sorted(os.listdir(origin_base_path))
    date_adj_list = []
    for i in range(start_idx, min(start_idx + duration, self.max_time_num)):
        original_graph_path = os.path.join(origin_base_path, date_dir_list[i])
        spmat = get_sp_adj_mat(original_graph_path, self.full_node_list, sep=sep)
        # an exp-weighted variant:
        # spmat = sp.coo_matrix((np.exp(alpha * spmat.data), (spmat.row, spmat.col)), shape=(self.node_num, self.node_num))
        if add_eye:
            spmat = spmat + sp.eye(spmat.shape[0])  # add self-loops
        if normalize:
            spmat = get_normalized_adj(spmat, row_norm=row_norm)
        # convert to the requested data type
        if data_type == 'tensor':
            sptensor = sparse_mx_to_torch_sparse_tensor(spmat)
            date_adj_list.append(sptensor.cuda() if self.has_cuda else sptensor)
        else:  # data_type == 'matrix'
            date_adj_list.append(spmat)
    return date_adj_list
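
The helper sparse_mx_to_torch_sparse_tensor is referenced throughout these examples but not shown. A common implementation looks like the following; this is a sketch of the usual pattern, not necessarily the exact helper used here:

import numpy as np
import scipy.sparse as sp
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse COO tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(sparse_mx.shape))
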
Example #6
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# get_sp_adj_mat, get_sp_delta_adj_mat, TRIP, Obj, Obj_SimChange and
# RefineBound are helper functions defined elsewhere in the repository.


def timers(nodes_file, input_base_path, output_base_path, Theta=0.17, dim=128, sep='\t', Update=True):
    if not os.path.exists(output_base_path):
        os.makedirs(output_base_path)
    # node set
    nodes_set = pd.read_csv(nodes_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    N = len(full_node_list)
    print('node num: ', N)
    node2idx_dict = dict(zip(full_node_list, np.arange(N)))
    # adjacency matrix of the base (first) graph snapshot
    f_list = sorted(os.listdir(input_base_path))
    f0 = os.path.join(input_base_path, f_list[0])
    A = get_sp_adj_mat(f0, node2idx_dict, sep=sep)

    # begin TIMERS
    K = dim
    time_slice = len(f_list) - 1

    # store all intermediate results
    U = [[] for _ in range(time_slice + 10)]
    S = [[] for _ in range(time_slice + 10)]
    V = [[] for _ in range(time_slice + 10)]
    Loss_store = np.zeros(shape=time_slice + 10)  # loss at each time stamp
    Loss_bound = np.zeros(shape=time_slice + 10)  # loss bound at each time stamp
    run_times = 0  # how many times the SVD was rerun
    Run_t = np.zeros(shape=time_slice + 10)  # which time slices triggered a rerun

    # Calculate Static Solution
    u, s, vt = svds(A, K)
    s = np.diag(s)
    v = vt.transpose()
    U[0], S[0], V[0] = u, s, v
    U_cur = np.dot(U[0], np.sqrt(S[0]))
    V_cur = np.dot(V[0], np.sqrt(S[0]))
    Loss_store[0] = Obj(A, U_cur, V_cur)
    Loss_bound[0] = Loss_store[0]
    output_data = np.hstack((U_cur, V_cur))
    assert output_data.shape[0] == N
    result = pd.DataFrame(data=output_data, index=full_node_list, columns=range(2 * dim))
    result.to_csv(os.path.join(output_base_path, f_list[0]), sep=sep)
    print('time = 1, loss = ', Loss_store[0], ', loss_bound=', Loss_bound[0])
    # store some useful variables
    Sim = A.copy()
    S_cum = A.copy()  # cumulative similarity matrix
    del A

    S_perturb = csr_matrix((N, N))  # cumulative perturbation since the last rerun
    loss_rerun = Loss_store[0]  # objective function value at the last rerun
    for i in range(1, time_slice + 1):
        # compute the change in the adjacency matrix
        fn = os.path.join(input_base_path, f_list[i])
        S_add = get_sp_delta_adj_mat(S_cum, fn, node2idx_dict, sep=sep)
        S_perturb = S_perturb + S_add
        if Update:
            # some updating function goes here; we use TRIP as an example, while
            # other variants are permitted (as discussed in the paper);
            # note that TRIP does not guarantee a smaller loss value
            [U[i], S[i], V[i]] = TRIP(U[i - 1], S[i - 1], V[i - 1], S_add)
            U_cur = np.dot(U[i], np.sqrt(S[i]))
            V_cur = np.dot(V[i], np.sqrt(S[i]))
            Loss_store[i] = Obj(S_cum + S_add, U_cur, V_cur)
        else:
            Loss_store[i] = Obj_SimChange(S_cum, S_add, U_cur, V_cur, Loss_store[i - 1])

        Loss_bound[i] = RefineBound(Sim, S_perturb, loss_rerun, K)
        S_cum = S_cum + S_add
        if Loss_store[i] >= (1 + Theta) * Loss_bound[i]:
            print('Begin rerun at time stamp:', str(i + 1))
            Sim = S_cum.copy()
            S_perturb = csr_matrix((N, N))
            run_times = run_times + 1
            Run_t[run_times] = i

            u, s, vt = svds(Sim, K)
            s = np.diag(s)
            v = vt.transpose()

            U[i], S[i], V[i] = u, s, v

            U_cur = np.dot(U[i], np.sqrt(S[i]))
            V_cur = np.dot(V[i], np.sqrt(S[i]))
            loss_rerun = Obj(Sim, U_cur, V_cur)
            Loss_store[i] = loss_rerun
            Loss_bound[i] = loss_rerun
        print('time = ', i + 1, ', loss = ', Loss_store[i], ', loss_bound=', Loss_bound[i])
        assert U_cur.shape[0] == V_cur.shape[0]
        assert U_cur.shape[1] == V_cur.shape[1]
        output_data = np.hstack((U_cur, V_cur))
        assert output_data.shape[0] == N
        result = pd.DataFrame(data=output_data, index=full_node_list, columns=range(2 * dim))
        result.to_csv(os.path.join(output_base_path, f_list[i]), sep=sep)
    del S_cum, S_perturb, Sim
    del loss_rerun
    del U_cur, V_cur
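
A hypothetical invocation of timers (all paths are placeholders for illustration; the input directory is expected to hold one edge-list file per snapshot, consumed in sorted order):

if __name__ == '__main__':
    timers(nodes_file='data/nodes.csv',
           input_base_path='data/graphs',
           output_base_path='data/timers_output',
           Theta=0.17, dim=128, sep='\t', Update=True)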