Example #1
0
def get_graph_from_edges(file_path,
                         node_file,
                         output_node_dir,
                         output_edge_dir,
                         sep='\t',
                         edge_num_list=None):
    """Write randomly edge-sampled subgraphs of an edge list to disk.

    For the i-th requested size, a random subset of edge rows is drawn and two
    tab-separated files named ``<i>.csv`` are written: the unique endpoints of
    the sampled edges (column ``node``) into ``output_node_dir`` and the
    sampled edge rows into ``output_edge_dir``.

    Args:
        file_path: Edge file with a header row; it must contain the columns
            ``from_id`` and ``to_id``.
        node_file: Unused; kept so the signature stays parallel with
            existing callers.
        output_node_dir: Directory receiving the node files.
        output_edge_dir: Directory receiving the edge files.
        sep: Field separator of the input edge file.
        edge_num_list: Sample sizes, one output pair per entry. Defaults to
            ``[50, 100, 500, 1000, 5000, 10000, 70000]``.
    """
    import random
    if edge_num_list is None:
        edge_num_list = [50, 100, 500, 1000, 5000, 10000, 70000]
    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    all_edge_num = df_edges.shape[0]
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)
    edge_indices = np.arange(all_edge_num).tolist()
    for i, edge_num in enumerate(edge_num_list):
        # random.sample raises ValueError when asked for more items than the
        # population holds, so never request more edges than the file has.
        sample_size = min(edge_num, all_edge_num)
        sample_edge_indices = random.sample(edge_indices, sample_size)
        df_subgraph = df_edges.loc[sample_edge_indices, :]
        # Unique endpoints, in order of first appearance (sources first).
        node_list = pd.unique(
            pd.concat([df_subgraph['from_id'], df_subgraph['to_id']],
                      axis=0)).tolist()
        df_nodes = pd.DataFrame(node_list, columns=['node'])
        out_name = str(i) + '.csv'
        df_nodes.to_csv(os.path.join(output_node_dir, out_name),
                        sep='\t',
                        index=False)
        df_subgraph.to_csv(os.path.join(output_edge_dir, out_name),
                           sep='\t',
                           index=False)
Example #2
0
    def __init__(self,
                 base_path,
                 origin_folder,
                 walk_pair_folder,
                 node_freq_folder,
                 node_file,
                 walk_time=100,
                 walk_length=5):
        """Record data locations, the node list and the walk settings.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored (absolute) as
                ``origin_base_path``.
            walk_pair_folder: Sub-folder stored (absolute) as
                ``walk_pair_base_path``.
            node_freq_folder: Sub-folder stored (absolute) as
                ``node_freq_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            walk_time: Stored as ``self.walk_time``.
            walk_length: Stored as ``self.walk_length``.
        """
        def to_abs(relative_name):
            return os.path.abspath(os.path.join(base_path, relative_name))

        self.base_path = base_path
        self.walk_time, self.walk_length = walk_time, walk_length

        self.origin_base_path = to_abs(origin_folder)
        self.walk_pair_base_path = to_abs(walk_pair_folder)
        self.node_freq_base_path = to_abs(node_freq_folder)

        node_frame = pd.read_csv(to_abs(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        # Only the two output folders are passed to check_and_make_path
        # (presumably it creates them when missing -- helper not shown here).
        for folder_path in (self.walk_pair_base_path,
                            self.node_freq_base_path):
            check_and_make_path(folder_path)
Example #3
0
    def get_kcore_graph(self,
                        input_file,
                        output_dir,
                        sep='\t',
                        core_list=None,
                        degree_list=None):
        """Save the adjacency matrix of every k-core of one graph file.

        The graph is loaded from ``origin_base_path/input_file``; for each
        ``k`` from 1 to the largest core number, the k-core is extracted,
        padded with all nodes of ``self.full_node_list`` and its sparse
        adjacency matrix is written to ``output_dir`` as ``<signature>.npz``.

        Args:
            input_file: File name inside ``self.origin_base_path``.
            output_dir: Directory receiving the ``.npz`` files.
            sep: Separator forwarded to ``get_nx_graph``.
            core_list: Currently unused.
            degree_list: Currently unused.
        """
        graph_path = os.path.join(self.origin_base_path, input_file)
        graph = get_nx_graph(graph_path, self.full_node_list, sep=sep)

        core_num_dict = nx.core_number(graph)
        core_values = list(core_num_dict.values())
        unique_core_count = len(np.unique(np.array(core_values)))
        print("unique core nums: ", unique_core_count)
        max_core_num = max(core_values)
        print('file name: ', input_file, 'max core num: ', max_core_num)

        check_and_make_path(output_dir)

        # get_format_str presumably yields a zero-padded pattern sized for
        # max_core_num -- helper is defined elsewhere.
        format_str = get_format_str(max_core_num)
        for k in range(1, max_core_num + 1):
            k_core_graph = nx.k_core(graph, k=k, core_number=core_num_dict)
            # Re-add every known node so all matrices share the same shape.
            k_core_graph.add_nodes_from(self.full_node_list)
            # The explicit nodelist is quite important: without it the
            # row/column order of the adjacency matrix changes, causing bugs.
            adj_matrix = nx.to_scipy_sparse_matrix(
                k_core_graph, nodelist=self.full_node_list)
            npz_name = format_str.format(k) + '.npz'
            sp.save_npz(os.path.join(output_dir, npz_name), adj_matrix)
Example #4
0
def aggregate_results(base_path, lp_res_folder, start_idx, rep_num, method_list, measure_list):
    """Merge repeated link prediction records into per-measure summaries.

    For each method, ``<method>_auc_record.csv`` is read from the folders
    ``<lp_res_folder>_<i>`` for ``i`` in ``[start_idx, start_idx + rep_num)``.
    For each measure one table is written to
    ``<lp_res_folder>/<method>_<measure>_record.csv`` holding ``date``, one
    column per repetition and the row-wise ``avg``, ``max`` and ``min``.

    Nothing happens when ``rep_num`` is not positive.
    """
    if rep_num <= 0:
        return
    for method in method_list:
        record_name = method + '_auc_record.csv'
        measure_df_dict = {}
        for rep_idx in range(start_idx, start_idx + rep_num):
            rep_suffix = '_' + str(rep_idx)
            rep_path = os.path.join(base_path, lp_res_folder + rep_suffix, record_name)
            rep_columns = ['date'] + [measure + rep_suffix for measure in measure_list]
            df_rep = pd.read_csv(rep_path, sep=',', header=0, names=rep_columns)
            for measure in measure_list:
                rep_column = measure + rep_suffix
                if rep_idx == start_idx:
                    # The first repetition seeds the table and carries 'date'.
                    measure_df_dict[measure] = df_rep.loc[:, ['date', rep_column]].copy()
                else:
                    measure_df_dict[measure] = pd.concat(
                        [measure_df_dict[measure], df_rep.loc[:, [rep_column]]], axis=1)

        output_base_path = os.path.join(base_path, lp_res_folder)
        check_and_make_path(output_base_path)

        for measure in measure_list:
            rep_column_names = [measure + '_' + str(rep_idx)
                                for rep_idx in range(start_idx, start_idx + rep_num)]
            df_measure = measure_df_dict[measure]
            # Statistics only cover the per-repetition columns.
            rep_values = df_measure.loc[:, rep_column_names]
            df_measure['avg'] = rep_values.mean(axis=1)
            df_measure['max'] = rep_values.max(axis=1)
            df_measure['min'] = rep_values.min(axis=1)
            record_path = os.path.join(output_base_path, method + '_' + measure + '_record.csv')
            df_measure.to_csv(record_path, sep=',', index=False)
Example #5
0
    def __init__(self, base_path, origin_folder, core_folder, node_file):
        """Record data locations and load the full node list.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored (absolute) as
                ``origin_base_path``.
            core_folder: Sub-folder stored (absolute) as ``core_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
        """
        def to_abs(relative_name):
            return os.path.abspath(os.path.join(base_path, relative_name))

        self.base_path = base_path
        self.origin_base_path = to_abs(origin_folder)
        self.core_base_path = to_abs(core_folder)

        node_frame = pd.read_csv(to_abs(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.node_num = len(self.full_node_list)

        check_and_make_path(self.core_base_path)
Example #6
0
def copy_labels():
    """Run ``copy_node_labels`` for the America and Europe air datasets.

    For each dataset, ``labels.csv`` from its ``nodes_set`` directory is
    handed to ``copy_node_labels`` together with the dataset's
    ``nodes_label`` output directory.
    """
    label_file = 'labels.csv'
    dataset_dirs = (
        ('/data/america_air/nodes_set', '/data/america_air/nodes_label'),
        ('/data/europe_air/nodes_set', '/data/europe_air/nodes_label'),
    )
    for input_dir, output_dir in dataset_dirs:
        label_path = os.path.join(input_dir, label_file)
        check_and_make_path(output_dir)
        copy_node_labels(label_path, output_dir)
    def __init__(self, base_path, input_folder, output_folder, node_file, file_sep='\t', alpha=0.5, iter_num=100):
        """Record data locations, the node list and the iteration settings.

        Args:
            base_path: Root directory that every other path is joined onto.
            input_folder: Sub-folder stored (absolute) as ``input_base_path``.
            output_folder: Sub-folder stored (absolute) as
                ``output_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            file_sep: Stored as ``self.file_sep``.
            alpha: Must lie strictly between 0 and 1 (checked with ``assert``).
            iter_num: Stored as ``self.iter_num``.
        """
        def to_abs(relative_name):
            return os.path.abspath(os.path.join(base_path, relative_name))

        self.base_path = base_path
        self.file_sep = file_sep
        self.input_base_path = to_abs(input_folder)
        self.output_base_path = to_abs(output_folder)

        node_frame = pd.read_csv(to_abs(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.node_num = len(self.full_node_list)

        self.alpha, self.iter_num = alpha, iter_num
        assert 0 < self.alpha < 1

        for folder_path in (self.input_base_path, self.output_base_path):
            check_and_make_path(folder_path)
Example #8
0
    def __init__(self, base_path, origin_folder, embedding_folder, node_list, model, loss, model_folder='model', file_sep='\t', has_cuda=False):
        """Record data locations, the model, the loss and the target device.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored (absolute) as
                ``origin_base_path``; its sorted directory listing becomes
                ``timestamp_list``.
            embedding_folder: Sub-folder stored (absolute) as
                ``embedding_base_path``.
            node_list: Full node list, stored as ``full_node_list``.
            model: Stored as ``self.model``.
            loss: Stored as ``self.loss``.
            model_folder: Sub-folder stored (absolute) as ``model_base_path``.
            file_sep: Stored as ``self.file_sep``.
            has_cuda: When truthy the device is the first CUDA device,
                otherwise the CPU.
        """
        # file paths
        self.base_path = base_path
        self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
        self.embedding_base_path = os.path.abspath(os.path.join(base_path, embedding_folder))
        self.model_base_path = os.path.abspath(os.path.join(base_path, model_folder))
        self.has_cuda = has_cuda
        # The device string must be 'cuda:0' without a space: strict device
        # string parsing rejects 'cuda: 0' as an invalid device string.
        self.device = torch.device('cuda:0' if has_cuda else 'cpu')
        self.model = model
        self.loss = loss

        self.file_sep = file_sep
        self.full_node_list = node_list
        self.node_num = len(self.full_node_list)  # node num
        self.timestamp_list = sorted(os.listdir(self.origin_base_path))

        check_and_make_path(self.embedding_base_path)
        check_and_make_path(self.model_base_path)
Example #9
0
    def __init__(self, base_path, input_folder, output_folder, node_file, file_sep='\t', train_ratio=0.5, val_ratio=0.2, test_ratio=0.3):
        """Record data locations, the node index and the split ratios.

        Args:
            base_path: Root directory that every other path is joined onto.
            input_folder: Sub-folder stored as ``input_base_path``.
            output_folder: Sub-folder stored as ``output_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            file_sep: Stored as ``self.file_sep``.
            train_ratio: Share of the training split.
            val_ratio: Share of the validation split.
            test_ratio: Share of the test split.

        The three ratios must not sum to more than 1.0 (checked with
        ``assert``).
        """
        def under_base(relative_name):
            return os.path.join(base_path, relative_name)

        self.base_path = base_path
        self.file_sep = file_sep
        self.input_base_path = under_base(input_folder)
        self.output_base_path = under_base(output_folder)

        node_frame = pd.read_csv(under_base(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.node_num = len(self.full_node_list)
        # Maps each node to its position in full_node_list.
        node_positions = np.arange(self.node_num)
        self.node2idx_dict = dict(zip(self.full_node_list, node_positions))

        assert train_ratio + test_ratio + val_ratio <= 1.0
        self.train_ratio, self.test_ratio, self.val_ratio = train_ratio, test_ratio, val_ratio

        for folder_path in (self.input_base_path, self.output_base_path):
            check_and_make_path(folder_path)
Example #10
0
def aggregate_results(base_path, edgecls_res_folder, start_idx, rep_num, method_list):
    """Merge repeated edge classification records into one summary per method.

    For each method, ``<method>_acc_record.csv`` is read from the folders
    ``<edgecls_res_folder>_<i>`` for ``i`` in
    ``[start_idx, start_idx + rep_num)``. The accuracy columns are placed side
    by side, row-wise ``avg``, ``max`` and ``min`` are appended, and the
    table is written to ``<edgecls_res_folder>/<method>_acc_record.csv``.

    Nothing happens when ``rep_num`` is not positive.
    """
    if rep_num <= 0:
        return
    for method in method_list:
        record_name = method + '_acc_record.csv'
        df_method = None
        for rep_idx in range(start_idx, start_idx + rep_num):
            rep_path = os.path.join(base_path, edgecls_res_folder + '_' + str(rep_idx), record_name)
            df_rep = pd.read_csv(rep_path, sep=',', header=0, names=['date', 'acc_' + str(rep_idx)])
            if rep_idx == start_idx:
                # The first repetition seeds the table and carries 'date'.
                df_method = df_rep
            else:
                df_method = pd.concat([df_method, df_rep.iloc[:, [1]]], axis=1)

        output_base_path = os.path.join(base_path, edgecls_res_folder)
        check_and_make_path(output_base_path)

        acc_list = ['acc_' + str(rep_idx) for rep_idx in range(start_idx, start_idx + rep_num)]
        # Statistics only cover the per-repetition accuracy columns.
        rep_scores = df_method.loc[:, acc_list]
        df_method['avg'] = rep_scores.mean(axis=1)
        df_method['max'] = rep_scores.max(axis=1)
        df_method['min'] = rep_scores.min(axis=1)
        df_method.to_csv(os.path.join(output_base_path, record_name), sep=',', index=False)
Example #11
0
    def __init__(self,
                 base_path,
                 origin_folder,
                 embedding_folder,
                 centrality_folder,
                 output_folder,
                 node_file,
                 file_sep='\t',
                 alpha_list=None,
                 split_fold=5):
        """Record data locations, the node list and the evaluation settings.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored (absolute) as
                ``origin_base_path``.
            embedding_folder: Sub-folder stored (absolute) as
                ``embedding_base_path``.
            centrality_folder: Sub-folder stored (absolute) as
                ``centrality_base_path``.
            output_folder: Sub-folder stored (absolute) as
                ``output_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            file_sep: Stored as ``self.file_sep``.
            alpha_list: Stored as ``self.alpha_list``.
            split_fold: Stored as ``self.split_fold``.
        """
        def to_abs(relative_name):
            return os.path.abspath(os.path.join(base_path, relative_name))

        self.base_path = base_path
        self.file_sep = file_sep
        self.origin_base_path = to_abs(origin_folder)
        self.embedding_base_path = to_abs(embedding_folder)
        self.centrality_base_path = to_abs(centrality_folder)
        self.output_base_path = to_abs(output_folder)

        node_frame = pd.read_csv(to_abs(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.alpha_list, self.split_fold = alpha_list, split_fold

        # The centrality folder is deliberately left out here, as in the
        # original call sequence.
        for folder_path in (self.embedding_base_path,
                            self.origin_base_path,
                            self.output_base_path):
            check_and_make_path(folder_path)
    def __init__(self, base_path, origin_folder, embedding_folder, similarity_folder, output_folder, node_file, file_sep='\t'):
        """Record data locations and load the full node list.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored (absolute) as
                ``origin_base_path``.
            embedding_folder: Sub-folder stored (absolute) as
                ``embedding_base_path``.
            similarity_folder: Sub-folder stored (absolute) as
                ``similarity_base_path``.
            output_folder: Sub-folder stored (absolute) as
                ``output_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            file_sep: Stored as ``self.file_sep``.
        """
        def to_abs(relative_name):
            return os.path.abspath(os.path.join(base_path, relative_name))

        self.base_path = base_path
        self.file_sep = file_sep
        self.origin_base_path = to_abs(origin_folder)
        self.embedding_base_path = to_abs(embedding_folder)
        self.similarity_base_path = to_abs(similarity_folder)
        self.output_base_path = to_abs(output_folder)

        node_frame = pd.read_csv(to_abs(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        # The similarity folder is not passed to check_and_make_path.
        for folder_path in (self.embedding_base_path,
                            self.origin_base_path,
                            self.output_base_path):
            check_and_make_path(folder_path)
Example #13
0
    def __init__(self, base_path, origin_folder, embedding_folder, lp_edge_folder, output_folder, node_file, file_sep='\t', C_list=None, measure_list=None, max_iter=5000):
        """Record data locations, the node list and the classifier settings.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored as ``origin_base_path``.
            embedding_folder: Sub-folder stored as ``embedding_base_path``.
            lp_edge_folder: Sub-folder stored as ``lp_edge_base_path``.
            output_folder: Sub-folder stored as ``output_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            file_sep: Stored as ``self.file_sep``.
            C_list: Stored as ``self.C_list``.
            measure_list: Stored as ``self.measure_list``.
            max_iter: Stored as ``self.max_iter``.
        """
        def under_base(relative_name):
            return os.path.join(base_path, relative_name)

        self.base_path = base_path
        self.file_sep = file_sep
        self.measure_list = measure_list
        self.origin_base_path = under_base(origin_folder)
        self.embedding_base_path = under_base(embedding_folder)
        self.lp_edge_base_path = under_base(lp_edge_folder)
        self.output_base_path = under_base(output_folder)

        node_frame = pd.read_csv(under_base(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()
        self.C_list, self.max_iter = C_list, max_iter

        # The link prediction edge folder is not passed to
        # check_and_make_path.
        for folder_path in (self.embedding_base_path,
                            self.origin_base_path,
                            self.output_base_path):
            check_and_make_path(folder_path)
Example #14
0
    def __init__(self, base_path, origin_folder, embedding_folder, edgeclas_folder, output_folder, node_file, label_folder, file_sep='\t', C_list=None, max_iter=5000):
        """Record data locations, node list, label set and classifier settings.

        Args:
            base_path: Root directory that every other path is joined onto.
            origin_folder: Sub-folder stored (absolute) as
                ``origin_base_path``.
            embedding_folder: Sub-folder stored (absolute) as
                ``embedding_base_path``.
            edgeclas_folder: Sub-folder stored (absolute) as
                ``edgeclas_base_path``.
            output_folder: Sub-folder stored (absolute) as
                ``output_base_path``.
            node_file: File under ``base_path`` read with pandas into a single
                ``node`` column; no header row is consumed.
            label_folder: Sub-folder that must contain at least one label
                file with a ``label`` column.
            file_sep: Stored as ``self.file_sep``; also used to parse the
                label file.
            C_list: Stored as ``self.C_list``.
            max_iter: Stored as ``self.max_iter``.
        """
        def to_abs(relative_name):
            return os.path.abspath(os.path.join(base_path, relative_name))

        self.base_path = base_path
        self.file_sep = file_sep
        self.origin_base_path = to_abs(origin_folder)
        self.embedding_base_path = to_abs(embedding_folder)
        self.edgeclas_base_path = to_abs(edgeclas_folder)
        self.output_base_path = to_abs(output_folder)

        node_frame = pd.read_csv(to_abs(node_file), names=['node'])
        self.full_node_list = node_frame['node'].tolist()

        self.label_base_path = to_abs(label_folder)
        label_files = os.listdir(self.label_base_path)
        assert len(label_files) > 0
        # The label set is taken from whichever file os.listdir returns first.
        first_label_path = os.path.join(self.label_base_path, label_files[0])
        label_frame = pd.read_csv(first_label_path, sep=file_sep)
        self.unique_labels = label_frame['label'].unique()
        self.C_list, self.max_iter = C_list, max_iter

        for folder_path in (self.embedding_base_path,
                            self.origin_base_path,
                            self.output_base_path):
            check_and_make_path(folder_path)
Example #15
0
def _bfs_sample_nodes(adj, start_node, node_num):
    """Collect up to ``node_num`` nodes breadth-first from ``start_node``."""
    visited = {start_node}
    sample_list = [start_node]
    front = 0
    # Stop when the quota is met OR the frontier is exhausted; the latter
    # happens when fewer than node_num nodes are reachable from start_node.
    while front < len(sample_list) and len(sample_list) < node_num:
        cur = sample_list[front]
        front += 1
        for neighbor in adj[cur]:
            if neighbor in visited:
                continue
            visited.add(neighbor)
            sample_list.append(neighbor)
            if len(sample_list) >= node_num:
                break
    return sample_list


def get_graph_from_nodes(file_path,
                         node_file,
                         output_node_dir,
                         output_edge_dir,
                         sep='\t'):
    """Write BFS-sampled subgraphs of increasing size, plus the full graph.

    For the i-th size in ``[50, 100, 500, 1000, 5000, 10000]`` a random start
    node is drawn from the largest connected component and nodes are gathered
    breadth-first until the size is reached or the component is exhausted.
    The node list (no header) goes to ``output_node_dir/<i>.csv`` and the
    induced weighted edges to ``output_edge_dir/<i>.csv``. Finally the full
    node list and the original edge table are written under the next index.

    Args:
        file_path: Edge file with a header row, read with ``sep``.
        node_file: Node file read with pandas into a single ``node`` column;
            no header row is consumed.
        output_node_dir: Directory receiving the node files.
        output_edge_dir: Directory receiving the edge files.
        sep: Field separator of the input edge file.
    """
    import random
    df_edges = pd.read_csv(file_path, sep=sep, header=0)
    nodes_set = pd.read_csv(node_file, names=['node'])
    full_node_list = nodes_set['node'].tolist()
    print('node number: ', len(full_node_list))
    check_and_make_path(output_node_dir)
    check_and_make_path(output_edge_dir)
    nx_graph = get_nx_graph(file_path, full_node_list, sep=sep)
    node_num_list = [50, 100, 500, 1000, 5000, 10000]
    max_cc = max(nx.connected_components(nx_graph), key=len)
    node_list = list(max_cc)
    print(node_list[:10])
    print(len(node_list))
    adj = nx_graph.adj
    for i, node_num in enumerate(node_num_list):
        start_node = random.sample(node_list, 1)[0]
        sample_list = _bfs_sample_nodes(adj, start_node, node_num)
        print('i = ', i, 'cnt = ', len(sample_list))
        nx_subgraph = nx_graph.subgraph(sample_list)
        out_name = str(i) + '.csv'
        df_nodes = pd.DataFrame(sample_list, columns=['node'])
        df_nodes.to_csv(os.path.join(output_node_dir, out_name),
                        sep='\t',
                        index=False,
                        header=False)
        # Walking the adjacency of an undirected graph emits every edge once
        # per direction.
        edge_list = []
        for node, neighbors in nx_subgraph.adj.items():
            for neighbor, edge_attr in neighbors.items():
                edge_list.append([node, neighbor, edge_attr['weight']])
        # reshape keeps a 3-column layout even when no edge was collected.
        edges_arr = np.array(edge_list).reshape(-1, 3)
        print('edges arr shape: ', edges_arr.shape[0])
        df_output = pd.DataFrame(edges_arr,
                                 columns=['from_id', 'to_id', 'weight'])
        df_output.to_csv(os.path.join(output_edge_dir, out_name),
                         sep='\t',
                         index=False)
    # The complete graph is stored under the index after the last sample.
    full_name = str(len(node_num_list)) + '.csv'
    df_nodes = pd.DataFrame(np.array(full_node_list), columns=['node'])
    df_nodes.to_csv(os.path.join(output_node_dir, full_name),
                    sep='\t',
                    index=False,
                    header=False)
    df_edges.to_csv(os.path.join(output_edge_dir, full_name),
                    sep='\t',
                    index=False)