Esempio n. 1
0
def main():
    """
    Preprocess the Facebook and Webspam graphs given on the command line.

    Exactly two command line arguments are expected: the path of the Facebook
    dataset and the path of the Webspam dataset. Each preprocessed graph is
    stored as "prep_graph_slfloops.edgelist" in the directory that contains
    the corresponding input file.
    """
    # Check cmd args
    if len(argv) != 3:
        print("ERROR: wrong number of parameters")
        print("Usage: prep_data_prune.py <facebook_path> <webspam_path>")
        exit(-1)

    # Only the directory part of each input path is needed for the outputs
    fb_path = os.path.dirname(argv[1])
    ws_path = os.path.dirname(argv[2])

    # Preprocess FB graph
    G1 = prep_fb(argv[1])

    # Store FB graph to a file. os.path.join is used instead of `dir + "/..."`
    # because the directory is empty when a bare filename is given, and plain
    # concatenation would then point at the filesystem root.
    pp.save_graph(G1,
                  output_path=os.path.join(fb_path,
                                           "prep_graph_slfloops.edgelist"),
                  delimiter=',',
                  write_stats=True)

    # Preprocess WS graph
    G2 = prep_ws(argv[2])

    # Store preprocessed WS graph to a file
    pp.save_graph(G2,
                  output_path=os.path.join(ws_path,
                                           "prep_graph_slfloops.edgelist"),
                  delimiter=',',
                  write_stats=True)

    print("Preprocessing finished.")
Esempio n. 2
0
def test_split():
    """
    Exercise train/test splitting and false edge generation on a sub-graph.

    Loads "./data/network.edgelist" as a directed graph, restricts it to a
    random sub-graph of at most `subgraph_size` nodes, preprocesses and stores
    that sub-graph, and then computes a train/test edge split together with
    the matching sets of false edges.
    """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 1000

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter="\t",
                      comments='#',
                      directed=True)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes.
    # random.sample() only accepts sequences (other collections raise a
    # TypeError on Python >= 3.11), so the node view is turned into a list.
    # The sample size is capped so that small graphs do not raise ValueError.
    nodes = list(G.nodes)
    sampled = random.sample(nodes, min(subgraph_size, len(nodes)))
    SG = G.subgraph(sampled).copy()

    # Preprocess the graph (the returned node id mapping is not needed here)
    SG, _ = pp.prep_graph(SG, relabel=True, del_self_loops=True)

    # Store the preprocessed sub-graph to a file
    pp.save_graph(SG, output_path + "prep_graph.edgelist", delimiter=",")

    # Alternatively, train/test splits can be computed one at a time
    train_E, test_E = stt.split_train_test(SG, train_frac=0.51, seed=99)

    print(train_E)

    # Compute set of false edges
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG,
        train_E=train_E,
        test_E=test_E,
        num_fe_train=None,
        num_fe_test=None,
        seed=99)
Esempio n. 3
0
def preprocess(setup, i):
    """
    Load, preprocess and optionally store the i-th network of `setup`.

    Parameters
    ----------
    setup : object
        Settings holder. The attributes read here are `verbose`, `inpaths`,
        `separators`, `comments`, `directed`, `relabel`, `del_selfloops`,
        `prep_nw_name`, `outpaths`, `delimiter` and `write_stats`.
    i : int
        Position of the network inside the per-network settings of `setup`
        (`inpaths`, `separators`, `comments`, `directed`, `outpaths`).

    Returns
    -------
    G : graph
        The preprocessed graph as returned by `pp.prep_graph`.
    """
    if setup.verbose:
        print('Preprocesing graph...')

    # Read the raw network using the settings of the i-th input
    raw_graph = pp.load_graph(setup.inpaths[i],
                              delimiter=setup.separators[i],
                              comments=setup.comments[i],
                              directed=setup.directed[i])

    # Preprocess it; the second returned value (node ids) is not used here
    G, _ = pp.prep_graph(raw_graph,
                         relabel=setup.relabel,
                         del_self_loops=setup.del_selfloops)

    # Nothing to store when no output name has been configured
    if setup.prep_nw_name is None:
        return G

    # The output file is the i-th output path directly followed by the name
    target = setup.outpaths[i] + setup.prep_nw_name
    pp.save_graph(G,
                  output_path=target,
                  delimiter=setup.delimiter,
                  write_stats=setup.write_stats)
    return G
Esempio n. 4
0
    def save_tr_graph(self,
                      output_path,
                      delimiter,
                      write_stats=False,
                      write_weights=False,
                      write_dir=True):
        """
        Store the train graph (`self._TG`) to a file through `pp.save_graph`.

        Parameters
        ----------
        output_path : file or string
            File or filename to write. If a file is provided, it must be opened in 'wb' mode.
        delimiter : string
            The string used to separate values. This argument is required.
        write_stats : bool, optional
            Sets if graph statistics should be added to the edgelist or not. Default is False.
        write_weights : bool, optional
            If True, data will be stored as a weighted edgelist (e.g. triplets src, dst, weight), otherwise as a
            normal edgelist. If the graph edges have no weight attribute and this parameter is set to True,
            a weight of 1 will be assigned to each edge. Default is False.
        write_dir : bool, optional
            This option is only relevant for undirected graphs. If False, the train graph will be stored with a
            single direction of the edges. If True, both directions of edges will be stored. Default is True.
        """
        # Every option is forwarded unchanged, by keyword, to pp.save_graph
        save_options = {
            'output_path': output_path,
            'delimiter': delimiter,
            'write_stats': write_stats,
            'write_weights': write_weights,
            'write_dir': write_dir,
        }
        pp.save_graph(self._TG, **save_options)
Esempio n. 5
0
def test():
    """
    Round-trip a graph through save/load and preprocessing, printing stats.

    Loads "./data/network.edgelist" as a directed graph, stores it, loads the
    stored copy again, preprocesses that copy and stores the result. Graph
    statistics are requested via `pp.get_stats` after each stage so they can
    be compared by eye.
    """
    # Settings
    data_dir = "./data/"
    out_dir = "./data/"
    network_file = "network.edgelist"
    rule = "-----------------------------------------"
    saved_copy = out_dir + "orig_graph.edgelist"

    # Load the original graph and report on it
    G = pp.load_graph(data_dir + network_file,
                      delimiter=',',
                      comments='#',
                      directed=True)
    print("", "Original graph stats:", rule, sep="\n")
    pp.get_stats(G)

    # Write the graph out and read the written copy back in
    pp.save_graph(G, saved_copy, delimiter=",")
    G2 = pp.load_graph(saved_copy,
                       delimiter=",",
                       comments='#',
                       directed=True)

    # The reloaded copy is expected to show the same stats as the original
    print("Has the same stats after being loaded?:", rule, sep="\n")
    pp.get_stats(G2)

    # Preprocess the reloaded graph, keeping self loops and relabeling nodes
    GP, ids = pp.prep_graph(G2, del_self_loops=False, relabel=True)
    print("Preprocessed graph stats (restricted to main cc):", rule, sep="\n")
    pp.get_stats(GP)

    pp.save_graph(GP, out_dir + "prep_graph.edgelist", delimiter=",")

    # Show the first entries of the id mapping returned by prep_graph
    print("Sample of 10 (oldNodeID, newNodeID):", rule, sep="\n")
    print(ids[:10])

    pp.get_redges_false(GP, out_dir + "redges_false.csv")
Esempio n. 6
0
    def save_tr_graph(self, output_path, delimiter, write_stats=False):
        """
        Store the train graph (`self._TG`) to a file through `pp.save_graph`.

        Parameters
        ----------
        output_path : file or string
            File or filename to write. If a file is provided, it must be opened in 'wb' mode.
        delimiter : string
            The string used to separate values. This argument is required.
        write_stats : bool, optional
            Sets if graph statistics should be added to the edgelist or not. Default is False.
        """
        # The arguments are forwarded unchanged, by keyword, to pp.save_graph
        train_graph = self._TG
        pp.save_graph(train_graph, output_path=output_path, delimiter=delimiter, write_stats=write_stats)
Esempio n. 7
0
def preprocess(inpath, outpath, delimiter, directed):
    """
    Load a graph, preprocess it and store the result next to `outpath`.

    Parameters
    ----------
    inpath : string
        Location of the input network, passed to `pp.load_graph`.
    outpath : string
        Prefix of the output file. "prep_graph.edgelist" is appended to it
        verbatim, so a directory must end with a path separator.
    delimiter : string
        Value separator of the input file. The output is always written with
        ',' as separator.
    directed : bool
        Passed to `pp.load_graph` as its `directed` option.

    Returns
    -------
    G : graph
        The preprocessed graph as returned by `pp.prep_graph`.
    """
    print('Preprocessing graph...')

    # Read the input network; lines starting with '#' are treated as comments
    loaded = pp.load_graph(inpath,
                           delimiter=delimiter,
                           comments='#',
                           directed=directed)

    # Relabel nodes and drop self loops; the returned node ids are not needed
    G, _ = pp.prep_graph(loaded, relabel=True, del_self_loops=True)

    # Write the preprocessed network together with its statistics
    target = outpath + "prep_graph.edgelist"
    pp.save_graph(G,
                  output_path=target,
                  delimiter=',',
                  write_stats=True)

    return G
Esempio n. 8
0
# NOTE(review): this fragment relies on `G`, `output_path` and `directed`,
# which are defined outside the visible part of the script.

# Get some graph statistics (no output file is given in this call)
pp.get_stats(G)

# Or store them to a file ("stats.txt" is appended directly to output_path)
pp.get_stats(G, output_path + "stats.txt")

# Preprocess the graph: relabel the nodes and delete self loops
SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True)

# Get non-edges so that the reversed edge exists in the graph.
# Only done for directed graphs; the result is also written to "redges.csv".
# NOTE(review): `redges` is only bound when `directed` is true and is not
# used again in the visible part of the script.
if directed:
    redges = pp.get_redges_false(SG, output_path=output_path + "redges.csv")

# Store the preprocessed graph to a comma separated file, with statistics
pp.save_graph(SG,
              output_path=output_path + "network_prep.edgelist",
              delimiter=',',
              write_stats=True)

# ----------------
# Split train test
# ----------------

# Compute train/test splits and false edges in parallel.
# Five splits are requested with a train fraction of 0.51; the second
# argument is the output location derived from output_path.
stt.compute_splits_parallel(SG,
                            output_path +
                            "lp_train_test_splits/network_prep_51",
                            owa=True,
                            train_frac=0.51,
                            num_fe_train=None,
                            num_fe_test=None,
                            num_splits=5)