Ejemplo n.º 1
0
    def save_tr_graph(self,
                      output_path,
                      delimiter,
                      write_stats=False,
                      write_weights=False,
                      write_dir=True):
        """
        Write the train graph stored in `self._TG` to a file.

        Thin wrapper: all arguments are forwarded unchanged, by keyword, to `pp.save_graph`.

        Parameters
        ----------
        output_path : file or string
            File or filename to write. If a file is provided, it must be opened in 'wb' mode.
        delimiter : string
            The string used to separate values. This argument is required; the method defines no default for it.
        write_stats : bool, optional
            Sets if graph statistics should be added to the edgelist or not. Default is False.
        write_weights : bool, optional
            If True data will be stored as weighted edgelist (e.g. triplets src, dst, weight) otherwise as normal
            edgelist. If the graph edges have no weight attribute and this parameter is set to True,
            a weight of 1 will be assigned to each edge. Default is False.
        write_dir : bool, optional
            This option is only relevant for undirected graphs. If False, the train graph will be stored with a single
            direction of the edges. If True, both directions of edges will be stored. Default is True.
        """
        # Collect the options once so the call itself stays on a single line
        save_options = {
            'output_path': output_path,
            'delimiter': delimiter,
            'write_stats': write_stats,
            'write_weights': write_weights,
            'write_dir': write_dir,
        }
        pp.save_graph(self._TG, **save_options)
Ejemplo n.º 2
0
def main():
    """
    Preprocess the Facebook and WebSpam graphs and store each one next to its input file.

    Expects exactly two command line arguments: the Facebook dataset path and the WebSpam dataset path.
    Each dataset is preprocessed (`prep_fb` / `prep_ws`) and written with `pp.save_graph`
    (comma delimited, `write_stats=True`) as 'prep_graph_slfloops.edgelist' in the directory of the
    corresponding input file.

    Raises
    ------
    SystemExit
        With exit code -1 if the number of command line arguments is wrong.
    """
    # Check cmd args
    if len(argv) != 3:
        print("ERROR: wrong number of parameters")
        print("Usage: prep_data_prune.py <facebook_path> <webspam_path>")
        # Same effect as exit(-1), but does not rely on the builtin injected by the `site` module
        raise SystemExit(-1)

    out_name = "prep_graph_slfloops.edgelist"

    # Only the directory of each input is needed; the output is stored alongside the input
    fb_dir = os.path.dirname(argv[1])
    ws_dir = os.path.dirname(argv[2])

    # NOTE(review): both outputs use the same file name, so if the two inputs live in the same
    # directory the WS output overwrites the FB output -- confirm this is intended.

    # Preprocess FB graph
    G1 = prep_fb(argv[1])

    # Store FB graph to a file. os.path.join keeps the path relative when the input has no
    # directory component (plain concatenation with "/" would point at the filesystem root).
    pp.save_graph(G1,
                  output_path=os.path.join(fb_dir, out_name),
                  delimiter=',',
                  write_stats=True)

    # Preprocess WS graph
    G2 = prep_ws(argv[2])

    # Store preprocessed graph to a file
    pp.save_graph(G2,
                  output_path=os.path.join(ws_dir, out_name),
                  delimiter=',',
                  write_stats=True)

    print("Preprocessing finished.")
Ejemplo n.º 3
0
def preprocess(setup, nw_outpath, i):
    """
    Load, preprocess and optionally store the i-th network described by `setup`.

    Parameters
    ----------
    setup : object
        Configuration holder. The attributes read here are: task, inpaths, separators, comments, directed,
        split_alg, relabel, del_selfloops, save_prep_nw, delimiter and write_stats.
    nw_outpath : string
        Directory where 'prep_nw.edgelist' is written when `setup.save_prep_nw` is truthy.
    i : int
        Index of the network within `setup.inpaths`, `setup.separators` and `setup.comments`.

    Returns
    -------
    G, ids
        The two values returned by `pp.prep_graph`.
    """
    print('Preprocessing graph...')

    # The 'sp' task loads the data as int, every other task as float
    edge_dtype = int if setup.task == 'sp' else float
    G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i], comments=setup.comments[i],
                      directed=setup.directed, datatype=edge_dtype)

    # Only 'lp' with the 'random' split algorithm passes maincc=False explicitly;
    # in every other case the default of pp.prep_graph is used.
    random_lp_split = setup.task == 'lp' and setup.split_alg == 'random'
    prep_options = {'relabel': setup.relabel, 'del_self_loops': setup.del_selfloops}
    if random_lp_split:
        prep_options['maincc'] = False
    G, ids = pp.prep_graph(G, **prep_options)

    # Nothing to store: hand back the preprocessed graph right away
    if not setup.save_prep_nw:
        return G, ids

    prep_file = os.path.join(nw_outpath, 'prep_nw.edgelist')
    pp.save_graph(G, output_path=prep_file, delimiter=setup.delimiter,
                  write_stats=setup.write_stats, write_weights=False, write_dir=True)
    return G, ids
Ejemplo n.º 4
0
def preprocess(inpath, outpath, delimiter, directed, relabel, del_self_loops):
    """
    Load a graph, preprocess it, store the result and return it.

    Parameters
    ----------
    inpath : string
        Path of the edgelist to load. Lines starting with '#' are passed to the loader as comments.
    outpath : string
        Prefix of the output file. It is concatenated as-is with 'prep_graph.edgelist', so a directory
        must include its trailing path separator.
    delimiter : string
        Delimiter used when reading the input file (the output is always written with ' ').
    directed : bool
        Forwarded to `pp.load_graph`.
    relabel : bool
        Forwarded to `pp.prep_graph`.
    del_self_loops : bool
        Forwarded to `pp.prep_graph`.

    Returns
    -------
    The preprocessed graph (first value returned by `pp.prep_graph`).
    """
    print('Preprocessing graph...')

    raw_graph = pp.load_graph(inpath, delimiter=delimiter, comments='#', directed=directed)

    # The second value returned by prep_graph is not needed by this routine
    prepped, _ = pp.prep_graph(raw_graph, relabel=relabel, del_self_loops=del_self_loops)

    edgelist_file = outpath + "prep_graph.edgelist"
    pp.save_graph(prepped, output_path=edgelist_file, delimiter=' ', write_stats=False)

    return prepped
Ejemplo n.º 5
0
def test():
    """
    Manual smoke test of the preprocessing utilities on './data/network.edgelist'.

    Loads the network, saves it and loads it again, preprocesses the reloaded graph and stores the result.
    Statistics are reported through `pp.get_stats` after each stage, the first ten entries of the ids
    returned by `pp.prep_graph` are printed, and finally `pp.get_redges_false` is called with
    './data/redges_false.csv'.
    """
    data_dir = "./data/"
    out_dir = "./data/"
    network_file = "network.edgelist"
    rule = "-----------------------------------------"

    def heading(title):
        # Section title followed by a separator line
        print(title)
        print(rule)

    # The same file is written and then read back, so build its path once
    roundtrip_file = out_dir + "orig_graph.edgelist"

    original = pp.load_graph(data_dir + network_file, delimiter=',', comments='#', directed=True)

    print("")
    heading("Original graph stats:")
    pp.get_stats(original)

    # Round trip: write the graph and load it again
    pp.save_graph(original, roundtrip_file, delimiter=",")
    reloaded = pp.load_graph(roundtrip_file, delimiter=",", comments='#', directed=True)

    heading("Has the same stats after being loaded?:")
    pp.get_stats(reloaded)

    prepped, ids = pp.prep_graph(reloaded, del_self_loops=False, relabel=True)

    heading("Preprocessed graph stats (restricted to main cc):")
    pp.get_stats(prepped)

    pp.save_graph(prepped, out_dir + "prep_graph.edgelist", delimiter=",")

    heading("Sample of 10 (oldNodeID, newNodeID):")
    print(ids[0:10])

    pp.get_redges_false(prepped, out_dir + "redges_false.csv")
Ejemplo n.º 6
0
    def save_tr_graph(self,
                      output_path,
                      delimiter,
                      write_stats=False,
                      write_weights=False,
                      write_dir=True):
        """
        Store the TG graph (`self._TG`) in a file.

        Thin wrapper: all arguments are forwarded unchanged, by keyword, to `pp.save_graph`.

        Parameters
        ----------
        output_path : file or string
            File or filename to write. If a file is provided, it must be opened in 'wb' mode.
        delimiter : string
            The string used to separate values. This argument is required; the method defines no default for it.
        write_stats : bool, optional
            Adds basic graph statistics to the file as a header or not. Default is False.
        write_weights : bool, optional
            If True data will be stored as weighted edgelist i.e. triplets (src, dst, weight), otherwise, as regular
            (src, dst) pairs. For unweighted graphs, setting this parameter to True will add weight 1 to all edges.
            Default is False.
        write_dir : bool, optional
            This parameter is only relevant for undirected graphs. If True, it forces the method to write both edge
            directions in the file i.e. (src, dst) and (dst, src). If False, only one direction is stored.
            Default is True.

        See also
        --------
        evalne.utils.preprocess.save_graph
        """
        train_graph = self._TG
        pp.save_graph(train_graph, output_path=output_path, delimiter=delimiter,
                      write_stats=write_stats, write_weights=write_weights, write_dir=write_dir)
Ejemplo n.º 7
0
def _report_split(tag, train_edges, test_edges, directed):
    """Print component, edge and node counts for the train graph built from `train_edges`."""
    train_graph = nx.DiGraph() if directed else nx.Graph()
    train_graph.add_edges_from(train_edges)
    if directed:
        print("Number of weakly CCs with {}: {}".format(
            tag, nx.number_weakly_connected_components(train_graph)))
    else:
        print("Number of CCs with {}: {}".format(
            tag, nx.number_connected_components(train_graph)))
    print("Number train edges {}: {}".format(tag, len(train_edges)))
    print("Number test edges {}: {}".format(tag, len(test_edges)))
    print("Number of nodes in train graph: {}".format(len(train_graph.nodes)))


def test_split():
    """
    Manual comparison of `stt.split_train_test` and `stt.rand_split_train_test`.

    Loads './data/network.edgelist', restricts it to a random sub-graph of 400 nodes, stores the
    preprocessed sub-graph and then, for each split method, prints the execution time, the number of
    (weakly) connected components of the resulting train graph and the train/test edge counts.

    Raises
    ------
    ValueError
        From `random.sample` if the loaded graph has fewer than 400 nodes.
    """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes.
    # random.sample only accepts sequences on Python >= 3.11, so the nodes are materialised as a
    # list first (G.nodes is presumably a networkx node view, which is not a sequence).
    SG = G.subgraph(random.sample(list(G.nodes), subgraph_size)).copy()

    # Preprocess the graph, this time with maincc=True
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    elapsed = time.time() - start
    print("Exec time stt: {}".format(elapsed))

    # Report how many connected components the stt train graph has
    _report_split("stt", train_stt, test_stt, directed)

    # Preprocess the graph again, now with maincc=False, for the random split
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=False)

    # Compute train/test splits
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG,
                                                      train_frac=train_frac)
    elapsed = time.time() - start
    print("\nExec time rand_stt: {}".format(elapsed))

    # Report how many connected components the rstt train graph has
    _report_split("rstt", train_rstt, test_rstt, directed)
Ejemplo n.º 8
0
# NOTE: this fragment relies on names bound earlier in the script and not shown here:
# `G` (the loaded graph), `output_path`, `traintest_path` and `directed`.

# Get some graph statistics (single-argument form, no output file given)
pp.get_stats(G)

# Or store them to a file
pp.get_stats(G, os.path.join(output_path, "stats.txt"))

# Preprocess the graph; `ids` is not used again in this fragment
SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True)

# Get non-edges so that the reversed edge exists in the graph.
# Only done for directed graphs, so `redges` stays unbound when `directed` is falsy.
if directed:
    redges = pp.get_redges_false(SG, output_path=os.path.join(output_path, "redges.csv"))

# Store the preprocessed graph to a comma delimited file (write_stats=True)
pp.save_graph(SG, output_path=os.path.join(output_path, "network_prep.edgelist"), delimiter=',', write_stats=True)

# ----------------
# Split train test
# ----------------

# Compute train/test splits and false edges in parallel.
# Five splits are requested with train_frac=0.51; the "_51" suffix of the output name presumably
# mirrors that fraction -- keep both in sync if either changes.
stt.compute_splits_parallel(SG, os.path.join(traintest_path, "network_prep_51"), owa=True,
                            train_frac=0.51, num_fe_train=None, num_fe_test=None, num_splits=5)

# The overlap between the 5 generated sets can be easily checked
# (num_sets matches the num_splits=5 requested above; "trE" / "teE" select the train / test sets)
print("Overlap check for train sets: ")
stt.check_overlap(filename=os.path.join(traintest_path, "network_prep_51", "trE"), num_sets=5)
print("Overlap check for test sets: ")
stt.check_overlap(filename=os.path.join(traintest_path, "network_prep_51", "teE"), num_sets=5)