Example #1
def preprocess(setup, nw_outpath, i):
    """
    Graph preprocessing routine.
    """
    print('Preprocessing graph...')

    # Load a graph
    if setup.task == 'sp':
        G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i], comments=setup.comments[i],
                          directed=setup.directed, datatype=int)
    else:
        G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i], comments=setup.comments[i],
                          directed=setup.directed, datatype=float)

    # Preprocess the graph
    if setup.task == 'lp' and setup.split_alg == 'random':
        G, ids = pp.prep_graph(G, relabel=setup.relabel, del_self_loops=setup.del_selfloops, maincc=False)
    else:
        G, ids = pp.prep_graph(G, relabel=setup.relabel, del_self_loops=setup.del_selfloops)

    # Save preprocessed graph to a file
    if setup.save_prep_nw:
        pp.save_graph(G, output_path=os.path.join(nw_outpath, 'prep_nw.edgelist'), delimiter=setup.delimiter,
                      write_stats=setup.write_stats, write_weights=False, write_dir=True)

    # Return the preprocessed graph
    return G, ids
Example #2
def prep_ws(inpath):
    """
    Preprocess web spam graph.
    """
    # Create an empty digraph
    G = nx.DiGraph()

    # Read the file and create the graph. The first line (src == 0) is skipped; every
    # subsequent line lists the out-neighbours of node 'src' as 'dst:weight' tokens.
    src = 0
    with open(inpath, 'r') as f:
        for line in f:
            if src != 0:
                arr = line.split()
                for dst in arr:
                    dst_id = int(dst.split(':')[0])
                    # We consider the graph unweighted, so the weight part of the token is ignored
                    G.add_edge(src, dst_id)
            src += 1
    # G.add_node(src-2)

    # Preprocess the graph
    G, ids = pp.prep_graph(G, relabel=True, del_self_loops=False)

    # Return the preprocessed graph
    return G
Example #3
def prep_fb(inpath):
    """
    Preprocess facebook wall post graph.
    """
    # Load a graph
    G = pp.load_graph(inpath, delimiter='\t', comments='#', directed=True)

    # The FB graph is stored as (destination, origin), so it needs to be reversed
    G = G.reverse()

    # Preprocess the graph
    G, ids = pp.prep_graph(G, relabel=True, del_self_loops=False)

    # Return the preprocessed graph
    return G
Example #4
def test_split():
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"

    # Load a graph
    SG = pp.load_graph(dataset_path + test_name,
                       delimiter=",",
                       comments='#',
                       directed=False)

    # Preprocess the graph
    SG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True)
    print("Number of CCs input: {}".format(nx.number_connected_components(SG)))

    # Store the edges in the graphs as a set E
    E = set(SG.edges())

    # Use LERW approach to get the ST
    start = time.time()
    train_lerw = stt.wilson_alg(SG, E)
    end1 = time.time() - start

    # Use BRO approach to get the ST
    start = time.time()
    train_bro = stt.broder_alg(SG, E)
    end2 = time.time() - start

    print("LERW time: {}".format(end1))
    print("Bro time: {}".format(end2))

    print("Num tr_e lerw: {}".format(len(train_lerw)))
    print("Num tr_e bro: {}".format(len(train_bro)))

    print("All tr_e in E for lerw?: {}".format(train_lerw - E))
    print("All tr_e in E for bro?: {}".format(train_bro - E))

    # Check that the graph generated with lerw has indeed one single cc
    TG_lerw = nx.Graph()
    TG_lerw.add_edges_from(train_lerw)
    print("Number of CCs with lerw: {}".format(
        nx.number_connected_components(TG_lerw)))

    # Check that the graph generated with broder algorithm has indeed one single cc
    TG_bro = nx.Graph()
    TG_bro.add_edges_from(train_bro)
    print("Number of CCs with lerw: {}".format(
        nx.number_connected_components(TG_bro)))
Example #5
def preprocess(inpath, outpath, delimiter, directed, relabel, del_self_loops):
    """
    Graph preprocessing routine.
    """
    print('Preprocessing graph...')

    # Load a graph
    G = pp.load_graph(inpath, delimiter=delimiter, comments='#', directed=directed)

    # Preprocess the graph
    G, ids = pp.prep_graph(G, relabel=relabel, del_self_loops=del_self_loops)

    # Store preprocessed graph to a file
    pp.save_graph(G, output_path=outpath + "prep_graph.edgelist", delimiter=' ', write_stats=False)

    # Return the preprocessed graph
    return G
Example #6
def test():
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=',',
                      comments='#',
                      directed=True)

    # Print some stats
    print("")
    print("Original graph stats:")
    print("-----------------------------------------")
    pp.get_stats(G)

    # Save the graph
    pp.save_graph(G, output_path + "orig_graph.edgelist", delimiter=",")

    # Load the saved graph
    G2 = pp.load_graph(output_path + "orig_graph.edgelist",
                       delimiter=",",
                       comments='#',
                       directed=True)

    # Stats comparison
    print("Has the same stats after being loaded?:")
    print("-----------------------------------------")
    pp.get_stats(G2)

    # Preprocess the graph
    GP, ids = pp.prep_graph(G2, del_self_loops=False, relabel=True)

    print("Preprocessed graph stats (restricted to main cc):")
    print("-----------------------------------------")
    pp.get_stats(GP)

    pp.save_graph(GP, output_path + "prep_graph.edgelist", delimiter=",")

    print("Sample of 10 (oldNodeID, newNodeID):")
    print("-----------------------------------------")
    print(ids[0:10])

    pp.get_redges_false(GP, output_path + "redges_false.csv")
Example #7
def run_test():

    random.seed(42)
    np.random.seed(42)

    # Set some variables
    filename = "./data/network.edgelist"
    directed = False

    # Load the test graph
    G = pp.load_graph(filename, delimiter=",", comments='#', directed=directed)
    G, ids = pp.prep_graph(G)

    # Print some stats about the graph
    pp.get_stats(G)

    # Generate one train/test split with 90% of the edges in the train set
    start = time()
    traintest_split = split.EvalSplit()
    traintest_split.compute_splits(G, train_frac=0.9)
    end = time() - start
    print("\nSplits computed in {} sec".format(end))

    # Create an evaluator
    nee = evaluator.LPEvaluator(traintest_split)

    # Test baselines
    start = time()
    test_baselines(nee, directed)
    end = time() - start
    print("\nBaselines computed in {} sec".format(end))

    # Test Katz
    start = time()
    test_katz(nee)
    end = time() - start
    print("\nKatz computed in {} sec".format(end))
Example #8
# Author: Mara Alexandru Cristian
# Contact: [email protected]
# Date: 18/12/2018

# This simple example is the one presented in the README.md file.
# Network reconstruction and sign prediction can be computed in the same manner by simply substituting LPEvaluator and
# LPEvalSplit by NREvaluator and NREvalSplit or SPEvaluator and SPEvalSplit.

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.score import Scoresheet
from evalne.evaluation.split import LPEvalSplit
from evalne.utils import preprocess as pp

# Load and preprocess the network
G = pp.load_graph('../../evalne/tests/data/network.edgelist')
G, _ = pp.prep_graph(G)

# Create an evaluator and generate train/test edge split
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G)
nee = LPEvaluator(traintest_split)

# Create a Scoresheet to store the results
scoresheet = Scoresheet()

# Set the baselines
methods = ['random_prediction', 'common_neighbours', 'jaccard_coefficient']

# Evaluate baselines and log each result in the Scoresheet
for method in methods:
    result = nee.evaluate_baseline(method=method)
    scoresheet.log_results(result)
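
# ---------------------------------------------------------------------------------------------
# A minimal sketch of the substitution described above, assuming the NR classes follow the same
# interface as their LP counterparts. The calls below mirror the LP example and are illustrative,
# not a verbatim EvalNE snippet.
# ---------------------------------------------------------------------------------------------
from evalne.evaluation.evaluator import NREvaluator
from evalne.evaluation.split import NREvalSplit

# Reuse the preprocessed graph G from above and compute a network reconstruction split
nr_split = NREvalSplit()
nr_split.compute_splits(G)
nee_nr = NREvaluator(nr_split)

# Evaluate the same baselines and log the results in the existing Scoresheet
for method in methods:
    result = nee_nr.evaluate_baseline(method=method)
    scoresheet.log_results(result)

# Print the accumulated results as a table
scoresheet.print_tabular()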
Example #9
def test_stt():
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"
    frac = 0.5

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=False)

    # Preprocess the graph for stt alg.
    SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True, maincc=True)

    # Split train/test using stt
    start = time.time()
    train_E, test_E = stt.split_train_test(SG, train_frac=frac)
    end1 = time.time() - start

    # Compute the false edges
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG,
        train_E=train_E,
        test_E=test_E,
        num_fe_train=None,
        num_fe_test=None)
    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "stt_frac_" + str(frac),
                                    train_E=train_E,
                                    train_E_false=train_E_false,
                                    test_E=test_E,
                                    test_E_false=test_E_false,
                                    split_id=0)

    # Split train/test using rstt
    start = time.time()
    tr_E, te_E = stt.rand_split_train_test(G, train_frac=frac)
    end2 = time.time() - start

    train_E, test_E, J, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())

    print("Number of nodes in G: {}".format(len(G.nodes())))
    print("Number of nodes in J: {}".format(len(J.nodes())))
    print("Are nodes in J sequential integers? {}".format(
        not len(set(J.nodes()) - set(range(len(J.nodes()))))))

    checks = list()
    queries = 200
    # Check if the mapping is correct
    for i in range(queries):
        ag = tr_E.pop()  # a random element from train
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in train_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))

    print(
        "For train edges out of {} samples, {} were in the relabeled train_E".
        format(queries, sum(checks)))

    checks = list()
    # Check if the mapping is correct
    for i in range(queries):
        ag = te_E.pop()  # a random element from test
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in test_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))

    print("For test edges out of {} samples, {} were in the relabeled test_E".
          format(queries, sum(checks)))

    # Compute the false edges
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        J, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)
    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "rstt_frac_" + str(frac),
                                    train_E=train_E,
                                    train_E_false=train_E_false,
                                    test_E=test_E,
                                    test_E_false=test_E_false,
                                    split_id=0)
Example #10
def test_split():
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes
    SG = G.subgraph(random.sample(list(G.nodes()), subgraph_size)).copy()

    # Preprocess the graph
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    end = time.time() - start
    print("Exec time stt: {}".format(end))

    # Check that the train graph generated with stt has one single cc
    if directed:
        TG_stt = nx.DiGraph()
        TG_stt.add_edges_from(train_stt)
        print("Number of weakly CCs with stt: {}".format(
            nx.number_weakly_connected_components(TG_stt)))
    else:
        TG_stt = nx.Graph()
        TG_stt.add_edges_from(train_stt)
        print("Number of CCs with stt: {}".format(
            nx.number_connected_components(TG_stt)))
    print("Number train edges stt: {}".format(len(train_stt)))
    print("Number test edges stt: {}".format(len(test_stt)))
    print("Number of nodes in train graph: {}".format(len(TG_stt.nodes)))

    # Preprocess the graph
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=False)

    # Compute train/test splits
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG,
                                                      train_frac=train_frac)
    end = time.time() - start
    print("\nExec time rand_stt: {}".format(end))

    # Check that the train graph generated with rstt has one single cc
    if directed:
        TG_rstt = nx.DiGraph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of weakly CCs with rstt: {}".format(
            nx.number_weakly_connected_components(TG_rstt)))
    else:
        TG_rstt = nx.Graph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of CCs with rstt: {}".format(
            nx.number_connected_components(TG_rstt)))
    print("Number train edges rstt: {}".format(len(train_rstt)))
    print("Number test edges rstt: {}".format(len(test_rstt)))
    print("Number of nodes in train graph: {}".format(len(TG_rstt.nodes)))
Example #11
# ---------------
# Preprocess data
# ---------------

# Load the data as a directed graph
G = pp.load_graph(dataset_path, delimiter=",", comments='#', directed=directed)

# Get some graph statistics
pp.get_stats(G)

# Or store them to a file
pp.get_stats(G, os.path.join(output_path, "stats.txt"))

# Preprocess the graph
SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True)

# For directed graphs, get non-edges (u, v) such that the reversed edge (v, u) exists in the graph
if directed:
    redges = pp.get_redges_false(SG, output_path=os.path.join(output_path, "redges.csv"))

# Store the graph to a file
pp.save_graph(SG, output_path=os.path.join(output_path, "network_prep.edgelist"), delimiter=',', write_stats=True)

# ----------------
# Split train test
# ----------------

# Compute train/test splits and false edges in parallel
stt.compute_splits_parallel(SG, os.path.join(traintest_path, "network_prep_51"), owa=True,
                            train_frac=0.51, num_fe_train=None, num_fe_test=None, num_splits=5)
Example #12
def timestamp_split(G, train_frac=0.51):
    """
    Splits the edges of the input graph into train and test sets and returns the results. The split is performed using
    edge timestamps (see Notes). The resulting train edge set spans a graph (digraph) with a single connected
    (weakly connected) component.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph where edge weights are timestamps.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.

    Returns
    -------
    train_E : ndarray
        Array of train edges as rows of (src, dst) pairs.
    test_E : ndarray
        Array of test edges as rows of (src, dst) pairs.
    tg : graph
        A NetworkX graph containing only the edges in the train edge set.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    Notes
    -----
    The method proceeds as follows: (1) sort all edges by timestamp. (2) remove the most recent (1 - train_frac)
    fraction of edges from the input graph. (3) from the remaining edges compute the main connected component; its
    edges become the train edges. (4) from the set of removed edges, those whose both end nodes exist in the train
    graph computed in the previous step are added to the final test set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        # Degenerate case: every edge is a train edge and the train graph is the input graph itself
        return set(G.edges()), set(), G

    # Get Adj matrix
    if nx.is_directed(G):
        a = nx.adj_matrix(G)
    else:
        a = triu(nx.adj_matrix(G), k=1)

    # Argsort data and compute the idx where we split train from test
    ordered = np.argsort(a.data)
    split_idx = int(len(ordered) * train_frac) - 1

    # Build a mask that is True for the test edges (those with the most recent timestamps)
    mask_te = np.zeros(len(ordered), dtype=bool)
    mask_te[ordered[split_idx + 1:]] = True

    # Get all edges as the non-zero entries of the adjacency matrix
    nz = a.nonzero()

    # Use the mask to select train and test edges from nz
    # There will be no overlap between tr and te because nz contains only unique pairs
    tr_e = np.array((nz[0][~mask_te], nz[1][~mask_te])).T
    te_e = np.array((nz[0][mask_te], nz[1][mask_te])).T

    # Taking the most recent edges for testing can cause train to be disconnected, so restrict it to its main cc
    tg = nx.DiGraph() if nx.is_directed(G) else nx.Graph()
    tg.add_edges_from(tr_e)
    tg, ids = pp.prep_graph(tg, relabel=True, del_self_loops=True, maincc=True)
    tr_e = np.array(tg.edges)

    d = dict(ids)
    te_e = set(zip(te_e[:, 0], te_e[:, 1]))
    nte_e = map(lambda x: (d.get(x[0], -1), d.get(x[1], -1)), te_e)
    te_e = np.array(list(nte_e))

    # We now only keep the test edges between nodes in tg
    # Remove nodes that are in test but not train
    newn = np.setdiff1d(np.unique(te_e), np.unique(tr_e))
    mask = np.isin(te_e, newn).sum(axis=1).astype(bool)
    te_e = te_e[~mask, :]

    # Return the sets of edges
    return tr_e, te_e, tg
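
# ---------------------------------------------------------------------------------------------
# A minimal usage sketch for the timestamp_split function above. The toy graph and its timestamp
# weights are illustrative assumptions, not part of the original example.
# ---------------------------------------------------------------------------------------------
import networkx as nx


def example_timestamp_split():
    # Build a small connected graph whose edge weights encode timestamps (larger = more recent)
    G = nx.Graph()
    G.add_weighted_edges_from([(0, 1, 10), (1, 2, 11), (2, 3, 12),
                               (3, 0, 13), (0, 2, 14), (1, 3, 15)])

    # Keep roughly the oldest 51% of the edges for training; the most recent ones go to test
    train_E, test_E, tg = timestamp_split(G, train_frac=0.51)

    print("Train edges:\n{}".format(train_E))
    print("Test edges:\n{}".format(test_E))
    print("Nodes in the train graph: {}".format(len(tg.nodes)))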