def preprocess(setup, nw_outpath, i):
    """ Graph preprocessing routine. """
    print('Preprocessing graph...')

    # Sign prediction ('sp') reads integer edge labels; every other task
    # loads edge weights as floats.
    dtype = int if setup.task == 'sp' else float
    G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i],
                      comments=setup.comments[i], directed=setup.directed,
                      datatype=dtype)

    # For link prediction with random splits the graph is NOT restricted
    # to its main connected component; otherwise the library default applies.
    if setup.task == 'lp' and setup.split_alg == 'random':
        G, ids = pp.prep_graph(G, relabel=setup.relabel,
                               del_self_loops=setup.del_selfloops, maincc=False)
    else:
        G, ids = pp.prep_graph(G, relabel=setup.relabel,
                               del_self_loops=setup.del_selfloops)

    # Optionally persist the preprocessed network for later inspection.
    if setup.save_prep_nw:
        pp.save_graph(G, output_path=os.path.join(nw_outpath, 'prep_nw.edgelist'),
                      delimiter=setup.delimiter, write_stats=setup.write_stats,
                      write_weights=False, write_dir=True)

    # Hand back the cleaned graph and the old->new node id mapping.
    return G, ids
def test():
    """ Round-trip load/save/preprocess sanity check on a small network. """
    # Paths and input file name
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"

    # Read the original network and report its statistics
    G = pp.load_graph(dataset_path + test_name, delimiter=',', comments='#',
                      directed=True)
    print("")
    print("Original graph stats:")
    print("-----------------------------------------")
    pp.get_stats(G)

    # Write the graph back to disk, then reload it
    pp.save_graph(G, output_path + "orig_graph.edgelist", delimiter=",")
    G2 = pp.load_graph(output_path + "orig_graph.edgelist", delimiter=",",
                       comments='#', directed=True)

    # The reloaded graph should show identical statistics
    print("Has the same stats after being loaded?:")
    print("-----------------------------------------")
    pp.get_stats(G2)

    # Preprocess (relabel nodes, keep self-loops) and report again
    GP, ids = pp.prep_graph(G2, del_self_loops=False, relabel=True)
    print("Preprocessed graph stats (restricted to main cc):")
    print("-----------------------------------------")
    pp.get_stats(GP)
    pp.save_graph(GP, output_path + "prep_graph.edgelist", delimiter=",")

    # Show a slice of the old->new node id mapping
    print("Sample of 10 (oldNodeID, newNodeID):")
    print("-----------------------------------------")
    print(ids[0:10])

    # Dump non-edges whose reversed edge exists in the graph
    pp.get_redges_false(GP, output_path + "redges_false.csv")
def prep_fb(inpath):
    """ Preprocess facebook wall post graph. """
    # Load the raw network (tab separated, '#' comments, directed)
    graph = pp.load_graph(inpath, delimiter='\t', comments='#', directed=True)

    # The FB data stores edges as (destination, origin), so flip all edges
    graph = graph.reverse()

    # Relabel nodes to consecutive integers; self-loops are kept
    graph, _ = pp.prep_graph(graph, relabel=True, del_self_loops=False)

    # Return the preprocessed graph (the id mapping is discarded)
    return graph
def test_split():
    """ Compare the Wilson (LERW) and Broder spanning-tree algorithms.

    Loads a test graph, computes a spanning tree with each algorithm, and
    prints timings plus sanity checks (tree edges are graph edges and the
    resulting train graph forms a single connected component).
    """
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"

    # Load a graph
    SG = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#',
                       directed=False)

    # Preprocess the graph
    SG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True)
    print("Number of CCs input: {}".format(nx.number_connected_components(SG)))

    # Store the edges in the graphs as a set E
    E = set(SG.edges())

    # Use LERW approach to get the ST
    start = time.time()
    train_lerw = stt.wilson_alg(SG, E)
    end1 = time.time() - start

    # Use BRO approach to get the ST
    start = time.time()
    train_bro = stt.broder_alg(SG, E)
    end2 = time.time() - start

    print("LERW time: {}".format(end1))
    print("Bro time: {}".format(end2))
    print("Num tr_e lerw: {}".format(len(train_lerw)))
    print("Num tr_e bro: {}".format(len(train_bro)))
    # An empty set printed below means every tree edge is a real graph edge
    print("All tr_e in E for lerw?: {}".format(train_lerw - E))
    print("All tr_e in E for bro?: {}".format(train_bro - E))

    # Check that the graph generated with lerw has indeed one single cc
    TG_lerw = nx.Graph()
    TG_lerw.add_edges_from(train_lerw)
    print("Number of CCs with lerw: {}".format(
        nx.number_connected_components(TG_lerw)))

    # Check that the graph generated with broder algorithm has indeed one single cc
    # BUG FIX: this message previously said "lerw" while reporting the
    # Broder result, making the two outputs indistinguishable.
    TG_bro = nx.Graph()
    TG_bro.add_edges_from(train_bro)
    print("Number of CCs with bro: {}".format(
        nx.number_connected_components(TG_bro)))
def preprocess(inpath, outpath, delimiter, directed, relabel, del_self_loops):
    """ Graph preprocessing routine. """
    print('Preprocessing graph...')

    # Read the network from the given edgelist
    graph = pp.load_graph(inpath, delimiter=delimiter, comments='#',
                          directed=directed)

    # Relabel nodes / drop self-loops as requested by the caller
    graph, ids = pp.prep_graph(graph, relabel=relabel,
                               del_self_loops=del_self_loops)

    # Persist the preprocessed network (space separated, no stats header)
    pp.save_graph(graph, output_path=outpath + "prep_graph.edgelist",
                  delimiter=' ', write_stats=False)

    # Hand the cleaned graph back to the caller
    return graph
def run_test():
    """ Evaluate LP baselines and Katz on a small test network. """
    # Fix RNG seeds for reproducible splits and methods
    random.seed(42)
    np.random.seed(42)

    # Configuration
    filename = "./data/network.edgelist"
    directed = False

    # Load and preprocess the test graph, then print some stats
    G = pp.load_graph(filename, delimiter=",", comments='#', directed=directed)
    G, ids = pp.prep_graph(G)
    pp.get_stats(G)

    # Generate one train/test split with all edges in train set
    start = time()
    traintest_split = split.EvalSplit()
    traintest_split.compute_splits(G, train_frac=0.9)
    end = time() - start
    print("\nSplits computed in {} sec".format(end))

    # Create an evaluator
    nee = evaluator.LPEvaluator(traintest_split)

    # Time the baseline heuristics
    start = time()
    test_baselines(nee, directed)
    end = time() - start
    print("\nBaselines computed in {} sec".format(end))

    # Time the Katz method
    start = time()
    test_katz(nee)
    end = time() - start
    print("\nKatz computed in {} sec".format(end))
# -*- coding: utf-8 -*- # Author: Mara Alexandru Cristian # Contact: [email protected] # Date: 18/12/2018 # This simple example is the one presented in the README.md file. # Network reconstruction and sign prediction can be computed in the same manner by simply substituting LPEvaluator and # LPEvalSplit by NREvaluator and NREvalSplit or SPEvaluator and SPEvalSplit. from evalne.evaluation.evaluator import LPEvaluator from evalne.evaluation.score import Scoresheet from evalne.evaluation.split import LPEvalSplit from evalne.utils import preprocess as pp # Load and preprocess the network G = pp.load_graph('../../evalne/tests/data/network.edgelist') G, _ = pp.prep_graph(G) # Create an evaluator and generate train/test edge split traintest_split = LPEvalSplit() traintest_split.compute_splits(G) nee = LPEvaluator(traintest_split) # Create a Scoresheet to store the results scoresheet = Scoresheet() # Set the baselines methods = ['random_prediction', 'common_neighbours', 'jaccard_coefficient'] # Evaluate baselines for method in methods:
def test_stt():
    """ Exercise and compare the stt and rstt train/test split strategies. """
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"
    frac = 0.5  # fraction of edges that goes into the train set

    # Load a graph
    G = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#',
                      directed=False)

    # Preprocess the graph for stt alg.
    SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True, maincc=True)

    # Split train/test using stt
    start = time.time()
    train_E, test_E = stt.split_train_test(SG, train_frac=frac)
    end1 = time.time() - start

    # Compute the false edges (open-world assumption)
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)

    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "stt_frac_" + str(frac),
                                    train_E=train_E, train_E_false=train_E_false,
                                    test_E=test_E, test_E_false=test_E_false,
                                    split_id=0)

    # Split train/test using rstt (works on the raw graph G, not SG)
    start = time.time()
    tr_E, te_E = stt.rand_split_train_test(G, train_frac=frac)
    end2 = time.time() - start
    # Relabel the split edges; J is the relabeled graph, mp the old->new map
    train_E, test_E, J, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())

    print("Number of nodes in G: {}".format(len(G.nodes())))
    print("Number of nodes in J: {}".format(len(J.nodes())))
    # Empty set difference means node ids are exactly 0..n-1
    print("Are nodes in J sequential integers? {}".format(
        not len(set(J.nodes()) - set(range(len(J.nodes()))))))

    checks = list()
    queries = 200
    # Check if the mapping is correct
    # NOTE(review): pop() removes the sampled edges from tr_E — this test
    # consumes its own data; assumes tr_E has at least `queries` elements.
    for i in range(queries):
        ag = tr_E.pop()  # a random element from train
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in train_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))
    print(
        "For train edges out of {} samples, {} were in the relabeled train_E".
        format(queries, sum(checks)))

    checks = list()
    # Check if the mapping is correct (same sampling, now on the test set)
    for i in range(queries):
        ag = te_E.pop()  # a random element from test
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in test_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))
    print("For test edges out of {} samples, {} were in the relabeled test_E".
          format(queries, sum(checks)))

    # Compute the false edges on the relabeled graph J
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        J, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)

    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "rstt_frac_" + str(frac),
                                    train_E=train_E, train_E_false=train_E_false,
                                    test_E=test_E, test_E_false=test_E_false,
                                    split_id=0)
def test_split():
    """ Compare stt and rstt splits on a random subgraph of the test network. """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400  # number of nodes sampled for the subgraph
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#',
                      directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes
    SG = G.subgraph(random.sample(G.nodes, subgraph_size)).copy()

    # Preprocess the graph (restricted to the main connected component)
    PSG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True, maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits with stt and time it
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    end = time.time() - start
    print("Exec time stt: {}".format(end))

    # Check that the train graph generated with stt has one single cc
    if directed:
        TG_stt = nx.DiGraph()
        TG_stt.add_edges_from(train_stt)
        print("Number of weakly CCs with stt: {}".format(
            nx.number_weakly_connected_components(TG_stt)))
    else:
        TG_stt = nx.Graph()
        TG_stt.add_edges_from(train_stt)
        print("Number of CCs with stt: {}".format(
            nx.number_connected_components(TG_stt)))
    print("Number train edges stt: {}".format(len(train_stt)))
    print("Number test edges stt: {}".format(len(test_stt)))
    print("Number of nodes in train graph: {}".format(len(TG_stt.nodes)))

    # Preprocess the graph again, this time keeping all components
    PSG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True, maincc=False)

    # Compute train/test splits with rstt and time it
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG,
                                                      train_frac=train_frac)
    end = time.time() - start
    print("\nExec time rand_stt: {}".format(end))

    # Check that the train graph generated with rstt has one single cc
    if directed:
        TG_rstt = nx.DiGraph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of weakly CCs with rstt: {}".format(
            nx.number_weakly_connected_components(TG_rstt)))
    else:
        TG_rstt = nx.Graph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of CCs with rstt: {}".format(
            nx.number_connected_components(TG_rstt)))
    print("Number train edges rstt: {}".format(len(train_rstt)))
    print("Number test edges rstt: {}".format(len(test_rstt)))
    print("Number of nodes in train graph: {}".format(len(TG_rstt.nodes)))
from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.split import EvalSplit
from evalne.evaluation.score import Scoresheet
from evalne.utils import preprocess as pp

# Load and preprocess the network
#G = pp.load_graph('evalne/tests/data/network.edgelist')
G = pp.load_graph(
    '../Graph_Conv_Neural_Nets/generic_datasets/Zachary-Karate/Zachary-Karate.edgelist'
)
G, _ = pp.prep_graph(G)

# Create an evaluator and generate train/test edge split
traintest_split = EvalSplit(
)  # Bhevencious: EvalSplit() contains methods used to READ/SET a variety of properties/variables. Use the DOT & PARANTHESIS helpers to access parameters.
traintest_split.compute_splits(G,
                               nw_name='Zachary-Karate.edgelist',
                               train_frac=0.8)
nee = LPEvaluator(traintest_split)

# Create a Scoresheet to store the results
scoresheet = Scoresheet()

# Set the baselines
methods = [
    'adamic_adar_index', 'common_neighbours', 'jaccard_coefficient', 'katz',
    'preferential_attachment', 'resource_allocation_index', 'random_prediction'
]

# Evaluate baselines
# NOTE(review): the body of this loop lies outside this chunk of the file.
for method in methods:
start = time() # Create folders for the results if these do not exist if not os.path.exists(output_path): os.makedirs(output_path) traintest_path = os.path.join(output_path, "lp_train_test_splits") if not os.path.exists(traintest_path): os.makedirs(traintest_path) # --------------- # Preprocess data # --------------- # Load the data as a directed graph G = pp.load_graph(dataset_path, delimiter=",", comments='#', directed=directed) # Get some graph statistics pp.get_stats(G) # Or store them to a file pp.get_stats(G, os.path.join(output_path, "stats.txt")) # Preprocess the graph SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True) # Get non-edges so that the reversed edge exists in the graph if directed: redges = pp.get_redges_false(SG, output_path=os.path.join(output_path, "redges.csv")) # Store the graph to a file