def preprocess(setup, nw_outpath, i):
    """
    Load, preprocess and optionally store the i-th network of an evaluation setup.

    Parameters
    ----------
    setup : object
        Evaluation setup. The attributes read here are: task, inpaths, separators, comments, directed,
        relabel, del_selfloops, split_alg, save_prep_nw, delimiter and write_stats.
    nw_outpath : string
        Directory where 'prep_nw.edgelist' is written when setup.save_prep_nw is set.
    i : int
        Index of the network inside the per-network lists of the setup.

    Returns
    -------
    G : graph
        The preprocessed graph.
    ids : object
        Second value returned by pp.prep_graph.
    """
    print('Preprocessing graph...')

    # Sign prediction ('sp') reads integer edge weights, every other task reads floats
    weight_type = int if setup.task == 'sp' else float
    G = pp.load_graph(setup.inpaths[i], delimiter=setup.separators[i], comments=setup.comments[i],
                      directed=setup.directed, datatype=weight_type)

    # Link prediction with random splits explicitly passes maincc=False; all other
    # configurations rely on the default of pp.prep_graph
    prep_args = {'relabel': setup.relabel, 'del_self_loops': setup.del_selfloops}
    if setup.task == 'lp' and setup.split_alg == 'random':
        prep_args['maincc'] = False
    G, ids = pp.prep_graph(G, **prep_args)

    # Persist the preprocessed network if requested
    if setup.save_prep_nw:
        prep_file = os.path.join(nw_outpath, 'prep_nw.edgelist')
        pp.save_graph(G, output_path=prep_file, delimiter=setup.delimiter,
                      write_stats=setup.write_stats, write_weights=False, write_dir=True)

    return G, ids
def prep_ws(inpath):
    """
    Preprocess web spam graph.

    The first line of the file is skipped. Every following line describes the out-edges of one source
    node, whose ID is the zero-based index of the line in the file. Each whitespace separated entry
    of a line is parsed as 'dst:...' and only the part before the colon (the destination ID) is used.

    Parameters
    ----------
    inpath : string
        Path to the web spam graph file.

    Returns
    -------
    G : graph
        The preprocessed directed graph (nodes relabeled, self loops kept).
    """
    # Create an empty digraph
    G = nx.DiGraph()

    # Read the file and create the graph. The context manager guarantees the file handle is
    # released even if a malformed line raises while parsing.
    with open(inpath, 'r') as f:
        for src, line in enumerate(f):
            if src == 0:
                # The first line does not describe a node
                continue
            for dst in line.split():
                # We consider the graph unweighted, so anything after ':' is ignored
                G.add_edge(src, int(dst.split(':')[0]))

    # Preprocess the graph
    G, _ = pp.prep_graph(G, relabel=True, del_self_loops=False)

    # Return the preprocessed graph
    return G
def prep_fb(inpath):
    """
    Preprocess facebook wall post graph.

    Parameters
    ----------
    inpath : string
        Path to the tab separated edgelist ('#' marks comment lines).

    Returns
    -------
    G : graph
        The preprocessed directed graph (nodes relabeled, self loops kept).
    """
    # The file stores every edge as (destination, origin), so the loaded digraph is flipped
    wall_graph = pp.load_graph(inpath, delimiter='\t', comments='#', directed=True).reverse()

    # Only the graph is of interest; the second value returned by prep_graph is discarded
    prepped, _ = pp.prep_graph(wall_graph, relabel=True, del_self_loops=False)
    return prepped
def test_split():
    """
    Compare the spanning trees computed by the Wilson (LERW) and Broder algorithms.

    Loads the test network, preprocesses it, runs both algorithms on it and prints the execution
    times, the number of train edges, the train edges not contained in the original edge set and the
    number of connected components spanned by each set of train edges.
    """
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"

    # Load a graph
    SG = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#', directed=False)

    # Preprocess the graph
    SG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True)
    print("Number of CCs input: {}".format(nx.number_connected_components(SG)))

    # Store the edges in the graphs as a set E
    E = set(SG.edges())

    # Use LERW approach to get the ST
    start = time.time()
    train_lerw = stt.wilson_alg(SG, E)
    end1 = time.time() - start

    # Use BRO approach to get the ST
    start = time.time()
    train_bro = stt.broder_alg(SG, E)
    end2 = time.time() - start

    print("LERW time: {}".format(end1))
    print("Bro time: {}".format(end2))

    print("Num tr_e lerw: {}".format(len(train_lerw)))
    print("Num tr_e bro: {}".format(len(train_bro)))

    # An empty set printed here means every train edge is also an edge of the input graph
    print("All tr_e in E for lerw?: {}".format(train_lerw - E))
    print("All tr_e in E for bro?: {}".format(train_bro - E))

    # Check that the graph generated with lerw has indeed one single cc
    TG_lerw = nx.Graph()
    TG_lerw.add_edges_from(train_lerw)
    print("Number of CCs with lerw: {}".format(nx.number_connected_components(TG_lerw)))

    # Check that the graph generated with broder algorithm has indeed one single cc
    TG_bro = nx.Graph()
    TG_bro.add_edges_from(train_bro)
    # The label previously read "lerw", which made both reports indistinguishable
    print("Number of CCs with bro: {}".format(nx.number_connected_components(TG_bro)))
def preprocess(inpath, outpath, delimiter, directed, relabel, del_self_loops):
    """
    Load a graph from an edgelist, preprocess it and store the result.

    Parameters
    ----------
    inpath : string
        Path to the input edgelist ('#' marks comment lines).
    outpath : string
        Prefix to which "prep_graph.edgelist" is appended to obtain the output file name.
    delimiter : string
        Column separator of the input edgelist.
    directed : bool
        Forwarded to pp.load_graph.
    relabel : bool
        Forwarded to pp.prep_graph.
    del_self_loops : bool
        Forwarded to pp.prep_graph.

    Returns
    -------
    G : graph
        The preprocessed graph.
    """
    print('Preprocessing graph...')

    raw_graph = pp.load_graph(inpath, delimiter=delimiter, comments='#', directed=directed)
    prepped, _ = pp.prep_graph(raw_graph, relabel=relabel, del_self_loops=del_self_loops)

    # The output is always space separated and written without statistics
    target = outpath + "prep_graph.edgelist"
    pp.save_graph(prepped, output_path=target, delimiter=' ', write_stats=False)

    return prepped
def test():
    """
    Exercise the load / save / preprocess round trip on the test network.

    Prints the statistics of the graph as loaded, after saving and re-loading it, and after
    preprocessing it. Writes 'orig_graph.edgelist', 'prep_graph.edgelist' and 'redges_false.csv'
    to the output folder.
    """
    # Input and output locations
    data_dir = "./data/"
    out_dir = "./data/"
    network_file = "network.edgelist"
    saved_file = out_dir + "orig_graph.edgelist"
    rule = "-----------------------------------------"

    # Original graph
    G = pp.load_graph(data_dir + network_file, delimiter=',', comments='#', directed=True)
    print("")
    print("Original graph stats:")
    print(rule)
    pp.get_stats(G)

    # Round trip: write the graph and read it back
    pp.save_graph(G, saved_file, delimiter=",")
    reloaded = pp.load_graph(saved_file, delimiter=",", comments='#', directed=True)
    print("Has the same stats after being loaded?:")
    print(rule)
    pp.get_stats(reloaded)

    # Preprocessing of the re-loaded graph
    prepped, ids = pp.prep_graph(reloaded, del_self_loops=False, relabel=True)
    print("Preprocessed graph stats (restricted to main cc):")
    print(rule)
    pp.get_stats(prepped)
    pp.save_graph(prepped, out_dir + "prep_graph.edgelist", delimiter=",")

    # First entries of the node ID mapping
    print("Sample of 10 (oldNodeID, newNodeID):")
    print(rule)
    print(ids[0:10])

    pp.get_redges_false(prepped, out_dir + "redges_false.csv")
def run_test():
    """
    Time the split computation, the baselines and Katz on the test network.

    Both random generators are seeded with 42 before anything else is done.
    """
    random.seed(42)
    np.random.seed(42)

    # Configuration
    network_file = "./data/network.edgelist"
    directed = False

    # Load and preprocess the test graph, then show its statistics
    G = pp.load_graph(network_file, delimiter=",", comments='#', directed=directed)
    G, ids = pp.prep_graph(G)
    pp.get_stats(G)

    # One train/test split holding 90% of the edges in the train set
    t0 = time()
    traintest_split = split.EvalSplit()
    traintest_split.compute_splits(G, train_frac=0.9)
    elapsed = time() - t0
    print("\nSplits computed in {} sec".format(elapsed))

    # The evaluator wraps the computed split
    nee = evaluator.LPEvaluator(traintest_split)

    # Baselines
    t0 = time()
    test_baselines(nee, directed)
    elapsed = time() - t0
    print("\nBaselines computed in {} sec".format(elapsed))

    # Katz
    t0 = time()
    test_katz(nee)
    elapsed = time() - t0
    print("\nKatz computed in {} sec".format(elapsed))
# Author: Mara Alexandru Cristian
# Contact: [email protected]
# Date: 18/12/2018

# This simple example is the one presented in the README.md file.
# Network reconstruction and sign prediction can be computed in the same manner by simply substituting LPEvaluator and
# LPEvalSplit by NREvaluator and NREvalSplit or SPEvaluator and SPEvalSplit.

from evalne.evaluation.evaluator import LPEvaluator
from evalne.evaluation.score import Scoresheet
from evalne.evaluation.split import LPEvalSplit
from evalne.utils import preprocess as pp

# Load and preprocess the network
G = pp.load_graph('../../evalne/tests/data/network.edgelist')
G, _ = pp.prep_graph(G)

# Create an evaluator and generate train/test edge split
traintest_split = LPEvalSplit()
traintest_split.compute_splits(G)
nee = LPEvaluator(traintest_split)

# Create a Scoresheet to store the results
scoresheet = Scoresheet()

# Set the baselines
methods = ['random_prediction', 'common_neighbours', 'jaccard_coefficient']

# Evaluate baselines
for method in methods:
    result = nee.evaluate_baseline(method=method)
    # Keep every result; without this each iteration overwrites the previous one and the
    # scoresheet created above stays empty
    scoresheet.log_results(result)
def test_stt():
    """
    Exercise the spanning tree (stt) and random (rstt) train/test splits on the test network.

    For each of the two approaches the train/test edges are computed, false edges are generated and
    everything is stored under the data folder. For the random split the node relabeling returned by
    pp.relabel_nodes is additionally checked on 200 train and 200 test edges.
    """
    # Configuration
    dataset_path = "./data/"
    test_name = "network.edgelist"
    frac = 0.5
    queries = 200

    # Load the graph and preprocess a copy of it for the stt algorithm
    G = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#', directed=False)
    SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True, maincc=True)

    # --- stt split ---
    start = time.time()
    train_E, test_E = stt.split_train_test(SG, train_frac=frac)
    end1 = time.time() - start

    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)

    stt_prefix = dataset_path + "stt_frac_" + str(frac)
    _ = stt.store_train_test_splits(stt_prefix, train_E=train_E, train_E_false=train_E_false,
                                    test_E=test_E, test_E_false=test_E_false, split_id=0)

    # --- rstt split (computed on the graph as loaded) ---
    start = time.time()
    tr_E, te_E = stt.rand_split_train_test(G, train_frac=frac)
    end2 = time.time() - start

    train_E, test_E, J, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())

    print("Number of nodes in G: {}".format(len(G.nodes())))
    print("Number of nodes in J: {}".format(len(J.nodes())))
    sequential = set(J.nodes()).issubset(set(range(len(J.nodes()))))
    print("Are nodes in J sequential integers? {}".format(sequential))

    # Take edges out of the original train set, translate them with the mapping and count
    # how many of the translated pairs are part of the relabeled train set
    train_hits = 0
    for _unused in range(queries):
        edge = tr_E.pop()
        train_hits += (mp[edge[0]], mp[edge[1]]) in train_E
    print("For train edges out of {} samples, {} were in the relabeled train_E".format(queries, train_hits))

    # Same verification for the test set
    test_hits = 0
    for _unused in range(queries):
        edge = te_E.pop()
        test_hits += (mp[edge[0]], mp[edge[1]]) in test_E
    print("For test edges out of {} samples, {} were in the relabeled test_E".format(queries, test_hits))

    # False edges and storage for the relabeled random split
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        J, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)

    rstt_prefix = dataset_path + "rstt_frac_" + str(frac)
    _ = stt.store_train_test_splits(rstt_prefix, train_E=train_E, train_E_false=train_E_false,
                                    test_E=test_E, test_E_false=test_E_false, split_id=0)
def test_split():
    """
    Compare the stt and rstt train/test splits on a random sub-graph of the test network.

    A sub-graph of 'subgraph_size' randomly selected nodes is preprocessed and split with both
    approaches. For each of them the execution time, the number of (weakly) connected components
    spanned by the train edges and the sizes of the resulting sets are printed.
    """
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name, delimiter=",", comments='#', directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes.
    # random.sample needs a sequence: a node view is rejected with a TypeError from Python 3.11 on.
    SG = G.subgraph(random.sample(list(G.nodes), subgraph_size)).copy()

    # Preprocess the graph
    PSG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True, maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    end = time.time() - start
    print("Exec time stt: {}".format(end))

    # Check that the train graph generated with stt has one single cc
    if directed:
        TG_stt = nx.DiGraph()
        TG_stt.add_edges_from(train_stt)
        print("Number of weakly CCs with stt: {}".format(nx.number_weakly_connected_components(TG_stt)))
    else:
        TG_stt = nx.Graph()
        TG_stt.add_edges_from(train_stt)
        print("Number of CCs with stt: {}".format(nx.number_connected_components(TG_stt)))
    print("Number train edges stt: {}".format(len(train_stt)))
    print("Number test edges stt: {}".format(len(test_stt)))
    print("Number of nodes in train graph: {}".format(len(TG_stt.nodes)))

    # Preprocess the graph again, this time without restricting it to the main cc
    PSG, ids = pp.prep_graph(SG, relabel=True, del_self_loops=True, maincc=False)

    # Compute train/test splits
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG, train_frac=train_frac)
    end = time.time() - start
    print("\nExec time rand_stt: {}".format(end))

    # Report the number of ccs of the train graph generated with rstt
    if directed:
        TG_rstt = nx.DiGraph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of weakly CCs with rstt: {}".format(nx.number_weakly_connected_components(TG_rstt)))
    else:
        TG_rstt = nx.Graph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of CCs with rstt: {}".format(nx.number_connected_components(TG_rstt)))
    print("Number train edges rstt: {}".format(len(train_rstt)))
    print("Number test edges rstt: {}".format(len(test_rstt)))
    print("Number of nodes in train graph: {}".format(len(TG_rstt.nodes)))
# --------------- # Preprocess data # --------------- # Load the data as a directed graph G = pp.load_graph(dataset_path, delimiter=",", comments='#', directed=directed) # Get some graph statistics pp.get_stats(G) # Or store them to a file pp.get_stats(G, os.path.join(output_path, "stats.txt")) # Preprocess the graph SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True) # Get non-edges so that the reversed edge exists in the graph if directed: redges = pp.get_redges_false(SG, output_path=os.path.join(output_path, "redges.csv")) # Store the graph to a file pp.save_graph(SG, output_path=os.path.join(output_path, "network_prep.edgelist"), delimiter=',', write_stats=True) # ---------------- # Split train test # ---------------- # Compute train/test splits and false edges in parallel stt.compute_splits_parallel(SG, os.path.join(traintest_path, "network_prep_51"), owa=True, train_frac=0.51, num_fe_train=None, num_fe_test=None, num_splits=5)
def timestamp_split(G, train_frac=0.51):
    """
    Splits the edges of the input graph in sets of train and test and returns the results.
    Split is performed using edge timestamps (see Notes). The resulting train edge set has the
    following properties: spans a graph (digraph) with a single connected (weakly connected) component.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph where edge weights are timestamps.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.

    Returns
    -------
    train_E : ndarray
        Column vector of train edges as pairs src, dst.
    test_E : ndarray
        Column vector of test edges as pairs src, dst.
    tg : graph
        A NetworkX graph containing only the edges in the train edge set.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    Notes
    -----
    The method proceeds as follows: (1) sort all edges by timestamp. (2) take the 1-train_frac fraction of
    edges with the largest timestamps out of the input graph. (3) from the remaining edges compute the main
    connected component and these will be the train edges. (4) from the set of removed edges, those such that
    both end nodes exist in the train edge set computed in the previous step, are added to the final test set.
    For train_frac == 1.0 the function returns early with only two values: the set of all edges of G and
    an empty set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        # NOTE(review): this returns two sets while every other path returns (ndarray, ndarray, graph).
        # Kept unchanged so existing callers of this special case keep working.
        return set(G.edges()), set()

    # Get Adj matrix. For undirected graphs only the upper triangle is used so each edge appears once.
    # adjacency_matrix is used because the adj_matrix alias no longer exists in NetworkX 3.
    if nx.is_directed(G):
        a = nx.adjacency_matrix(G)
    else:
        a = triu(nx.adjacency_matrix(G), k=1)

    # Positions of the stored timestamps sorted from oldest to most recent
    ordered = np.argsort(a.data)
    num_train = int(len(ordered) * train_frac)

    # End nodes of every stored edge, in the same order as a.data
    # (assumes no edge has a timestamp equal to zero, since nonzero() drops such entries)
    rows, cols = a.nonzero()

    # The argsort output holds positions sorted by timestamp, so it has to be sliced. Comparing its
    # values with the split index does not select the oldest edges.
    # There will be no overlap between tr and te because nz contains only unique pairs
    tr_idx = ordered[:num_train]
    te_idx = ordered[num_train:]
    tr_e = np.array((rows[tr_idx], cols[tr_idx])).T
    te_e = np.array((rows[te_idx], cols[te_idx])).T

    # Taking the most recent edges for testing can cause train to be disconnected so make sure it isn't
    # NOTE(review): the train graph is undirected even if G is a digraph - confirm this is intended.
    tg = nx.Graph()
    tg.add_edges_from(tr_e)
    tg, ids = pp.prep_graph(tg, relabel=True, del_self_loops=True, maincc=True)
    tr_e = np.array(tg.edges)

    # Translate the test edges to the new node IDs; end nodes that are not part of tg are mapped to -1.
    # A list is built explicitly because np.array on a map object does not yield a 2D array in Python 3.
    d = dict(ids)
    te_e = np.array([(d.get(u, -1), d.get(v, -1)) for u, v in te_e]).reshape(-1, 2)

    # We now only keep the test edges between nodes in tg
    # Remove nodes that are in test but not train (this includes the -1 placeholder)
    newn = np.setdiff1d(np.unique(te_e), np.unique(tr_e))
    mask = np.isin(te_e, newn).any(axis=1)
    te_e = te_e[~mask, :]

    # Return the sets of edges
    return tr_e, te_e, tg