def compute_splits(self, G, nw_name='test', train_frac=0.51, split_alg='spanning_tree', owa=True, fe_ratio=1,
                   split_id=0, verbose=False):
    """
    Computes true and false train and test edge splits according to the given parameters.
    The sets of edges computed are both stored as properties of this object and returned from the method.

    Parameters
    ----------
    G : graph
        A NetworkX graph
    nw_name : basestring, optional
        A string indicating the name of the dataset from which this split was generated.
        This is required in order to keep track of the evaluation results. Default is 'test'.
    train_frac : float, optional
        The relative size (in (0.0, 1.0]) of the train set with respect to the total number of edges in the graph.
        Default is 0.51.
    split_alg : basestring, optional
        Indicates the algorithm used to generate the train/test splits. Options are method based on spanning tree,
        random edge split and naive removal and connectedness evaluation. Default is 'spanning_tree'.
    owa : bool, optional
        Encodes the belief that the network respects or not the open world assumption. Default is True.
        If OWA=True, false train edges can be true test edges. False edges sampled from train graph.
        If OWA=False, closed world is assumed so false train edges are known to be false (not in G)
    fe_ratio : float, optional
        The ratio of false to true edge to generate. Default is 1, same number as true edges.
    split_id : int, optional
        The id to be assigned to the train/test splits generated. Default is 0.
    verbose : bool, optional
        If True print progress info. Default is False.

    Returns
    -------
    train_E : set
        The set of train edges
    train_false_E : set
        The set of false train edges
    test_E : set
        The set of test edges
    test_false_E : set
        The set of false test edges

    Raises
    ------
    ValueError
        If the edge split algorithm is unknown.
    """
    # Compute train/test split
    if split_alg == 'random':
        tr_E, te_E = stt.rand_split_train_test(G, train_frac)
        # Random split may drop nodes, so relabel to keep node ids sequential
        train_E, test_E, G, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())
    elif split_alg == 'naive':
        train_E, test_E = stt.naive_split_train_test(G, train_frac)
    elif split_alg == 'spanning_tree':
        train_E, test_E = stt.split_train_test(G, train_frac)
    elif split_alg == 'fast':
        # The fast algorithm computes the non-edges itself
        train_E, test_E = stt.quick_split(G, train_frac)
        train_E_false, test_E_false = stt.quick_nonedges(G, train_frac, fe_ratio)
    else:
        raise ValueError('Split alg. {} unknown!'.format(split_alg))

    # Compute false edges (only the 'fast' branch produced them already)
    if split_alg != 'fast':
        # Cast to int: fe_ratio may be a non-integer float, and the edge
        # sampling helpers require integer edge counts.
        num_fe_train = int(len(train_E) * fe_ratio)
        num_fe_test = int(len(test_E) * fe_ratio)
        if owa:
            train_E_false, test_E_false = stt.generate_false_edges_owa(G, train_E, test_E,
                                                                       num_fe_train, num_fe_test)
        else:
            train_E_false, test_E_false = stt.generate_false_edges_cwa(G, train_E, test_E,
                                                                       num_fe_train, num_fe_test)

    # Set edge sets to new values
    self.set_splits(train_E, train_E_false, test_E, test_E_false, directed=G.is_directed(), nw_name=nw_name,
                    split_id=split_id, split_alg=split_alg, owa=owa, verbose=verbose)

    return train_E, train_E_false, test_E, test_E_false
def compute_splits(self, G, nw_name='test', train_frac=0.51, split_alg='spanning_tree', owa=True, fe_ratio=1,
                   split_id=0, verbose=False):
    """
    Computes sets of train and test edges and non-edges according to the given input parameters and initializes
    the class attributes.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph to compute the train test split from.
    nw_name : string, optional
        A string indicating the name of the dataset from which this split was generated.
        This is required in order to keep track of the evaluation results. Default is 'test'.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.
    split_alg : string, optional
        A string indicating the algorithm to use for generating the train/test splits. Options are
        `spanning_tree`, `random`, `naive`, `fast` and `timestamp`. Default is `spanning_tree`.
    owa : bool, optional
        Encodes the belief that the network should respect or not the open world assumption. Default is True.
        If owa=True, train non-edges are sampled from the train graph only and can overlap with test edges.
        If owa=False, train non-edges are sampled from the full graph and cannot overlap with test edges.
    fe_ratio : float, optional
        The ratio of non-edges to edges to sample. For fr_ratio > 0 and < 1 less non-edges than edges will be
        generated. For fe_edges > 1 more non-edges than edges will be generated. Default 1, same amounts.
    split_id : int, optional
        The id to be assigned to the train/test splits generated. Default is 0.
    verbose : bool, optional
        If True print progress info. Default is False.

    Returns
    -------
    train_E : set
        The set of train edges
    train_false_E : set
        The set of train non-edges
    test_E : set
        The set of test edges
    test_false_E : set
        The set of test non-edges

    Raises
    ------
    ValueError
        If the edge split algorithm is unknown.
    """
    # Compute train/test split
    if split_alg == 'random':
        tr_E, te_E = stt.rand_split_train_test(G, train_frac)
        # Random split may drop nodes, so relabel to keep node ids sequential
        train_E, test_E, G, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())
    elif split_alg == 'naive':
        train_E, test_E = stt.naive_split_train_test(G, train_frac)
    elif split_alg == 'spanning_tree':
        train_E, test_E = stt.split_train_test(G, train_frac)
    elif split_alg == 'fast':
        # The fast algorithm computes the non-edges itself
        train_E, test_E = stt.quick_split(G, train_frac)
        train_E_false, test_E_false = stt.quick_nonedges(G, train_frac, fe_ratio)
    elif split_alg == 'timestamp':
        train_E, test_E, G = stt.timestamp_split(G, train_frac)
        # timestamp_split returns edge arrays; convert the (src, dst) columns to sets of tuples
        train_E = set(zip(train_E[:, 0], train_E[:, 1]))
        test_E = set(zip(test_E[:, 0], test_E[:, 1]))
    else:
        raise ValueError('Split alg. {} unknown!'.format(split_alg))

    # Compute non-edges (only the 'fast' branch produced them already)
    if split_alg != 'fast':
        # Cast to int: fe_ratio may be a non-integer float, and the edge
        # sampling helpers require integer edge counts.
        num_fe_train = int(len(train_E) * fe_ratio)
        num_fe_test = int(len(test_E) * fe_ratio)
        if owa:
            train_E_false, test_E_false = stt.generate_false_edges_owa(G, train_E, test_E,
                                                                       num_fe_train, num_fe_test)
        else:
            train_E_false, test_E_false = stt.generate_false_edges_cwa(G, train_E, test_E,
                                                                       num_fe_train, num_fe_test)

    # Set class attributes to new values
    self.set_splits(train_E, train_E_false, test_E, test_E_false, directed=G.is_directed(), nw_name=nw_name,
                    split_id=split_id, split_alg=split_alg, owa=owa, verbose=verbose)

    return train_E, train_E_false, test_E, test_E_false
def test_stt():
    """Exercise the spanning-tree and random edge-split pipelines on a sample network."""
    # Configuration
    data_dir = "./data/"
    fname = "network.edgelist"
    frac = 0.5

    # Load a graph
    G = pp.load_graph(data_dir + fname, delimiter=",", comments='#', directed=False)

    # Preprocess the graph for the spanning-tree split algorithm
    SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True, maincc=True)

    # Train/test split using the spanning-tree method (timed)
    t0 = time.time()
    train_E, test_E = stt.split_train_test(SG, train_frac=frac)
    end1 = time.time() - t0

    # Sample the false edges under the open world assumption
    train_E_false, test_E_false = stt.generate_false_edges_owa(SG, train_E=train_E, test_E=test_E,
                                                               num_fe_train=None, num_fe_test=None)

    # Persist the splits
    _ = stt.store_train_test_splits(data_dir + "stt_frac_" + str(frac), train_E=train_E,
                                    train_E_false=train_E_false, test_E=test_E,
                                    test_E_false=test_E_false, split_id=0)

    # Train/test split using the random edge-removal method (timed)
    t0 = time.time()
    tr_E, te_E = stt.rand_split_train_test(G, train_frac=frac)
    end2 = time.time() - t0
    train_E, test_E, J, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())

    print("Number of nodes in G: {}".format(len(G.nodes())))
    print("Number of nodes in J: {}".format(len(J.nodes())))
    print("Are nodes in J sequential integers? {}".format(
        not len(set(J.nodes()) - set(range(len(J.nodes()))))))

    queries = 200

    # Check the node mapping on a sample of train edges: each popped edge
    # from the original set must map into the relabeled train set.
    hits = []
    for _ in range(queries):
        u, v = tr_E.pop()
        hits.append((mp[u], mp[v]) in train_E)
    print(
        "For train edges out of {} samples, {} were in the relabeled train_E".format(queries, sum(hits)))

    # Same mapping check for a sample of test edges
    hits = []
    for _ in range(queries):
        u, v = te_E.pop()
        hits.append((mp[u], mp[v]) in test_E)
    print("For test edges out of {} samples, {} were in the relabeled test_E".format(queries, sum(hits)))

    # Sample the false edges on the relabeled graph
    train_E_false, test_E_false = stt.generate_false_edges_owa(J, train_E=train_E, test_E=test_E,
                                                               num_fe_train=None, num_fe_test=None)

    # Persist the splits
    _ = stt.store_train_test_splits(data_dir + "rstt_frac_" + str(frac), train_E=train_E,
                                    train_E_false=train_E_false, test_E=test_E,
                                    test_E_false=test_E_false, split_id=0)
def compute_splits(self, G, nw_name='test', train_frac=0.51, split_alg='spanning_tree', split_id=0, verbose=False):
    """
    Computes sets of train and test positive and negative edges according to the given input parameters and
    initializes the class attributes.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph to compute the train test split from.
    nw_name : string, optional
        A string indicating the name of the dataset from which this split was generated.
        This is required in order to keep track of the evaluation results. Default is 'test'.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.
    split_alg : string, optional
        A string indicating the algorithm to use for generating the train/test splits. Options are
        `spanning_tree`, `random`, `naive`, `fast` and `timestamp`. Default is `spanning_tree`.
    split_id : int, optional
        The id to be assigned to the train/test splits generated. Default is 0.
    verbose : bool, optional
        If True print progress info. Default is False.

    Returns
    -------
    train_E : set
        The set of train positive edges.
    train_false_E : set
        The set of train negative edges.
    test_E : set
        The set of test positive edges.
    test_false_E : set
        The set of test negative edges.

    Raises
    ------
    ValueError
        If the edge split algorithm is unknown.
    """
    # Compute train/test split
    if split_alg == 'random':
        tr_E, te_E = stt.rand_split_train_test(G, train_frac)
        # Random split may drop nodes, so relabel to keep node ids sequential
        train_E, test_E, G, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())
    elif split_alg == 'naive':
        train_E, test_E = stt.naive_split_train_test(G, train_frac)
    elif split_alg == 'spanning_tree':
        train_E, test_E = stt.split_train_test(G, train_frac)
    elif split_alg == 'fast':
        train_E, test_E = stt.quick_split(G, train_frac)
    elif split_alg == 'timestamp':
        train_E, test_E, _ = stt.timestamp_split(G, train_frac)
    else:
        raise ValueError('Split alg. {} unknown!'.format(split_alg))

    # Make sure the edges are numpy arrays
    train_E = np.array(list(train_E))
    test_E = np.array(list(test_E))

    # Get the signed labels (+1/-1 edge weights) of train and test edges.
    # nx.adjacency_matrix replaces the deprecated nx.adj_matrix alias
    # (removed in NetworkX 3.0); both return the same sparse matrix.
    a = nx.adjacency_matrix(G)
    tr_labels = np.ravel(a[train_E[:, 0], train_E[:, 1]])
    te_labels = np.ravel(a[test_E[:, 0], test_E[:, 1]])

    # Split train and test edges in those with positive and negative signs
    pos_tr_e = train_E[np.where(tr_labels == 1)[0], :]
    neg_tr_e = train_E[np.where(tr_labels == -1)[0], :]
    pos_te_e = test_E[np.where(te_labels == 1)[0], :]
    neg_te_e = test_E[np.where(te_labels == -1)[0], :]

    # Make a train graph with appropriate weights +1 / -1
    H = G.copy()
    H.remove_edges_from(test_E)

    # Set class attributes to new values
    self.set_splits(train_E=pos_tr_e, train_E_false=neg_tr_e, test_E=pos_te_e, test_E_false=neg_te_e,
                    directed=G.is_directed(), nw_name=nw_name, TG=H, split_id=split_id,
                    split_alg=split_alg, verbose=verbose)

    return pos_tr_e, neg_tr_e, pos_te_e, neg_te_e