Example No. 1
    def compute_splits(self, G, nw_name='test', train_frac=0.51, split_alg='spanning_tree', owa=True, fe_ratio=1,
                       split_id=0, verbose=False):
        """
        Computes true and false train and test edge splits according to the given parameters.
        The sets of edges computed are both stored as properties of this object and returned from the method.

        Parameters
        ----------
        G : graph
            A NetworkX graph
        nw_name : string, optional
            A string indicating the name of the dataset from which this split was generated.
            This is required in order to keep track of the evaluation results. Default is 'test'.
        train_frac : float, optional
            The relative size (in (0.0, 1.0]) of the train set with respect to the total number of edges in the graph.
            Default is 0.51.
        split_alg : string, optional
            A string indicating the algorithm used to generate the train/test splits. Options are 'spanning_tree'
            (spanning tree based split), 'random' (random edge split), 'naive' (naive edge removal with
            connectedness check) and 'fast' (quick split). Default is 'spanning_tree'.
        owa : bool, optional
            Encodes whether the network is assumed to respect the open world assumption. Default is True.
            If owa=True, false train edges are sampled from the train graph only, so they may coincide with true
            test edges. If owa=False, the closed world assumption is made and false train edges are known to be
            false (i.e. not in G).
        fe_ratio : float, optional
            The ratio of false to true edges to generate. Default is 1, i.e. the same number as true edges.
        split_id : int, optional
            The id to be assigned to the train/test splits generated. Default is 0.
        verbose : bool, optional
            If True print progress info. Default is False.

        Returns
        -------
        train_E : set
            The set of train edges
        train_false_E : set
            The set of false train edges
        test_E : set
            The set of test edges
        test_false_E : set
            The set of false test edges

        Raises
        ------
        ValueError
            If the edge split algorithm is unknown.
        """
        # Compute train/test split
        if split_alg == 'random':
            train_E, test_E = stt.rand_split_train_test(G, train_frac)
            #train_E, test_E, G, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())
        elif split_alg == 'naive':
            train_E, test_E = stt.naive_split_train_test(G, train_frac)
        elif split_alg == 'spanning_tree':
            train_E, test_E = stt.split_train_test(G, train_frac)
        elif split_alg == 'fast':
            train_E, test_E = stt.quick_split(G, train_frac)
            train_E_false, test_E_false = stt.quick_nonedges(G, train_frac, fe_ratio)
        else:
            raise ValueError('Split alg. {} unknown!'.format(split_alg))

        # Compute false edges
        if split_alg != 'fast':
            num_fe_train = len(train_E) * fe_ratio
            num_fe_test = len(test_E) * fe_ratio
            if owa:
                train_E_false, test_E_false = stt.generate_false_edges_owa(G, train_E, test_E,
                                                                           num_fe_train, num_fe_test)
            else:
                train_E_false, test_E_false = stt.generate_false_edges_cwa(G, train_E, test_E,
                                                                           num_fe_train, num_fe_test)

        # Set edge sets to new values
        self.set_splits(train_E, train_E_false, test_E, test_E_false, directed=G.is_directed(), nw_name=nw_name,
                        split_id=split_id, split_alg=split_alg, owa=owa, verbose=verbose)

        return train_E, train_E_false, test_E, test_E_false
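A minimal usage sketch for the method above; the `EvalSplit` class name and its import path are assumptions of this note (they are not shown in the snippet), so adjust them to wherever compute_splits is actually defined:

import networkx as nx
from evalne.evaluation.split import EvalSplit  # assumed location of the class defining compute_splits

# Build a small connected toy graph and compute an 80/20 split plus false edges
G = nx.barabasi_albert_graph(200, 3, seed=0)
split = EvalSplit()
train_E, train_E_false, test_E, test_E_false = split.compute_splits(
    G, nw_name='toy', train_frac=0.8, split_alg='spanning_tree', owa=True, fe_ratio=1)
print(len(train_E), len(train_E_false), len(test_E), len(test_E_false))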
Example No. 2
# Imports assumed for this example: pp and stt refer to EvalNE's preprocess and
# split_train_test utility modules.
import time

from evalne.utils import preprocess as pp
from evalne.utils import split_train_test as stt


def test_stt():
    # Variables
    dataset_path = "./data/"
    test_name = "network.edgelist"
    frac = 0.5

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=False)

    # Preprocess the graph for stt alg.
    SG, ids = pp.prep_graph(G, relabel=True, del_self_loops=True, maincc=True)

    # Split train/test using stt
    start = time.time()
    train_E, test_E = stt.split_train_test(SG, train_frac=frac)
    end1 = time.time() - start
    print("Exec time stt: {}".format(end1))

    # Compute the false edges
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        SG,
        train_E=train_E,
        test_E=test_E,
        num_fe_train=None,
        num_fe_test=None)
    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "stt_frac_" + str(frac),
                                    train_E=train_E,
                                    train_E_false=train_E_false,
                                    test_E=test_E,
                                    test_E_false=test_E_false,
                                    split_id=0)

    # Split train/test using rstt
    start = time.time()
    tr_E, te_E = stt.rand_split_train_test(G, train_frac=frac)
    end2 = time.time() - start
    print("Exec time rstt: {}".format(end2))

    train_E, test_E, J, mp = pp.relabel_nodes(tr_E, te_E, G.is_directed())

    print("Number of nodes in G: {}".format(len(G.nodes())))
    print("Number of nodes in J: {}".format(len(J.nodes())))
    print("Are nodes in J sequential integers? {}".format(
        not len(set(J.nodes()) - set(range(len(J.nodes()))))))

    checks = list()
    queries = 200
    # Check if the mapping is correct
    for i in range(queries):
        ag = tr_E.pop()  # pop an arbitrary element from the train edge set
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in train_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))

    print("Out of {} sampled train edges, {} were found in the relabeled train_E".format(
        queries, sum(checks)))

    checks = list()
    # Check if the mapping is correct
    for i in range(queries):
        ag = te_E.pop()  # pop an arbitrary element from the test edge set
        aj = (mp[ag[0]], mp[ag[1]])  # check what it maps to in J
        checks.append(aj in test_E)
        # print("Random tuple from G: {}".format(ag))
        # print("The tuple maps in J to: {}".format(aj))
        # print("Is that tuple in the new train?: {}".format(aj in train_E))

    print("For test edges out of {} samples, {} were in the relabeled test_E".
          format(queries, sum(checks)))

    # Compute the false edges
    train_E_false, test_E_false = stt.generate_false_edges_owa(
        J, train_E=train_E, test_E=test_E, num_fe_train=None, num_fe_test=None)
    # Store data to file
    _ = stt.store_train_test_splits(dataset_path + "rstt_frac_" + str(frac),
                                    train_E=train_E,
                                    train_E_false=train_E_false,
                                    test_E=test_E,
                                    test_E_false=test_E_false,
                                    split_id=0)
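
    # Added sanity check (a sketch, not part of the original test): under the open world
    # assumption, train non-edges must not collide with train edges, but they are allowed
    # to coincide with test edges.
    print("Train non-edges overlapping train edges: {}".format(
        len(set(train_E_false) & set(train_E))))
    print("Train non-edges overlapping test edges (allowed under OWA): {}".format(
        len(set(train_E_false) & set(test_E))))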
Example No. 3
# Imports assumed for this example: pp and stt refer to EvalNE's preprocess and
# split_train_test utility modules.
import random
import time

import networkx as nx

from evalne.utils import preprocess as pp
from evalne.utils import split_train_test as stt


def test_split():
    # Variables
    dataset_path = "./data/"
    output_path = "./data/"
    test_name = "network.edgelist"
    subgraph_size = 400
    train_frac = 0.5
    directed = True

    # Load a graph
    G = pp.load_graph(dataset_path + test_name,
                      delimiter=",",
                      comments='#',
                      directed=directed)

    # Restrict graph to a sub-graph of 'subgraph_size' nodes
    SG = G.subgraph(random.sample(list(G.nodes), subgraph_size)).copy()

    # Preprocess the graph
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=True)

    # Save the preprocessed graph
    pp.save_graph(PSG, output_path + "prep_graph.edgelist", delimiter=",")

    # Compute train/test splits
    start = time.time()
    train_stt, test_stt = stt.split_train_test(PSG, train_frac=train_frac)
    end = time.time() - start
    print("Exec time stt: {}".format(end))

    # Check that the train graph generated with stt has one single cc
    if directed:
        TG_stt = nx.DiGraph()
        TG_stt.add_edges_from(train_stt)
        print("Number of weakly CCs with stt: {}".format(
            nx.number_weakly_connected_components(TG_stt)))
    else:
        TG_stt = nx.Graph()
        TG_stt.add_edges_from(train_stt)
        print("Number of CCs with stt: {}".format(
            nx.number_connected_components(TG_stt)))
    print("Number train edges stt: {}".format(len(train_stt)))
    print("Number test edges stt: {}".format(len(test_stt)))
    print("Number of nodes in train graph: {}".format(len(TG_stt.nodes)))

    # Preprocess the graph
    PSG, ids = pp.prep_graph(SG,
                             relabel=True,
                             del_self_loops=True,
                             maincc=False)

    # Compute train/test splits
    start = time.time()
    train_rstt, test_rstt = stt.rand_split_train_test(PSG,
                                                      train_frac=train_frac)
    end = time.time() - start
    print("\nExec time rand_stt: {}".format(end))

    # Check that the train graph generated with rstt has one single cc
    if directed:
        TG_rstt = nx.DiGraph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of weakly CCs with rstt: {}".format(
            nx.number_weakly_connected_components(TG_rstt)))
    else:
        TG_rstt = nx.Graph()
        TG_rstt.add_edges_from(train_rstt)
        print("Number of CCs with rstt: {}".format(
            nx.number_connected_components(TG_rstt)))
    print("Number train edges rstt: {}".format(len(train_rstt)))
    print("Number test edges rstt: {}".format(len(test_rstt)))
    print("Number of nodes in train graph: {}".format(len(TG_rstt.nodes)))
Example No. 4
    def compute_splits(self,
                       G,
                       nw_name='test',
                       train_frac=0.51,
                       split_alg='spanning_tree',
                       owa=True,
                       fe_ratio=1,
                       split_id=0,
                       verbose=False):
        """
        Computes sets of train and test edges and non-edges according to the given input parameters and initializes
        the class attributes.

        Parameters
        ----------
        G : graph
            A NetworkX graph or digraph to compute the train test split from.
        nw_name : string, optional
            A string indicating the name of the dataset from which this split was generated.
            This is required in order to keep track of the evaluation results. Default is 'test'.
        train_frac : float, optional
            The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
            Default is 0.51.
        split_alg : string, optional
            A string indicating the algorithm to use for generating the train/test splits. Options are `spanning_tree`,
            `random`, `naive`, `fast` and `timestamp`. Default is `spanning_tree`.
        owa : bool, optional
            Encodes whether the network is assumed to respect the open world assumption. Default is True.
            If owa=True, train non-edges are sampled from the train graph only and can overlap with test edges.
            If owa=False, train non-edges are sampled from the full graph and cannot overlap with test edges.
        fe_ratio : float, optional
            The ratio of non-edges to edges to sample. For fe_ratio > 0 and < 1, fewer non-edges than edges will be
            generated. For fe_ratio > 1, more non-edges than edges will be generated. Default is 1, i.e. the same amounts.
        split_id : int, optional
            The id to be assigned to the train/test splits generated. Default is 0.
        verbose : bool, optional
            If True print progress info. Default is False.

        Returns
        -------
        train_E : set
            The set of train edges
        train_false_E : set
            The set of train non-edges
        test_E : set
            The set of test edges
        test_false_E : set
            The set of test non-edges

        Raises
        ------
        ValueError
            If the edge split algorithm is unknown.
        """
        # Compute train/test split
        if split_alg == 'random':
            tr_E, te_E = stt.rand_split_train_test(G, train_frac)
            train_E, test_E, G, mp = pp.relabel_nodes(tr_E, te_E,
                                                      G.is_directed())
        elif split_alg == 'naive':
            train_E, test_E = stt.naive_split_train_test(G, train_frac)
        elif split_alg == 'spanning_tree':
            train_E, test_E = stt.split_train_test(G, train_frac)
        elif split_alg == 'fast':
            train_E, test_E = stt.quick_split(G, train_frac)
            train_E_false, test_E_false = stt.quick_nonedges(
                G, train_frac, fe_ratio)
        elif split_alg == 'timestamp':
            train_E, test_E, G = stt.timestamp_split(G, train_frac)
            train_E = set(zip(train_E[:, 0], train_E[:, 1]))
            test_E = set(zip(test_E[:, 0], test_E[:, 1]))
        else:
            raise ValueError('Split alg. {} unknown!'.format(split_alg))

        # Compute non-edges
        if split_alg != 'fast':
            num_fe_train = len(train_E) * fe_ratio
            num_fe_test = len(test_E) * fe_ratio
            if owa:
                train_E_false, test_E_false = stt.generate_false_edges_owa(
                    G, train_E, test_E, num_fe_train, num_fe_test)
            else:
                train_E_false, test_E_false = stt.generate_false_edges_cwa(
                    G, train_E, test_E, num_fe_train, num_fe_test)

        # Set class attributes to new values
        self.set_splits(train_E,
                        train_E_false,
                        test_E,
                        test_E_false,
                        directed=G.is_directed(),
                        nw_name=nw_name,
                        split_id=split_id,
                        split_alg=split_alg,
                        owa=owa,
                        verbose=verbose)

        return train_E, train_E_false, test_E, test_E_false
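The fe_ratio parameter documented above simply scales the number of non-edges sampled per set by the corresponding number of edges; a small worked example (plain arithmetic, with int() added here for illustration only):

fe_ratio = 0.5
num_train_edges, num_test_edges = 1000, 500
num_fe_train = int(num_train_edges * fe_ratio)  # 500 train non-edges
num_fe_test = int(num_test_edges * fe_ratio)    # 250 test non-edges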
Example No. 5
    def compute_splits(self,
                       G,
                       nw_name='test',
                       train_frac=0.51,
                       split_alg='spanning_tree',
                       split_id=0,
                       verbose=False):
        """
        Computes sets of train and test positive and negative edges according to the given input parameters and
        initializes the class attributes.

        Parameters
        ----------
        G : graph
            A NetworkX graph or digraph to compute the train test split from.
        nw_name : string, optional
            A string indicating the name of the dataset from which this split was generated.
            This is required in order to keep track of the evaluation results. Default is 'test'.
        train_frac : float, optional
            The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
            Default is 0.51.
        split_alg : string, optional
            A string indicating the algorithm to use for generating the train/test splits. Options are `spanning_tree`,
            `random`, `naive`, `fast` and `timestamp`. Default is `spanning_tree`.
        split_id : int, optional
            The id to be assigned to the train/test splits generated. Default is 0.
        verbose : bool, optional
            If True print progress info. Default is False.

        Returns
        -------
        train_E : set
            The set of train positive edges.
        train_false_E : set
            The set of train negative edges.
        test_E : set
            The set of test positive edges.
        test_false_E : set
            The set of test negative edges.

        Raises
        ------
        ValueError
            If the edge split algorithm is unknown.
        """
        # Compute train/test split
        if split_alg == 'random':
            tr_E, te_E = stt.rand_split_train_test(G, train_frac)
            train_E, test_E, G, mp = pp.relabel_nodes(tr_E, te_E,
                                                      G.is_directed())
        elif split_alg == 'naive':
            train_E, test_E = stt.naive_split_train_test(G, train_frac)
        elif split_alg == 'spanning_tree':
            train_E, test_E = stt.split_train_test(G, train_frac)
        elif split_alg == 'fast':
            train_E, test_E = stt.quick_split(G, train_frac)
        elif split_alg == 'timestamp':
            train_E, test_E, _ = stt.timestamp_split(G, train_frac)
        else:
            raise ValueError('Split alg. {} unknown!'.format(split_alg))

        # Make sure the edges are numpy arrays
        train_E = np.array(list(train_E))
        test_E = np.array(list(test_E))

        # Get the labels of train and test
        a = nx.adjacency_matrix(G)
        tr_labels = np.ravel(a[train_E[:, 0], train_E[:, 1]])
        te_labels = np.ravel(a[test_E[:, 0], test_E[:, 1]])

        # Split train and test edges in those with positive and negative signs
        pos_tr_e = train_E[np.where(tr_labels == 1)[0], :]
        neg_tr_e = train_E[np.where(tr_labels == -1)[0], :]
        pos_te_e = test_E[np.where(te_labels == 1)[0], :]
        neg_te_e = test_E[np.where(te_labels == -1)[0], :]

        # Make a train graph by removing the test edges (the +1 / -1 weights are inherited from G)
        H = G.copy()
        H.remove_edges_from(test_E)

        # Set class attributes to new values
        self.set_splits(train_E=pos_tr_e,
                        train_E_false=neg_tr_e,
                        test_E=pos_te_e,
                        test_E_false=neg_te_e,
                        directed=G.is_directed(),
                        nw_name=nw_name,
                        TG=H,
                        split_id=split_id,
                        split_alg=split_alg,
                        verbose=verbose)

        return pos_tr_e, neg_tr_e, pos_te_e, neg_te_e
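The label extraction above relies on the adjacency matrix of a signed graph storing +1 / -1 edge weights. A minimal toy graph illustrating that lookup (a sketch, assuming the signs are stored under the standard 'weight' attribute):

import networkx as nx

G = nx.Graph()
G.add_edge(0, 1, weight=1)    # positive edge
G.add_edge(1, 2, weight=-1)   # negative edge
a = nx.adjacency_matrix(G)    # sparse matrix holding the signed weights
print(a[0, 1], a[1, 2])       # prints 1 -1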
Example No. 6
                            train_frac=0.51, num_fe_train=None, num_fe_test=None, num_splits=5)

# The overlap between the 5 generated sets can be easily checked
print("Overlap check for train sets: ")
stt.check_overlap(filename=os.path.join(traintest_path, "network_prep_51", "trE"), num_sets=5)
print("Overlap check for test sets: ")
stt.check_overlap(filename=os.path.join(traintest_path, "network_prep_51", "teE"), num_sets=5)

# The same computations can be performed for the sets of non-edges
# print "Overlap check for negative train sets: "
# stt.check_overlap(filename=output_path + "lp_train_test_splits/network_prep_51_negTrE", num_sets=5)
# print "Overlap check for negative test sets: "
# stt.check_overlap(filename=output_path + "lp_train_test_splits/network_prep_51_negTeE", num_sets=5)

# Alternatively, train/test splits can be computed one at a time
train_E, test_E = stt.split_train_test(SG, train_frac=0.50)

# Compute set of false edges
# train_E_false, test_E_false = stt.generate_false_edges_owa(SG, train_E=train_E, test_E=test_E, num_fe_train=None,
#                                                            num_fe_test=None)
train_E_false, test_E_false = stt.generate_false_edges_cwa(SG, train_E=train_E, test_E=test_E, num_fe_train=None,
                                                           num_fe_test=None)

# Store the computed edge sets to a file
filenames = stt.store_train_test_splits(os.path.join(output_path, "lp_train_test_splits", "network_prep_51"),
                                        train_E=train_E, train_E_false=train_E_false, test_E=test_E,
                                        test_E_false=test_E_false, split_id=0)
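
# Added note (a sketch, not part of the original example): store_train_test_splits returns the
# paths of the files it wrote, which can be inspected or passed on to an evaluation routine.
print("Stored split files: {}".format(filenames))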

# -------------------------------------------
# Link prediction (LP) using baseline methods
# -------------------------------------------