Code example #1
import numpy as np
import graph_tool as gt

# Note: shortest_path_visiting_most_nodes is defined elsewhere in the same
# project.
def shortest_path_cover_logn_apx(g: gt.Graph, weight: gt.EdgePropertyMap):
    started_with_directed = g.is_directed()
    if not started_with_directed:
        reversed_edges = np.fliplr(g.get_edges())
        g.set_directed(True)
        g.add_edge_list(reversed_edges)
        weight.a[-reversed_edges.shape[0]:] = weight.a[:reversed_edges.shape[0]]

    if weight.value_type() not in [
            "bool", "int", "int16_t", "int32_t", "int64_t"
    ]:
        # eps could instead be the gap between the smallest and second-smallest
        # weight; 1 is sufficient here.
        eps = 1
        scaled_weight = (np.ceil(weight.a / eps) *
                         (g.num_vertices() + 1)).astype(np.int64)  # ints >= 1
    else:
        scaled_weight = weight.a * (g.num_vertices() + 1)

    summed_edge_weight = np.sum(scaled_weight)

    adjusted_weight = g.new_edge_property("long", vals=scaled_weight - 1)

    paths = []

    covered_vertices = set()

    while len(covered_vertices) != g.num_vertices():
        curr_paths = shortest_path_visiting_most_nodes(g, adjusted_weight,
                                                       covered_vertices,
                                                       summed_edge_weight)

        for path in curr_paths:
            paths.append(path)

            # TODO: if len(path) <= 2, switch to a fast mode and just add
            # single edges/vertices until done.
            path_vertices = set(path)
            for v in path_vertices.difference(covered_vertices):
                for w in g.get_in_neighbors(v):
                    adjusted_weight[g.edge(w, v)] += 1
                    # Sanity check: once v is covered, the weight of every
                    # in-edge of v must again be a multiple of
                    # num_vertices() + 1.
                    if adjusted_weight[g.edge(w, v)] % (g.num_vertices() + 1) != 0:
                        exit(5)

            new_covered = path_vertices.difference(covered_vertices)
            covered_vertices = covered_vertices.union(path_vertices)
            print(len(new_covered), len(path), len(covered_vertices), path)
    if not started_with_directed:
        g.set_directed(False)
        for e in reversed_edges:
            g.remove_edge(g.edge(e[0], e[1]))
    return paths
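
Usage sketch: a minimal, hypothetical invocation of
shortest_path_cover_logn_apx on a toy graph. The 4-vertex path graph and the
unit weights below are illustrative assumptions, not part of the project, and
shortest_path_visiting_most_nodes from the same project must be importable
for the call to run.

# Illustrative toy input, not from the project.
import graph_tool as gt
import numpy as np

g = gt.Graph(directed=False)
g.add_vertex(4)
g.add_edge_list([(0, 1), (1, 2), (2, 3)])
weight = g.new_edge_property("int", vals=np.ones(3, dtype=np.int64))

paths = shortest_path_cover_logn_apx(g, weight)
print(paths)  # a small set of paths that together cover every vertex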
Code example #2
File: cumulative.py  Project: nestauk/rhodonite
import logging
from collections import Counter
from itertools import chain, combinations

from graph_tool import Graph

# Note: dict_to_vertex_prop, dict_to_edge_prop and cooccurrence_counts are
# helpers defined elsewhere in the rhodonite package.
def cumulative_cooccurrence_graph(steps, sequences, directed=False):
    '''cumulative_cooccurrence_graph
    Creates a cumulative cooccurrence graph.

    Parameters
    ----------
    steps : :obj:`iter` of :obj:`int` or :obj:`str` 
        A series that contains sequential labels for the nested groups.
    sequences : :obj:`iter` of :obj:`iter` of :obj:`int` 
        Nested iterable of integers representing vertices in the graph. Number 
        of nested iterables should be equal to `len(steps)`.
    directed : :obj:`bool` 
        Currently has no effect. In future this will determine whether to build 
        a bi-directional cooccurrence graph.

    Returns
    -------
    g : :obj:`graph_tool.Graph`
        A graph. Vertices are elements; edges link elements that have
        cooccurred at least once in the series.
    o_props : :obj:`dict` 
        Property maps with vertex occurrence values at each step.
    o_cumsum_props : :obj:`dict`
        Property maps with cumulative vertex occurrence values at each step.
    co_props : :obj:`dict`
        Property maps with edge cooccurrence values at each step.
    co_cumsum_props : :obj:`dict`
        Property maps with cumulative edge cooccurrence values at each step.
    '''

    g = Graph(directed=directed)

    o_total = Counter(chain(*chain(*sequences)))
    n_vertices = len(o_total)
    g.add_vertex(n_vertices)
    o_max = dict_to_vertex_prop(g, o_total, 'int')

    co_total = cooccurrence_counts(chain(*sequences))
    edge_list = ((c[0], c[1], count) for c, count in co_total.items())
    co_max = g.new_edge_property('int')
    g.add_edge_list(edge_list, eprops=[co_max])

    # fetch the edge index as a third column (recent graph-tool versions
    # return only [source, target] by default)
    edges = g.get_edges(eprops=[g.edge_index])
    edge_indices = {(e[0], e[1]): e[2] for e in edges}

    o_props = {}
    co_props = {}
    o_cumsum_props = {}
    co_cumsum_props = {}
    for i, (step, seq) in enumerate(zip(steps[:-1], sequences[:-1])):
        logging.info(f'Calculating cooccurrences at step {step}')
        o_step = Counter(chain(*seq))
        o_props[step] = dict_to_vertex_prop(g, o_step, 'int')

        combos = (combinations(sorted(ids), 2) for ids in seq)
        co_step = Counter(chain(*combos))
        co_props[step] = dict_to_edge_prop(g, co_step, 'int', edge_indices)

        o_cumsum = g.new_vertex_property('int')
        co_cumsum = g.new_edge_property('int')
        if i == 0:
            o_cumsum.a = o_cumsum.a + o_props[step].a
            co_cumsum.a = co_cumsum.a + co_props[step].a
        else:
            o_cumsum.a = o_cumsum_props[steps[i - 1]].a + o_props[step].a
            co_cumsum.a = co_cumsum_props[steps[i - 1]].a + co_props[step].a
        o_cumsum_props[step] = o_cumsum
        co_cumsum_props[step] = co_cumsum
    # fill in the last step without needing to count occurrences
    # or cooccurrences
    step_max = steps[-1]
    o = g.new_vertex_property('int')
    co = g.new_edge_property('int')
    o.a = o_max.a - o_cumsum.a
    co.a = co_max.a - co_cumsum.a
    o_props[step_max] = o
    co_props[step_max] = co

    o_cumsum_props[step_max] = o_max
    co_cumsum_props[step_max] = co_max

    steps_prop = g.new_graph_property('vector<int>')
    steps_prop.set_value(steps)
    g.gp['steps'] = steps_prop

    return g, o_props, o_cumsum_props, co_props, co_cumsum_props
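
Usage sketch: a minimal, hypothetical call to cumulative_cooccurrence_graph.
The toy steps and sequences below are illustrative assumptions; vertex ids
must be contiguous integers starting at 0, and the rhodonite helpers the
function relies on must be importable.

# Illustrative toy input: two steps, each holding a list of "documents",
# each document a list of integer vertex ids.
steps = [2018, 2019]
sequences = [
    [[0, 1, 2], [1, 2]],   # groups observed at step 2018
    [[0, 2], [0, 1, 3]],   # groups observed at step 2019
]

g, o_props, o_cumsum_props, co_props, co_cumsum_props = \
    cumulative_cooccurrence_graph(steps, sequences)
print(g.num_vertices(), g.num_edges())
print(o_cumsum_props[2019].a)  # cumulative occurrence counts per vertex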
Code example #3
from math import floor
from typing import Dict, Iterator

import numpy as np
from graph_tool import Graph
from tqdm import tqdm

# Note: GraphEmbedder, ManifoldNNS, GraphSamplingConfig, GraphDataBatch,
# GraphDataBatchIterator, comb_index, load_or_gen, get_config and
# save_graph_data are defined elsewhere in the same project.
class GraphDataset:
    """
    Class for managing datasets with graph data
    """
    def __init__(self, name, edges, object_ids, weights, hidden_graph=None):
        """
        Params:
            name (str): unique string to name this dataset (for pickling and
                unpickling)
            edges (numpy.ndarray): numpy array of shape [num_edges, 2]
                containing the indices of nodes in all edges
            object_ids (List[str]): string object ids for all nodes
            weights (numpy.ndarray): numpy array of shape [num_edges]
                containing edge weights
            hidden_graph (GraphDataset): Graph data that should be excluded
                but not considered as negative edges. (i.e. train
                edges should not be in eval dataset but they shouldn't be
                counted as negatives either)
        """

        self.name = name
        self.edges = edges
        self.object_ids = np.asarray(object_ids)
        self.weights = weights
        self.hidden_graph = hidden_graph

        self.graph = Graph(directed=False)
        self.graph.add_vertex(len(object_ids))
        edge_weights = [[edge[0], edge[1], weight]
                        for edge, weight in zip(self.edges, self.weights)]
        self.weight_property = self.graph.new_edge_property("float")
        eprops = [self.weight_property]
        self.graph.add_edge_list(edge_weights, eprops=eprops)
        self.manifold_nns = None

    def gen_neighbor_data(self, verbose=True) -> Dict:
        """
        Generates the graph data needed to run the cython iterator.
        Returns a dict with the neighbor data, which has the following keys:

        - 'non_empty_vertices' the indices of vertices which have edges
           emanating from them
        - 'all_graph_neighbors' a list of lists of ints such that the list of
          edges emanating from the vertex with index non_empty_vertices[i] is
          stored in all_graph_neighbors[i]
        - 'all_graph_weights' a list of lists of floats such that
          all_graph_weights[i][j] represents the weight of the connection in
          all_graph_neighbors[i][j]
        - 'empty_vertices' the indices of vertices with no incident edges
        - 'N' number of nodes in the graph

        Parameters:
            verbose (bool): should graph loading be printed out
        """

        all_graph_neighbors = []
        all_graph_weights = []
        non_empty_vertices = []
        empty_vertices = []
        if verbose:
            iterator = tqdm(range(self.n_nodes()),
                            desc="Generating Neighbor Data",
                            dynamic_ncols=True)
        else:
            iterator = range(self.n_nodes())

        for i in iterator:
            in_edges = self.graph.get_in_edges(i, [self.weight_property])
            out_edges = self.graph.get_out_edges(i, [self.weight_property])
            if in_edges.size + out_edges.size > 0:
                non_empty_vertices.append(i)
                if in_edges.size == 0:
                    all_graph_neighbors.append(out_edges[:, 1].astype(np.int64))
                    all_graph_weights.append(out_edges[:, 2].astype(np.float32))
                elif out_edges.size == 0:
                    all_graph_neighbors.append(in_edges[:, 1].astype(np.int64))
                    all_graph_weights.append(in_edges[:, 2].astype(np.float32))
                else:
                    all_graph_neighbors.append(
                        np.concatenate([in_edges[:, 0],
                                        out_edges[:, 1]]).astype(np.int64))
                    all_graph_weights.append(
                        np.concatenate([in_edges[:, 2],
                                        out_edges[:, 2]]).astype(np.float32))
            else:
                empty_vertices.append(i)

        non_empty_vertices = np.array(non_empty_vertices, dtype=np.int64)
        empty_vertices = np.array(empty_vertices, dtype=np.int64)

        return {
            "all_graph_neighbors": all_graph_neighbors,
            "all_graph_weights": all_graph_weights,
            "non_empty_vertices": non_empty_vertices,
            "empty_vertices": empty_vertices,
            "N": self.n_nodes()
        }

    def add_manifold_nns(self, graph_embedder: GraphEmbedder):
        manifold = graph_embedder.get_manifold()
        data_points = graph_embedder.retrieve_nodes(self.n_nodes())
        self.manifold_nns = ManifoldNNS(data_points, manifold)

    def n_nodes(self) -> int:
        """
        Returns the number of nodes in the graph
        """
        return len(self.object_ids)

    def collapse_nodes(self, node_ids):
        all_new_edges = []
        for node_id in tqdm(node_ids,
                            desc="Collapsing Nodes",
                            dynamic_ncols=True):
            in_edges = self.graph.get_in_edges(node_id, [self.weight_property])
            out_edges = self.graph.get_out_edges(node_id,
                                                 [self.weight_property])
            neighbors = np.concatenate([out_edges[:, 1:3], in_edges[:, 0:3:2]])
            if neighbors.shape[0] > 1:
                neighbor_combos = neighbors[comb_index(neighbors.shape[0], 2)]
                neighbor_combos = neighbor_combos.reshape(
                    neighbor_combos.shape[0], 4)
                new_edges = np.zeros((neighbor_combos.shape[0], 3))
                new_edges[:, :2] += neighbor_combos[:, 0:3:2]
                # combined weight for the new edge: (w_in + w_out) / 4
                new_edges[:, 2] += (neighbor_combos[:, 1] +
                                    neighbor_combos[:, 3]) / 4
                all_new_edges.append(new_edges)

        self.graph.add_edge_list(np.concatenate(all_new_edges),
                                 eprops=[self.weight_property])

        self.object_ids = np.delete(self.object_ids, np.array(node_ids))
        self.graph.remove_vertex(node_ids)

        edges_weights = self.graph.get_edges(eprops=[self.weight_property])
        edges = edges_weights[:, 0:2]
        weights = edges_weights[:, 2]
        self.edges = edges
        self.weights = weights

    def get_neighbor_iterator(
        self,
        graph_sampling_config: GraphSamplingConfig,
        data_fraction: float = 1,
    ) -> Iterator[GraphDataBatch]:
        """
        Gets an efficient iterator of edge batches
        """
        neighbor_data = load_or_gen(f"GraphDataset.{self.name}",
                                    self.gen_neighbor_data)
        if self.hidden_graph is None:
            # GraphDataBatchIterator is defined in cython with these arguments.
            # noinspection PyArgumentList
            iterator = GraphDataBatchIterator(neighbor_data,
                                              graph_sampling_config)
            iterator.data_fraction = data_fraction

        else:
            hidden_neighbor_data = load_or_gen(
                f"GraphDataset.{self.hidden_graph.name}",
                self.hidden_graph.gen_neighbor_data)

            # GraphDataBatchIterator is defined in cython with these arguments.
            # noinspection PyArgumentList
            iterator = GraphDataBatchIterator(neighbor_data,
                                              graph_sampling_config,
                                              hidden_neighbor_data)
            iterator.data_fraction = data_fraction

        if self.manifold_nns is not None:
            sampling_config = get_config().sampling
            _, nns = self.manifold_nns.knn_query_all(
                sampling_config.manifold_nn_k)

            all_manifold_neighbors = [
                nns[i][1:].astype(np.int64) for i in range(self.n_nodes())
            ]
            iterator.refresh_manifold_nn(all_manifold_neighbors)

        return iterator

    @classmethod
    def make_train_eval_split(cls, name, edges, object_ids, weights):
        """
        Returns a tuple of a train eval split of the graph as defined in the
        data config.
        """

        data_config = get_config().data
        np.random.seed(data_config.split_seed)
        if data_config.split_by_edges:
            # TODO Doesn't save to file in this mode
            shuffle_order = np.arange(edges.shape[0])
            np.random.shuffle(shuffle_order)
            num_eval = floor(edges.shape[0] * data_config.split_size)
            eval_indices = shuffle_order[:num_eval]
            train_indices = shuffle_order[num_eval:]
            train_edges = edges[train_indices]
            train_weights = weights[train_indices]
            eval_edges = edges[eval_indices]
            eval_weights = weights[eval_indices]
        else:
            shuffle_order = np.arange(len(object_ids))
            np.random.shuffle(shuffle_order)
            num_eval = floor(len(object_ids) * data_config.split_size)
            eval_indices = shuffle_order[:num_eval]

            test_set = data_config.generate_test_set
            if test_set:
                test_indices = shuffle_order[num_eval:2 * num_eval]
            train_indices = (shuffle_order[2 * num_eval:] if test_set
                             else shuffle_order[num_eval:])

            train_edges = []
            eval_edges = []
            train_weights = []
            eval_weights = []
            if test_set:
                test_edges = []
                test_weights = []

            for edge, weight in zip(edges, weights):
                if test_set and (edge[0] in test_indices
                                 or edge[1] in test_indices):
                    test_edges.append(edge)
                    test_weights.append(weight)
                elif edge[0] in eval_indices or edge[1] in eval_indices:
                    eval_edges.append(edge)
                    eval_weights.append(weight)
                else:
                    train_edges.append(edge)
                    train_weights.append(weight)

            if test_set:
                save_graph_data(test_edges, test_weights, object_ids,
                                data_config.test_path)
            save_graph_data(train_edges, train_weights, object_ids,
                            data_config.train_path)
            save_graph_data(eval_edges, eval_weights, object_ids,
                            data_config.eval_path)

            train_edges = np.array(train_edges)
            eval_edges = np.array(eval_edges)
            train_weights = np.array(train_weights)
            eval_weights = np.array(eval_weights)

        train_data = GraphDataset(f"{name}_train", train_edges, object_ids,
                                  train_weights)

        eval_data = GraphDataset(f"{name}_eval",
                                 eval_edges,
                                 object_ids,
                                 eval_weights,
                                 hidden_graph=train_data)

        return train_data, eval_data
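
Usage sketch: a hypothetical train/eval split over a toy graph. The edge
data below is an illustrative assumption, and get_config() is assumed to
already provide the split settings (split_seed, split_size, train/eval
paths, ...) that make_train_eval_split reads.

# Illustrative toy input, not from the project.
import numpy as np

edges = np.array([[0, 1], [1, 2], [2, 3], [0, 3]])
weights = np.ones(len(edges), dtype=np.float32)
object_ids = ["a", "b", "c", "d"]

train_data, eval_data = GraphDataset.make_train_eval_split(
    "toy", edges, object_ids, weights)
print(train_data.n_nodes(), eval_data.n_nodes())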