def parse_input(self, X):
        """Parse and create features for the `subgraph_matching` kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            holding exactly three items: a valid graph structure (adjacency
            matrix or edge dictionary), the node labels and the edge labels
            (that correspond to the given graph format). Empty elements are
            skipped with a warning.

        Returns
        -------
        out : list
            One tuple ``(n, Er, L, Le)`` per parsed graph, where ``n`` is the
            value of ``g.nv()``, ``Er`` is the set of ordered vertex pairs
            ``(a, b)`` with ``a != b`` found in the edge dictionary, and
            ``L`` / ``Le`` are the node / edge labels requested in dictionary
            form (``return_none`` is set when ``self.kv`` / ``self.ke`` is
            None).

        Raises
        ------
        TypeError
            If `X` is not iterable, or if an element is neither a ``Graph``
            nor an iterable of length 0 or 3.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives
        # in ``collections.abc`` on every Python 3 release.
        try:
            from collections.abc import Iterable
        except ImportError:  # Python 2
            from collections import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        out = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                # Materialise once so that len() and indexing are available.
                x = list(x)

            if is_iter and len(x) == 0:
                warnings.warn('Ignoring empty element' +
                              ' on index: ' + str(idx))
                continue
            elif is_iter and len(x) == 3:
                g = Graph(x[0], x[1], x[2], "adjacency")
                g.change_format(self._graph_format)
            elif type(x) is Graph:
                # Rebuild the graph instead of reusing the caller's object.
                g = Graph(
                    x.get_adjacency_matrix(),
                    x.get_labels(purpose="adjacency"),
                    x.get_labels(purpose="adjacency", label_type="edge"),
                    self._graph_format)
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node, ' +
                                'edge labels dict \n')

            n = g.nv()
            E = g.get_edge_dictionary()
            L = g.get_labels(purpose="dictionary",
                             return_none=(self.kv is None))
            Le = g.get_labels(purpose="dictionary",
                              label_type="edge",
                              return_none=(self.ke is None))
            # Directed edge pairs, self-loops excluded.
            Er = set(
                (a, b) for a in E.keys() for b in E[a].keys() if a != b)

            out.append((n, Er, L, Le))

        if not out:
            raise ValueError('parsed input is empty')
        return out
Beispiel #2
0
    def parse_input(self, X):
        """Parse and check the given input for the Graph Hopper kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at least two items: a valid graph structure (adjacency
            matrix or edge dictionary) followed by the node labels /
            attributes. Any further items (e.g. edge labels) are ignored.
            Empty elements are skipped with a warning.

        Returns
        -------
        out : list
            One tuple per parsed graph: ``(M, attributes)``, or
            ``(M, attributes, norms)`` when ``self.calculate_norm_`` is
            truthy. ``M`` is an array of shape
            ``(n_vertices, max_diam, max_diam)``, ``attributes`` is the array
            built from the vertex attributes in vertex-index order and
            ``norms`` is the row-wise sum of squared attributes.

        Raises
        ------
        TypeError
            If `X` is not iterable, if an element is neither a ``Graph`` nor
            an iterable of length 0 or >= 2, or if the attributes of a graph
            cannot be stacked into one array.
        ValueError
            If no graph could be parsed from `X`.

        Notes
        -----
        When ``self._method_calling == 1`` the attribute ``self._max_diam``
        is overwritten with the largest finite shortest-path length found
        in `X` plus one; otherwise the stored value is only read.

        """
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        diam = list()
        graphs = list()
        for (i, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)

            # The original test ``is_iter and len(x) == 0 or len(x) >= 2``
            # evaluated ``len(x)`` on non-iterable elements because ``and``
            # binds tighter than ``or``; every length test is now guarded.
            if is_iter and len(x) == 0:
                warn('Ignoring empty element on index: ' + str(i))
                continue
            elif is_iter and len(x) >= 2:
                g = Graph(x[0], x[1], {}, "adjacency")
                g.change_format(self._graph_format)
            elif type(x) is Graph:
                g = Graph(x.get_adjacency_matrix(),
                          x.get_labels(purpose="adjacency"), {},
                          self._graph_format)
            else:
                raise TypeError('each element of X must be either a '
                                'graph object or a list with at least '
                                'a graph like object and node, ')

            spm, attr = g.build_shortest_path_matrix(labels="vertex")
            nv = g.nv()
            try:
                attributes = np.array([attr[j] for j in range(nv)])
            except TypeError:
                raise TypeError(
                    'All attributes of a single graph should have the same dimension.'
                )
            # Largest finite shortest-path length of this graph
            diam.append(int(np.max(spm[spm < float("Inf")])))
            graphs.append((g.get_adjacency_matrix(), nv, attributes))

        if not graphs:
            # Consistent with the other parsers; otherwise ``max(diam)``
            # below fails with an uninformative message.
            raise ValueError('parsed input is empty')

        if self._method_calling == 1:
            max_diam = self._max_diam = max(diam) + 1
        else:
            max_diam = max(self._max_diam, max(diam) + 1)

        out = list()
        for (AM, node_nr, attributes) in graphs:
            des = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)
            occ = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)

            # Convert adjacency matrix to dictionary
            idx_i, idx_j = np.where(AM > 0)
            ed = defaultdict(dict)
            for (a, b) in filterfalse(lambda a: a[0] == a[1],
                                      zip(idx_i, idx_j)):
                ed[a][b] = AM[a, b]

            for j in range(node_nr):
                A = np.zeros(shape=AM.shape)

                # Single-source shortest path from node j
                D, p = dijkstra(ed, j)

                D = np.array(
                    list(D.get(k, float("Inf")) for k in range(node_nr)))
                p[j] = -1

                # Restrict to the connected component of node j
                conn_comp = np.where(D < float("Inf"))[0]

                # To-be DAG adjacency matrix of connected component of node j
                A_cc = A[conn_comp, :][:, conn_comp]

                # Adjacency matrix of connected component of node j
                AM_cc = AM[conn_comp, :][:, conn_comp]
                D_cc = D[conn_comp]
                # Maps original vertex index -> index inside the component;
                # the extra leading row lets the root's parent (-1) be
                # looked up through the ``+ 1`` shift below.
                conn_comp_converter = np.zeros(shape=(A.shape[0], 1),
                                               dtype=int)
                for k in range(conn_comp.shape[0]):
                    conn_comp_converter[conn_comp[k]] = k
                conn_comp_converter = np.vstack([0, conn_comp_converter])
                p_cc = conn_comp_converter[
                    np.array(list(p[k] for k in conn_comp)) + 1]

                # Number of nodes in connected component of node j
                conncomp_node_nr = A_cc.shape[0]
                for v in range(conncomp_node_nr):
                    if p_cc[v] > 0:
                        # Generate A_cc by adding directed edges of form (parent(v), v)
                        A_cc[p_cc[v], v] = 1

                    # Distance from v to j
                    v_dist = D_cc[v]

                    # All neighbors of v in the undirected graph
                    v_nbs = np.where(AM_cc[v, :] > 0)[0]

                    # Distances of neighbors of v to j
                    v_nbs_dists = D_cc[v_nbs]

                    # All neighbors of v in undirected graph who are
                    # one step closer to j than v is; i.e. SP-DAG parents
                    v_parents = v_nbs[v_nbs_dists == (v_dist - 1)]

                    # Add SP-DAG parents to A_cc
                    A_cc[v_parents, v] = 1

                # Computes the descendants & occurrence vectors o_j(v), d_j(v)
                # for all v in the connected component
                occ_p, des_p = od_vectors_dag(A_cc, D_cc)

                if des_p.shape[0] == 1 and j == 0:
                    des[j, 0, 0] = des_p
                    occ[j, 0, 0] = occ_p
                else:
                    # Convert back to the indices of the original graph
                    for v in range(des_p.shape[0]):
                        for t in range(des_p.shape[1]):
                            des[j, conn_comp[v], t] = des_p[v, t]
                    # Convert back to the indices of the original graph
                    for v in range(occ_p.shape[0]):
                        for t in range(occ_p.shape[1]):
                            occ[j, conn_comp[v], t] = occ_p[v, t]

            M = np.zeros(shape=(node_nr, max_diam, max_diam))
            # j loops through choices of root
            for j in range(node_nr):
                des_mat_j_root = np.squeeze(des[j, :, :])
                occ_mat_j_root = np.squeeze(occ[j, :, :])
                # v loops through nodes
                for v in range(node_nr):
                    for a in range(max_diam):
                        for b in range(a, max_diam):
                            # M[v,:,:] is M[v]; a = node coordinate in path, b = path length
                            M[v, a,
                              b] += des_mat_j_root[v, b -
                                                   a] * occ_mat_j_root[v, a]

            if self.calculate_norm_:
                out.append((M, attributes, np.sum(attributes**2, axis=1)))
            else:
                out.append((M, attributes))
        return out
Beispiel #3
0
    def parse_input(self, X):
        """Parse input and create features, while initializing and/or calculating sub-kernels.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or a non-empty
            iterable whose first item is a valid graph structure (adjacency
            matrix or edge dictionary), optionally followed by node labels
            and edge labels (that correspond to the given graph format).
            Items past the third are kept as "extra" arguments and handed to
            the base kernel together with each subgraph. Empty elements are
            skipped with a warning.

        Returns
        -------
        base_graph_kernel : dict
            Fitted base kernels keyed by core number. Returned alone when
            ``self._method_calling == 1`` and as the second item of a tuple
            when ``self._method_calling == 2``.

        K : np.array
            The accumulated kernel matrix. Returned as the first item of a
            tuple when ``self._method_calling == 2`` and alone when
            ``self._method_calling == 3``.

        Raises
        ------
        TypeError
            If `X` is not iterable or an element is neither a ``Graph`` nor
            an iterable.
        ValueError
            If no graph could be parsed, or if the maximum core number found
            does not exceed ``self.min_core``.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives
        # in ``collections.abc`` on every Python 3 release.
        try:
            from collections.abc import Iterable
        except ImportError:  # Python 2
            from collections import Iterable

        # Input validation and parsing
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        nx, max_core_number, core_numbers, graphs = 0, 0, [], []
        for (idx, x) in enumerate(iter(X)):
            extra = tuple()
            if isinstance(x, Iterable):
                x = list(x)
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: ' +
                                  str(idx))
                    continue
                elif len(x) == 1:
                    x = Graph(x[0], {}, {}, graph_format="adjacency")
                elif len(x) == 2:
                    x = Graph(x[0], x[1], {}, graph_format="adjacency")
                else:
                    # Everything after the edge labels travels along as
                    # extra arguments (empty when exactly three items).
                    extra = tuple(x[3:])
                    x = Graph(x[0], x[1], x[2], graph_format="adjacency")
            elif type(x) is Graph:
                x.desired_format("adjacency")
                x = Graph(
                    x.get_adjacency_matrix(),
                    x.get_labels(purpose="adjacency",
                                 label_type="vertex",
                                 return_none=True),
                    x.get_labels(purpose="adjacency",
                                 label_type="edge",
                                 return_none=True))
            else:
                raise TypeError('each element of X must be either a '
                                'graph object or a list with at least '
                                'a graph like object and node labels '
                                'dict \n')
            # workaround for leaving a sparse representation for x
            x.change_format(self._graph_format)
            c = core_number(x)
            max_core_number = max(max_core_number, max(c.values()))
            core_numbers.append(c)
            graphs.append((x, extra))

            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

        if max_core_number <= self.min_core:
            raise ValueError(
                'The maximum core equals the min_core boundary set in init.')

        # Add the zero iteration element
        if self._method_calling == 2:
            K = np.zeros(shape=(nx, nx))
        elif self._method_calling == 3:
            self._dummy_kernel = dict()
            K = np.zeros(shape=(nx, self._nx))

        # Main: walk the cores from the largest down to min_core (exclusive)
        base_graph_kernel, indexes_list = dict(), dict()
        for i in range(max_core_number, self.min_core, -1):
            subgraphs, indexes = list(), list()
            for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers,
                                                         graphs)):
                vertices = [k for k, v in iteritems(cn) if v >= i]
                if len(vertices) > 0:
                    # Calculate subgraph and store the index of the non-empty vertices
                    sg = g.get_subgraph(vertices)
                    sub_extra = list()
                    indexes.append(idx)
                    if len(extra) > 0:
                        vs = np.array(sg.get_vertices(purpose='any'))
                        for e in extra:
                            # This case will only be reached by now if the user add the propagation
                            # kernel as subkernel with a custom propagation matrix. This is a workaround!
                            # ``np.array`` is a factory function, not a type,
                            # so the previous ``type(e) is np.array`` test was
                            # always False and the matrix was never restricted
                            # to the subgraph's vertices.
                            if isinstance(e, np.ndarray) and e.ndim == 2:
                                e = e[vs, :][:, vs]
                            sub_extra.append(e)
                        subgraphs.append((sg, ) + tuple(sub_extra))
                    else:
                        subgraphs.append(sg)
            indexes = np.array(indexes)
            indexes_list[i] = indexes

            # calculate kernel
            if self._method_calling == 1 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                base_graph_kernel[i].fit(subgraphs)
            elif self._method_calling == 2 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs)
                for j in range(indexes.shape[0]):
                    K[indexes[j], indexes] += ft_subgraph_mat[j, :]
            elif self._method_calling == 3:
                if self._max_core_number < i or self._fit_indexes[i].shape[
                        0] == 0:
                    if len(indexes) > 0:
                        # add a dummy kernel for calculating the diagonal
                        self._dummy_kernel[i] = self.base_graph_kernel_(
                            **self.params_)
                        self._dummy_kernel[i].fit(subgraphs)
                else:
                    if indexes.shape[0] > 0:
                        subgraph_tmat = self.X[i].transform(subgraphs)
                        for j in range(indexes.shape[0]):
                            K[indexes[j],
                              self._fit_indexes[i]] += subgraph_tmat[j, :]

        if self._method_calling == 1:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return base_graph_kernel
        elif self._method_calling == 2:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return K, base_graph_kernel
        elif self._method_calling == 3:
            self._t_nx = nx
            self._max_core_number_trans = max_core_number
            self._transform_indexes = indexes_list
            return K
Beispiel #4
0
    def parse_input(self, X):
        """Parse and create features for the NSPD kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            holding exactly three items: a valid graph structure (adjacency
            matrix or edge dictionary), the node labels and the edge labels
            (that correspond to the given graph format). Empty elements are
            skipped with a warning.

        Returns
        -------
        M : dict
            A dictionary keyed by ``(r, d)`` tuples (radius, distance) whose
            values are sparse ``csr_matrix`` objects with one row per
            non-empty input graph and one column per enumerated pair of
            neighborhood hashes. When ``self._method_calling == 3`` the
            column count also includes the keys stored in
            ``self._fit_keys``.

        Raises
        ------
        TypeError
            If `X` is not iterable, or an element is neither a ``Graph`` nor
            an iterable of length 0 or 3.
        ValueError
            If no graph could be parsed from `X`.

        Notes
        -----
        With ``self._method_calling == 1`` the attributes ``_fit_keys`` and
        ``_ngx`` are set; with ``self._method_calling == 3`` the attribute
        ``_ngy`` is set.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives
        # in ``collections.abc`` on every Python 3 release.
        try:
            from collections.abc import Iterable
        except ImportError:  # Python 2
            from collections import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            # Hold the number of graphs
            ng = 0

            # Holds all the data for combinations of r, d
            data = collections.defaultdict(dict)

            # Index all keys for combinations of r, d
            all_keys = collections.defaultdict(dict)
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: ' + str(idx))
                        continue
                    else:
                        g = Graph(x[0], x[1], x[2])
                        g.change_format("adjacency")
                elif type(x) is Graph:
                    g = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency", label_type="vertex"),
                        x.get_labels(purpose="adjacency", label_type="edge"))
                else:
                    raise TypeError('each element of X must have either ' +
                                    'a graph with labels for node and edge ' +
                                    'or 3 elements consisting of a graph ' +
                                    'type object, labels for vertices and ' +
                                    'labels for edges.')

                # Bring to the desired format
                g.change_format(self._graph_format)

                # Take the vertices
                vertices = set(g.get_vertices(purpose=self._graph_format))

                # Extract the dictionary
                ed = g.get_edge_dictionary()

                # Convert edges to tuples
                edges = {(j, k) for j in ed.keys() for k in ed[j].keys()}

                # Extract labels for nodes
                Lv = g.get_labels(purpose=self._graph_format)
                # and for edges
                Le = g.get_labels(purpose=self._graph_format,
                                  label_type="edge")

                # Produce all the neighborhoods and the distance pairs
                # up to the desired radius and maximum distance
                N, D, D_pair = g.produce_neighborhoods(self.r,
                                                       purpose="dictionary",
                                                       with_distances=True,
                                                       d=self.d)

                # Hash all the neighborhoods
                H = self._hash_neighborhoods(vertices, edges, Lv, Le, N,
                                             D_pair)

                if self._method_calling == 1:
                    for d in range(self.d + 1):
                        if d not in D:
                            continue
                        for (A, B) in D[d]:
                            for r in range(self.r + 1):
                                key = (H[r, A], H[r, B])
                                keys = all_keys[r, d]
                                # Column index of this hash pair; a fresh one
                                # is assigned on first sight.
                                col = keys.get(key, None)
                                if col is None:
                                    col = len(keys)
                                    keys[key] = col
                                data[r, d][ng, col] = data[r, d].get(
                                    (ng, col), 0) + 1

                elif self._method_calling == 3:
                    for d in range(self.d + 1):
                        if d not in D:
                            continue
                        for (A, B) in D[d]:
                            # Based on the edges of the bidirected graph
                            for r in range(self.r + 1):
                                keys = all_keys[r, d]
                                fit_keys = self._fit_keys[r, d]
                                key = (H[r, A], H[r, B])
                                # Prefer the column assigned during fit;
                                # unseen pairs are numbered after them.
                                col = fit_keys.get(key, None)
                                if col is None:
                                    col = keys.get(key, None)
                                    if col is None:
                                        col = len(keys) + len(fit_keys)
                                        keys[key] = col
                                data[r, d][ng, col] = data[r, d].get(
                                    (ng, col), 0) + 1
                ng += 1
            if ng == 0:
                raise ValueError('parsed input is empty')

            if self._method_calling == 1:
                # A feature matrix for all levels
                M = dict()

                for (key, counts) in iteritems(data):
                    if len(counts) == 0:
                        continue
                    indexes, values = zip(*iteritems(counts))
                    rows, cols = zip(*indexes)
                    M[key] = csr_matrix((values, (rows, cols)),
                                        shape=(ng, len(all_keys[key])),
                                        dtype=np.int64)
                self._fit_keys = all_keys
                self._ngx = ng

            elif self._method_calling == 3:
                # A feature matrix for all levels
                M = dict()

                for (key, counts) in iteritems(data):
                    if len(counts) == 0:
                        continue
                    indexes, values = zip(*iteritems(counts))
                    rows, cols = zip(*indexes)
                    M[key] = csr_matrix(
                        (values, (rows, cols)),
                        shape=(ng,
                               len(all_keys[key]) + len(self._fit_keys[key])),
                        dtype=np.int64)

                self._ngy = ng

            return M