Beispiel #1
0
    def parse_input(self, X):
        """Parse and create features for svm_theta kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            The lovasz metrics for the given input.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            out = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    x, is_iter = list(x), True
                if is_iter and len(x) in [0, 1, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], {}, {}, self._graph_format)
                elif type(x) is not Graph:
                    raise TypeError('each element of X must be either a ' +
                                    'graph or an iterable with at least 1 ' +
                                    'and at most 3 elements\n')
                i += 1
                A = x.get_adjacency_matrix()
                dual_coeffs = _calculate_svm_theta_(A)
                out.append(self._calculate_svm_theta_levels_(A, dual_coeffs))

            if i == 0:
                raise ValueError('parsed input is empty')

            return out
Beispiel #2
0
    def parse_input(self, X):
        """Parse and create features for graphlet_sampling kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            The extracted adjacency matrices for any given input.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            proc = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [1, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is not Graph:
                    raise TypeError('each element of X must be either a ' +
                                    'graph or an iterable with at least 2 ' +
                                    'and at most 3 elements\n')
                i += 1
                x.desired_format("adjacency")
                Ax = x.get_adjacency_matrix()
                Lx = x.get_labels(purpose="adjacency")
                Lx = [Lx[idx] for idx in range(Ax.shape[0])]
                proc.append((Ax, Lx, Ax.shape[0]))

            out = list()
            for Ax, Lx, s in proc:
                amss = dict()
                labels = set(Lx)
                Lx = np.array(Lx)
                for t in product(labels, labels):
                    selector = np.matmul(np.expand_dims(Lx == t[0], axis=1),
                                         np.expand_dims(Lx == t[1], axis=0))
                    amss[t] = Ax * selector
                out.append((amss, s))

            if i == 0:
                raise ValueError('parsed input is empty')

            return out
Beispiel #3
0
    def parse_input(self, X):
        """Parse input and create features, while initializing and/or calculating sub-kernels.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        base_graph_kernel : object
            Returns base_graph_kernel. Only if called from `fit` or `fit_transform`.

        K : np.array
            Returns the kernel matrix. Only if called from `transform` or
            `fit_transform`.

        """
        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx, max_core_number, core_numbers, graphs = 0, 0, [], []
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                extra = tuple()
                if isinstance(x, collections.Iterable):
                    x, is_iter = list(x), True
                if is_iter and len(x) >= 0:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    elif len(x) == 1:
                        x = Graph(x[0], {}, {}, graph_format="adjacency")
                    elif len(x) == 2:
                        x = Graph(x[0], x[1], {}, graph_format="adjacency")
                    elif len(x) >= 3:
                        if len(x) > 3:
                            extra += tuple(x[3:])
                        x = Graph(x[0], x[1], x[2], graph_format="adjacency")
                elif type(x) is Graph:
                    x.desired_format("adjacency")
                    x = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency",
                                     label_type="vertex",
                                     return_none=True),
                        x.get_labels(purpose="adjacency",
                                     label_type="edge",
                                     return_none=True))
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph object or a list with at least '
                                    'a graph like object and node labels '
                                    'dict \n')
                # workaround for leaving a sparse representation for x
                x.change_format(self._graph_format)
                c = core_number(x)
                max_core_number = max(max_core_number, max(c.values()))
                core_numbers.append(c)
                graphs.append((x, extra))

                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        if max_core_number <= self.min_core:
            raise ValueError(
                'The maximum core equals the min_core boundary set in init.')

        # Add the zero iteration element
        if self._method_calling == 2:
            K = np.zeros(shape=(nx, nx))
        elif self._method_calling == 3:
            self._dummy_kernel = dict()
            K = np.zeros(shape=(nx, self._nx))

        # Main
        base_graph_kernel, indexes_list = dict(), dict()
        for i in range(max_core_number, self.min_core, -1):
            subgraphs, indexes = list(), list()
            for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers,
                                                         graphs)):
                vertices = [k for k, v in iteritems(cn) if v >= i]
                if len(vertices) > 0:
                    # Calculate subgraph and store the index of the non-empty vertices
                    sg = g.get_subgraph(vertices)
                    sub_extra = list()
                    indexes.append(idx)
                    if len(extra) > 0:
                        vs = np.array(sg.get_vertices(purpose='any'))
                        for e in extra:
                            # This case will only be reached by now if the user add the propagation
                            # kernel as subkernel with a custom propagation matrix. This is a workaround!
                            if type(e) is np.array and len(e.shape) == 2:
                                e = e[vs, :][:, vs]
                            sub_extra.append(e)
                        subgraphs.append((sg, ) + tuple(sub_extra))
                    else:
                        subgraphs.append(sg)
            indexes = np.array(indexes)
            indexes_list[i] = indexes

            # calculate kernel
            if self._method_calling == 1 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                base_graph_kernel[i].fit(subgraphs)
            elif self._method_calling == 2 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs)
                for j in range(indexes.shape[0]):
                    K[indexes[j], indexes] += ft_subgraph_mat[j, :]
            elif self._method_calling == 3:
                if self._max_core_number < i or self._fit_indexes[i].shape[
                        0] == 0:
                    if len(indexes) > 0:
                        # add a dummy kernel for calculating the diagonal
                        self._dummy_kernel[i] = self.base_graph_kernel_(
                            **self.params_)
                        self._dummy_kernel[i].fit(subgraphs)
                else:
                    if indexes.shape[0] > 0:
                        subgraph_tmat = self.X[i].transform(subgraphs)
                        for j in range(indexes.shape[0]):
                            K[indexes[j],
                              self._fit_indexes[i]] += subgraph_tmat[j, :]

        if self._method_calling == 1:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return base_graph_kernel
        elif self._method_calling == 2:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return K, base_graph_kernel
        elif self._method_calling == 3:
            self._t_nx = nx
            self._max_core_number_trans = max_core_number
            self._transform_indexes = indexes_list
            return K
Beispiel #4
0
    def parse_input(self, X):
        """Parse and check the given input for the Graph Hopper kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format).

        Returns
        -------
        out : np.array, shape=(len(X), n_labels)
            A np array for frequency (cols) histograms for all Graphs (rows).

        """
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            ni = 0
            diam = list()
            graphs = list()
            for (i, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, Iterable):
                    is_iter = True
                    x = list(x)

                if type(x) is Graph:
                    g = Graph(x.get_adjacency_matrix(),
                              x.get_labels(purpose="adjacency"), {},
                              self._graph_format)
                elif is_iter and len(x) == 0 or len(x) >= 2:
                    if len(x) == 0:
                        warn('Ignoring empty element on index: ' + str(i))
                        continue
                    elif len(x) >= 2:
                        g = Graph(x[0], x[1], {}, "adjacency")
                        g.change_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph object or a list with at least '
                                    'a graph like object and node, ')

                spm, attr = g.build_shortest_path_matrix(labels="vertex")
                nv = g.nv()
                try:
                    attributes = np.array([attr[j] for j in range(nv)])
                except TypeError:
                    raise TypeError(
                        'All attributes of a single graph should have the same dimension.'
                    )
                diam.append(int(np.max(spm[spm < float("Inf")])))
                graphs.append((g.get_adjacency_matrix(), nv, attributes))
                ni += 1

        if self._method_calling == 1:
            max_diam = self._max_diam = max(diam) + 1
        else:
            max_diam = max(self._max_diam, max(diam) + 1)

        out = list()
        for i in range(ni):
            AM, node_nr, attributes = graphs[i]
            des = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)
            occ = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)

            # Convert adjacency matrix to dictionary
            idx_i, idx_j = np.where(AM > 0)
            ed = defaultdict(dict)
            for (a, b) in filterfalse(lambda a: a[0] == a[1],
                                      zip(idx_i, idx_j)):
                ed[a][b] = AM[a, b]

            for j in range(node_nr):
                A = np.zeros(shape=AM.shape)

                # Single-source shortest path from node j
                D, p = dijkstra(ed, j)

                D = np.array(
                    list(D.get(k, float("Inf")) for k in range(node_nr)))
                p[j] = -1

                # Restrict to the connected component of node j
                conn_comp = np.where(D < float("Inf"))[0]

                # To-be DAG adjacency matrix of connected component of node j
                A_cc = A[conn_comp, :][:, conn_comp]

                # Adjacency matrix of connected component of node j
                AM_cc = AM[conn_comp, :][:, conn_comp]
                D_cc = D[conn_comp]
                conn_comp_converter = np.zeros(shape=(A.shape[0], 1),
                                               dtype=int)
                for k in range(conn_comp.shape[0]):
                    conn_comp_converter[conn_comp[k]] = k
                conn_comp_converter = np.vstack([0, conn_comp_converter])
                p_cc = conn_comp_converter[
                    np.array(list(p[k] for k in conn_comp)) + 1]

                # Number of nodes in connected component of node j
                conncomp_node_nr = A_cc.shape[0]
                for v in range(conncomp_node_nr):
                    if p_cc[v] > 0:
                        # Generate A_cc by adding directed edges of form (parent(v), v)
                        A_cc[p_cc[v], v] = 1

                    # Distance from v to j
                    v_dist = D_cc[v]

                    # All neighbors of v in the undirected graph
                    v_nbs = np.where(AM_cc[v, :] > 0)[0]

                    # Distances of neighbors of v to j
                    v_nbs_dists = D_cc[v_nbs]

                    # All neighbors of v in undirected graph who are
                    # one step closer to j than v is; i.e. SP-DAG parents
                    v_parents = v_nbs[v_nbs_dists == (v_dist - 1)]

                    # Add SP-DAG parents to A_cc
                    A_cc[v_parents, v] = 1

                # Computes the descendants & occurence vectors o_j(v), d_j(v)
                # for all v in the connected component
                occ_p, des_p = od_vectors_dag(A_cc, D_cc)

                if des_p.shape[0] == 1 and j == 0:
                    des[j, 0, 0] = des_p
                    occ[j, 0, 0] = occ_p
                else:
                    # Convert back to the indices of the original graph
                    for v in range(des_p.shape[0]):
                        for l in range(des_p.shape[1]):
                            des[j, conn_comp[v], l] = des_p[v, l]
                    # Convert back to the indices of the original graph
                    for v in range(occ_p.shape[0]):
                        for l in range(occ_p.shape[1]):
                            occ[j, conn_comp[v], l] = occ_p[v, l]

            M = np.zeros(shape=(node_nr, max_diam, max_diam))
            # j loops through choices of root
            for j in range(node_nr):
                des_mat_j_root = np.squeeze(des[j, :, :])
                occ_mat_j_root = np.squeeze(occ[j, :, :])
                # v loops through nodes
                for v in range(node_nr):
                    for a in range(max_diam):
                        for b in range(a, max_diam):
                            # M[v,:,:] is M[v]; a = node coordinate in path, b = path length
                            M[v, a,
                              b] += des_mat_j_root[v, b -
                                                   a] * occ_mat_j_root[v, a]

            if self.calculate_norm_:
                out.append((M, attributes, np.sum(attributes**2, axis=1)))
            else:
                out.append((M, attributes))
        return out
Beispiel #5
0
    def parse_input(self, X):
        """Parse and create features for the attributed propation kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        local_values : dict
            A dictionary of pairs between each input graph and a bins where the
            sampled graphlets have fallen.

        """
        if not isinstance(X, collections.Iterable):
            raise ValueError('input must be an iterable\n')
        else:
            # The number of parsed graphs
            n = 0
            transition_matrix = dict()
            indexes = [0]
            Attr = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 2, 3, 4]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on ' +
                                      'index: ' + str(idx))
                        continue
                    if len(x) == 2 and type(x[0]) is Graph:
                        g, T = x
                    else:
                        g = Graph(x[0], x[1], {}, self._graph_format)
                        if len(x) == 4:
                            T = x[3]
                        else:
                            T = None
                elif type(x) is Graph:
                    g, T = x, None
                else:
                    raise ValueError('Each element of X must be either a ' +
                                     'Graph or an iterable with at least 2 ' +
                                     'and at most 4 elements\n')

                if T is not None:
                    if T.shape[0] != T.shape[1]:
                        raise TypeError('Transition matrix on index' + ' ' +
                                        str(idx) + 'must be ' +
                                        'a square matrix.')
                    if T.shape[0] != g.nv():
                        raise TypeError('Propagation matrix must ' +
                                        'have the same dimension ' +
                                        'as the number of vertices.')
                else:
                    T = g.get_adjacency_matrix()

                nv = g.nv()
                transition_matrix[n] = (T.T / np.sum(T, axis=1)).T
                attr = g.get_labels(purpose="adjacency")
                try:
                    attributes = np.array([attr[j] for j in range(nv)])
                except TypeError:
                    raise TypeError(
                        'All attributes of a single graph should have the same dimension.'
                    )

                Attr.append(attributes)
                indexes.append(indexes[-1] + nv)
                n += 1
            try:
                P = np.vstack(Attr)
            except ValueError:
                raise ValueError(
                    'Attribute dimensions should be the same, for all graphs')

            if self._method_calling == 1:
                self._dim = P.shape[1]
            else:
                if self._dim != P.shape[1]:
                    raise ValueError('transform attribute vectors should'
                                     'have the same dimension as in fit')

            if n == 0:
                raise ValueError('Parsed input is empty')

            # feature vectors
            if self._method_calling == 1:
                # simple normal
                self._u, self._b, self._hd = list(), list(), list()
                for t in range(self.t_max):
                    u = self.random_state_.randn(self._dim)
                    if self.take_cauchy_:
                        # cauchy
                        u = np.divide(u, self.random_state_.randn(self._dim))

                    self._u.append(u)
                    # random offset
                    self._b.append(self.w *
                                   self.random_state_.randn(self._dim))

                phi = {k: dict() for k in range(n)}
                for t in range(self.t_max):
                    # for hash all graphs inside P and produce the feature vectors
                    hashes = self.calculate_LSH(P, self._u[t],
                                                self._b[t]).tolist()

                    hd = {
                        j: i
                        for i, j in enumerate({tuple(l)
                                               for l in hashes})
                    }
                    self._hd.append(hd)

                    features = np.array([hd[tuple(l)] for l in hashes])

                    # Accumulate the results.
                    for k in range(n):
                        phi[k][t] = Counter(
                            features[indexes[k]:indexes[k + 1]].flat)

                    # calculate the Propagation matrix if needed
                    if t < self.t_max - 1:
                        for k in range(n):
                            start, end = indexes[k:k + 2]
                            P[start:end, :] = np.dot(transition_matrix[k],
                                                     P[start:end, :])

                return [phi[k] for k in range(n)]

            if self._method_calling == 3:
                phi = {k: dict() for k in range(n)}
                for t in range(self.t_max):
                    # for hash all graphs inside P and produce the feature vectors
                    hashes = self.calculate_LSH(P, self._u[t],
                                                self._b[t]).tolist()

                    hd = dict(
                        chain(
                            iteritems(self._hd[t]),
                            iter((j, i)
                                 for i, j in enumerate(
                                     filterfalse(lambda x: x in self._hd[t],
                                                 {tuple(l)
                                                  for l in hashes}),
                                     len(self._hd[t])))))

                    features = np.array([hd[tuple(l)] for l in hashes])

                    # Accumulate the results.
                    for k in range(n):
                        phi[k][t] = Counter(features[indexes[k]:indexes[k +
                                                                        1]])

                    # calculate the Propagation matrix if needed
                    if t < self.t_max - 1:
                        for k in range(n):
                            start, end = indexes[k:k + 2]
                            P[start:end, :] = np.dot(transition_matrix[k],
                                                     P[start:end, :])

                return [phi[k] for k in range(n)]
Beispiel #6
0
    def parse_input(self, X):
        """Parse and create features for the propation kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        local_values : dict
            A dictionary of pairs between each input graph and a bins where the
            sampled graphlets have fallen.

        """
        if not isinstance(X, collections.Iterable):
            raise ValueError('input must be an iterable\n')
        else:
            i = -1
            transition_matrix = dict()
            labels = set()
            L = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [0, 2, 3, 4]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on ' +
                                      'index: ' + str(idx))
                        continue
                    if len(x) == 2 and type(x[0]) is Graph:
                        g, T = x
                    else:
                        g = Graph(x[0], x[1], {}, self._graph_format)
                        if len(x) == 4:
                            T = x[3]
                        else:
                            T = None
                elif type(x) is Graph:
                    g, T = x, None
                else:
                    raise ValueError('Each element of X must be either a ' +
                                     'Graph or an iterable with at least 2 ' +
                                     'and at most 4 elements\n')

                if T is not None:
                    if T.shape[0] != T.shape[1]:
                        raise TypeError('Transition matrix on index' + ' ' +
                                        str(idx) + 'must be ' +
                                        'a square matrix.')
                    if T.shape[0] != g.nv():
                        raise TypeError('Propagation matrix must ' +
                                        'have the same dimension ' +
                                        'as the number of vertices.')
                else:
                    T = g.get_adjacency_matrix()

                i += 1
                transition_matrix[i] = (T.T / np.sum(T, axis=1)).T
                label = g.get_labels(purpose='adjacency')
                try:
                    labels |= set(itervalues(label))
                except TypeError:
                    raise TypeError(
                        'For a non attributed kernel, labels should be hashable.'
                    )
                L.append((g.nv(), label))

            if i == -1:
                raise ValueError('Parsed input is empty')

            # The number of parsed graphs
            n = i + 1

            # enumerate labels
            if self._method_calling == 1:
                enum_labels = {l: i for (i, l) in enumerate(list(labels))}
                self._enum_labels = enum_labels
                self._parent_labels = labels
            elif self._method_calling == 3:
                new_elements = labels - self._parent_labels
                if len(new_elements) > 0:
                    new_enum_labels = iter((l, i) for (i, l) in enumerate(
                        list(new_elements), len(self._enum_labels)))
                    enum_labels = dict(
                        chain(iteritems(self._enum_labels), new_enum_labels))
                else:
                    enum_labels = self._enum_labels

            # make a matrix for all graphs that contains label vectors
            P, data, indexes = dict(), list(), [0]
            for (k, (nv, label)) in enumerate(L):
                data += [(indexes[-1] + j, enum_labels[label[j]])
                         for j in range(nv)]
                indexes.append(indexes[-1] + nv)

            # Initialise the on hot vector
            rows, cols = zip(*data)
            P = np.zeros(shape=(indexes[-1], len(enum_labels)))
            P[rows, cols] = 1
            dim_orig = len(self._enum_labels)

            # feature vectors
            if self._method_calling == 1:
                # simple normal
                self._u, self._b, self._hd = list(), list(), list()
                for t in range(self.t_max):
                    u = self.random_state_.randn(len(enum_labels))

                    if self.take_cauchy_:
                        # cauchy
                        u = np.divide(
                            u, self.random_state_.randn(len(enum_labels)))

                    self._u.append(u)
                    # random offset
                    self._b.append(self.w * self.random_state_.rand())

                phi = {k: dict() for k in range(n)}
                for t in range(self.t_max):
                    # for hash all graphs inside P and produce the feature vectors
                    hashes = self.calculate_LSH(P, self._u[t], self._b[t])
                    hd = dict(
                        (j, i) for i, j in enumerate(set(np.unique(hashes))))
                    self._hd.append(hd)
                    features = np.vectorize(lambda i: hd[i])(hashes)

                    # Accumulate the results.
                    for k in range(n):
                        phi[k][t] = Counter(features[indexes[k]:indexes[k +
                                                                        1]])

                    # calculate the Propagation matrix if needed
                    if t < self.t_max - 1:
                        for k in range(n):
                            start, end = indexes[k:k + 2]
                            P[start:end, :] = np.dot(transition_matrix[k],
                                                     P[start:end, :])

                return [phi[k] for k in range(n)]

            elif (self._method_calling == 3 and dim_orig >= len(enum_labels)):
                phi = {k: dict() for k in range(n)}
                for t in range(self.t_max):
                    # for hash all graphs inside P and produce the feature vectors
                    hashes = self.calculate_LSH(P, self._u[t], self._b[t])
                    hd = dict(
                        chain(
                            iteritems(self._hd[t]),
                            iter((j, i)
                                 for i, j in enumerate(
                                     filterfalse(lambda x: x in self._hd[t],
                                                 np.unique(hashes)),
                                     len(self._hd[t])))))

                    features = np.vectorize(lambda i: hd[i])(hashes)

                    # Accumulate the results.
                    for k in range(n):
                        phi[k][t] = Counter(features[indexes[k]:indexes[k +
                                                                        1]])

                    # calculate the Propagation matrix if needed
                    if t < self.t_max - 1:
                        for k in range(n):
                            start, end = indexes[k:k + 2]
                            P[start:end, :] = np.dot(transition_matrix[k],
                                                     P[start:end, :])

                return [phi[k] for k in range(n)]

            else:
                cols = np.array(cols)
                vertices = np.where(cols < dim_orig)[0]
                vertices_p = np.where(cols >= dim_orig)[0]
                nnv = len(enum_labels) - dim_orig
                phi = {k: dict() for k in range(n)}
                for t in range(self.t_max):
                    # hash all graphs inside P and produce the feature vectors
                    hashes = self.calculate_LSH(P[vertices, :dim_orig],
                                                self._u[t], self._b[t])

                    hd = dict(
                        chain(
                            iteritems(self._hd[t]),
                            iter((j, i)
                                 for i, j in enumerate(
                                     filterfalse(lambda x: x in self._hd[t],
                                                 np.unique(hashes)),
                                     len(self._hd[t])))))

                    features = np.vectorize(lambda i: hd[i],
                                            otypes=[int])(hashes)

                    # for each the new labels graph hash P and produce the feature vectors
                    u = self.random_state_.randn(nnv)
                    if self.take_cauchy_:
                        # cauchy
                        u = np.divide(u, self.random_state_.randn(nnv))

                    u = np.hstack((self._u[t], u))

                    # calculate hashes for the remaining
                    hashes = self.calculate_LSH(P[vertices_p, :], u,
                                                self._b[t])
                    hd = dict(
                        chain(
                            iteritems(hd),
                            iter((j, i)
                                 for i, j in enumerate(hashes, len(hd)))))

                    features_p = np.vectorize(lambda i: hd[i],
                                              otypes=[int])(hashes)

                    # Accumulate the results
                    for k in range(n):
                        A = Counter(features[np.logical_and(
                            indexes[k] <= vertices,
                            vertices <= indexes[k + 1])])
                        B = Counter(features_p[np.logical_and(
                            indexes[k] <= vertices_p,
                            vertices_p <= indexes[k + 1])])
                        phi[k][t] = A + B

                    # calculate the Propagation matrix if needed
                    if t < self.t_max - 1:
                        for k in range(n):
                            start, end = indexes[k:k + 2]
                            P[start:end, :] = np.dot(transition_matrix[k],
                                                     P[start:end, :])

                        Q = np.all(P[:, dim_orig:] > 0, axis=1)
                        vertices = np.where(~Q)[0]
                        vertices_p = np.where(Q)[0]

                return [phi[k] for k in range(n)]
Beispiel #7
0
    def parse_input(self, X):
        """Parse and create features for pyramid_match kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        H : list
            A list of lists of Histograms for all levels for each graph.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            Us = []
            if self.with_labels:
                Ls = []
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or
                                (len(x) >= 1 and not self.with_labels) or
                                (len(x) >= 2 and self.with_labels)):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    elif not self.with_labels:
                        x = Graph(x[0], {}, {}, self._graph_format)
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif not type(x) is Graph:
                    raise TypeError(
                        'each element of X must be either a graph object or a list with '
                        'at least a graph like object and node labels dict \n')
                A = x.get_adjacency_matrix()
                if self.with_labels:
                    L = x.get_labels(purpose="adjacency")
                i += 1
                if A.shape[0] == 0:
                    Us.append(np.zeros((1, self.d)))
                else:
                    # Perform eigenvalue decomposition.
                    # Rows of matrix U correspond to vertex representations
                    # Embed vertices into the d-dimensional space
                    if A.shape[0] > self.d + 1:
                        # If size of graph smaller than d, pad with zeros
                        Lambda, U = eigs(csr_matrix(A, dtype=np.float),
                                         k=self.d,
                                         ncv=10 * self.d)
                        idx = Lambda.argsort()[::-1]
                        U = U[:, idx]
                    else:
                        Lambda, U = np.linalg.eig(A)
                        idx = Lambda.argsort()[::-1]
                        U = U[:, idx]
                        U = U[:, :self.d]
                    # Replace all components by their absolute values
                    U = np.absolute(U)
                    Us.append((A.shape[0], U))
                if self.with_labels:
                    Ls.append(L)

        if i == 0:
            raise ValueError('parsed input is empty')

        if self.with_labels:
            # Map labels to values between 0 and |L|-1
            # where |L| is the number of distinct labels
            if self._method_calling in [1, 2]:
                self._num_labels = 0
                self._labels = set()
                for L in Ls:
                    self._labels |= set(itervalues(L))
                self._num_labels = len(self._labels)
                self._labels = {l: i for (i, l) in enumerate(self._labels)}
                return self._histogram_calculation(Us, Ls, self._labels)

            elif self._method_calling == 3:
                labels = set()
                for L in Ls:
                    labels |= set(itervalues(L))
                rest_labels = labels - set(self._labels.keys())
                nouveau_labels = dict(
                    chain(iteritems(self._labels), ((j, i) for (
                        i, j) in enumerate(rest_labels, len(self._labels)))))
                return self._histogram_calculation(Us, Ls, nouveau_labels)
        else:
            return self._histogram_calculation(Us)
Beispiel #8
0
    def parse_input(self, X):
        """Fast ML Graph Kernel.

        See supplementary material :cite:`kondor2016multiscale`, algorithm 1.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            A list of tuples with S matrices inverses
            and their 4th-root determinants.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            ng = 0
            out = list()
            data = dict()
            neighborhoods = dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph or an iterable with at least 1 '
                                    'and at most 3 elements\n')
                phi_d = x.get_labels()
                A = x.get_adjacency_matrix()
                try:
                    phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
                except TypeError:
                    raise TypeError('Features must be iterable and castable '
                                    'in total to a numpy array.')

                Lap = laplacian(A).astype(float)
                _increment_diagonal_(Lap, self.heta)
                data[ng] = {0: A, 1: phi, 2: inv(Lap)}
                neighborhoods[ng] = x
                ng += 1

            if ng == 0:
                raise ValueError('parsed input is empty')

            # Define a function for calculating the S's of subgraphs of each iteration
            def calculate_C(k, j, l):
                if type(neighborhoods[k]) is Graph:
                    neighborhoods[k] = neighborhoods[k].produce_neighborhoods(
                        r=self.L, sort_neighbors=False)

                indexes = neighborhoods[k][l][j]
                L = laplacian(data[k][0][indexes, :][:, indexes]).astype(float)
                _increment_diagonal_(L, self.heta)
                U = data[k][1][indexes, :]
                S = multi_dot((U.T, inv(L), U))
                _increment_diagonal_(S, self.gamma)

                return (inv(S), np.sum(np.log(np.real(eigvals(S)))))

            if self._method_calling == 1:
                V = [(k, j) for k in range(ng)
                     for j in range(data[k][0].shape[0])]

                ns = min(len(V), self.n_samples)

                self.random_state_.shuffle(V)
                vs = V[:ns]
                phi_k = np.array([data[k][1][j, :] for (k, j) in vs])

                # w the eigen vectors, v the eigenvalues
                K = phi_k.dot(phi_k.T)

                # Calculate eigenvalues
                v, w = eig(K)
                v, w = np.real(v), np.real(w.T)

                # keep only the positive
                vpos = np.argpartition(v, -self.P)[-self.P:]
                vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                # ksi.shape = (k, Ns) * (Ns, P)
                ksi = w[vpos].dot(phi_k).T / np.sqrt(v[vpos])
                for j in range(ng):
                    # (n_samples, k) * (k, P)
                    data[j][1] = data[j][1].dot(ksi)
                self._data_level = {0: ksi}
                for l in range(1, self.L + 1):
                    # Take random samples from all the vertices of all graphs
                    self.random_state_.shuffle(V)
                    vs = V[:ns]

                    # Compute the reference subsampled Gram matrix
                    K_proj = {
                        k: np.zeros(shape=(data[k][0].shape[0], ns))
                        for k in range(ng)
                    }
                    K, C = np.zeros(shape=(len(vs), len(vs))), dict()
                    for (m, (k, j)) in enumerate(vs):
                        C[m] = calculate_C(k, j, l)
                        K_proj[k][j, m] = K[m, m] = self.pairwise_operation(
                            C[m], C[m])
                        for (s, (k2, j2)) in enumerate(vs):
                            if s < m:
                                K[s, m] = K[m, s] \
                                        = K_proj[k2][j2, m] \
                                        = K_proj[k][j, s] \
                                        = self.pairwise_operation(C[s], C[m])
                            else:
                                break

                    # Compute the kernels of the relations of the reference to everything else
                    for (k, j) in V[ns:]:
                        for (m, _) in enumerate(vs):
                            K_proj[k][j, m] = self.pairwise_operation(
                                C[m], calculate_C(k, j, l))

                    # w the eigen vectors, v the eigenvalues
                    v, w = eig(K)
                    v, w = np.real(v), np.real(w.T)

                    # keep only the positive
                    vpos = np.argpartition(v, -self.P)[-self.P:]
                    vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                    # Q shape=(k, P)
                    Q = w[vpos].T / np.sqrt(v[vpos])
                    for j in range(ng):
                        # (n, ns) * (ns, P)
                        data[j][1] = K_proj[j].dot(Q)
                    self._data_level[l] = (C, Q)

            elif self._method_calling == 3:
                ksi = self._data_level[0]
                for j in range(ng):
                    # (n, k) * (k, P)
                    data[j][1] = data[j][1].dot(ksi)

                for l in range(1, self.L + 1):
                    C, Q = self._data_level[l]
                    for j in range(ng):
                        K_proj = np.zeros(shape=(data[j][0].shape[0], len(C)))
                        for n in range(data[j][0].shape[0]):
                            for m in range(len(C)):
                                K_proj[n, m] = self.pairwise_operation(
                                    C[m], calculate_C(j, n, l))
                        data[j][1] = K_proj.dot(Q)

            # Apply the final calculation of S.
            for k in range(ng):
                S = multi_dot((data[k][1].T, data[k][2], data[k][1]))
                _increment_diagonal_(S, self.gamma)
                out.append((inv(S), np.sum(np.log(np.real(eigvals(S))))))

            return out
    def parse_input(self, X):
        """Parse and create features for multiscale_laplacian kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            Tuples consisting of the Adjacency matrix, phi, phi_outer
            dictionary of neihborhood indexes and inverse laplacians
            up to level self.L and the inverse Laplacian of A.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            ng = 0
            out = list()
            start = time.time()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is not Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph or an iterable with at least 1 ' +
                                    'and at most 3 elements\n')
                ng += 1
                phi_d = x.get_labels()
                A = x.get_adjacency_matrix()
                N = x.produce_neighborhoods(r=self.L, sort_neighbors=False)
                try:
                    phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
                except TypeError:
                    raise TypeError('Features must be iterable and castable ' +
                                    'in total to a numpy array.')
                phi_outer = np.dot(phi, phi.T)

                Lap = laplacian(A).astype(float)
                _increment_diagonal_(Lap, self.heta)
                L = inv(Lap)

                Q = dict()
                for level in range(1, self.L + 1):
                    Q[level] = dict()
                    for (key, item) in iteritems(N[level]):
                        Q[level][key] = dict()
                        Q[level][key]["n"] = np.array(item)
                        if len(item) < A.shape[0]:
                            laplac = laplacian(A[item, :][:,
                                                          item]).astype(float)
                            _increment_diagonal_(laplac, self.heta)
                            laplac = inv(laplac)
                        else:
                            laplac = L
                        Q[level][key]["l"] = laplac

                out.append((A, phi, phi_outer, Q, L))

            if self.verbose:
                print("Preprocessing took:", time.time() - start, "s.")
            if ng == 0:
                raise ValueError('parsed input is empty')

            return out
Beispiel #10
0
    def parse_input(self, X):
        """Parse and create features for lovasz_theta kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.


        Returns
        -------
        out : list
            The lovasz metrics for the given input.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            adjm = list()
            max_dim = 0
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    x, is_iter = list(x), True
                if is_iter and len(x) in [0, 1, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], {}, {}, self._graph_format)
                elif type(x) is not Graph:
                    raise TypeError('each element of X must be either a ' +
                                    'graph or an iterable with at least 1 ' +
                                    'and at most 3 elements\n')
                i += 1
                A = x.get_adjacency_matrix()
                adjm.append(A)
                max_dim = max(max_dim, A.shape[0])

            if self._method_calling == 1:
                if self.d_ is None:
                    self.d_ = max_dim + 1

            if self.d_ < max_dim + 1:
                if self.max_dim is None and self._method_calling == 3:
                    raise ValueError(
                        'Maximum dimension of a graph in transform is bigger '
                        'than the one found in fit. To avoid that use max_dim parameter.'
                    )
                else:
                    raise ValueError('max_dim should correspond to the '
                                     'biggest graph inside the dataset')

            out = list()
            for A in adjm:
                X, t = _calculate_lovasz_embeddings_(A)
                U = _calculate_lovasz_labelling_(X, t, self.d_)
                out.append(self._calculate_MEC_(U))

            if i == 0:
                raise ValueError('parsed input is empty')

            return out