Beispiel #1
0
    def parse_input(self, X):
        """Parse input and create features, while initializing and/or calculating sub-kernels.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        base_graph_kernel : object
            Returns base_graph_kernel. Only if called from `fit` or `fit_transform`.

        K : np.array
            Returns the kernel matrix. Only if called from `transform` or
            `fit_transform`.

        """
        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx, max_core_number, core_numbers, graphs = 0, 0, [], []
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                extra = tuple()
                if isinstance(x, collections.Iterable):
                    x, is_iter = list(x), True
                if is_iter and len(x) >= 0:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    elif len(x) == 1:
                        x = Graph(x[0], {}, {}, graph_format="adjacency")
                    elif len(x) == 2:
                        x = Graph(x[0], x[1], {}, graph_format="adjacency")
                    elif len(x) >= 3:
                        if len(x) > 3:
                            extra += tuple(x[3:])
                        x = Graph(x[0], x[1], x[2], graph_format="adjacency")
                elif type(x) is Graph:
                    x.desired_format("adjacency")
                    x = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency",
                                     label_type="vertex",
                                     return_none=True),
                        x.get_labels(purpose="adjacency",
                                     label_type="edge",
                                     return_none=True))
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph object or a list with at least '
                                    'a graph like object and node labels '
                                    'dict \n')
                # workaround for leaving a sparse representation for x
                x.change_format(self._graph_format)
                c = core_number(x)
                max_core_number = max(max_core_number, max(c.values()))
                core_numbers.append(c)
                graphs.append((x, extra))

                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        if max_core_number <= self.min_core:
            raise ValueError(
                'The maximum core equals the min_core boundary set in init.')

        # Add the zero iteration element
        if self._method_calling == 2:
            K = np.zeros(shape=(nx, nx))
        elif self._method_calling == 3:
            self._dummy_kernel = dict()
            K = np.zeros(shape=(nx, self._nx))

        # Main
        base_graph_kernel, indexes_list = dict(), dict()
        for i in range(max_core_number, self.min_core, -1):
            subgraphs, indexes = list(), list()
            for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers,
                                                         graphs)):
                vertices = [k for k, v in iteritems(cn) if v >= i]
                if len(vertices) > 0:
                    # Calculate subgraph and store the index of the non-empty vertices
                    sg = g.get_subgraph(vertices)
                    sub_extra = list()
                    indexes.append(idx)
                    if len(extra) > 0:
                        vs = np.array(sg.get_vertices(purpose='any'))
                        for e in extra:
                            # This case will only be reached by now if the user add the propagation
                            # kernel as subkernel with a custom propagation matrix. This is a workaround!
                            if type(e) is np.array and len(e.shape) == 2:
                                e = e[vs, :][:, vs]
                            sub_extra.append(e)
                        subgraphs.append((sg, ) + tuple(sub_extra))
                    else:
                        subgraphs.append(sg)
            indexes = np.array(indexes)
            indexes_list[i] = indexes

            # calculate kernel
            if self._method_calling == 1 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                base_graph_kernel[i].fit(subgraphs)
            elif self._method_calling == 2 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs)
                for j in range(indexes.shape[0]):
                    K[indexes[j], indexes] += ft_subgraph_mat[j, :]
            elif self._method_calling == 3:
                if self._max_core_number < i or self._fit_indexes[i].shape[
                        0] == 0:
                    if len(indexes) > 0:
                        # add a dummy kernel for calculating the diagonal
                        self._dummy_kernel[i] = self.base_graph_kernel_(
                            **self.params_)
                        self._dummy_kernel[i].fit(subgraphs)
                else:
                    if indexes.shape[0] > 0:
                        subgraph_tmat = self.X[i].transform(subgraphs)
                        for j in range(indexes.shape[0]):
                            K[indexes[j],
                              self._fit_indexes[i]] += subgraph_tmat[j, :]

        if self._method_calling == 1:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return base_graph_kernel
        elif self._method_calling == 2:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return K, base_graph_kernel
        elif self._method_calling == 3:
            self._t_nx = nx
            self._max_core_number_trans = max_core_number
            self._transform_indexes = indexes_list
            return K
Beispiel #2
0
    def parse_input(self, X):
        """Parse and create features for graphlet_sampling kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            The extracted adjacency matrices for any given input.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            proc = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and len(x) in [1, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is not Graph:
                    raise TypeError('each element of X must be either a ' +
                                    'graph or an iterable with at least 2 ' +
                                    'and at most 3 elements\n')
                i += 1
                x.desired_format("adjacency")
                Ax = x.get_adjacency_matrix()
                Lx = x.get_labels(purpose="adjacency")
                Lx = [Lx[idx] for idx in range(Ax.shape[0])]
                proc.append((Ax, Lx, Ax.shape[0]))

            out = list()
            for Ax, Lx, s in proc:
                amss = dict()
                labels = set(Lx)
                Lx = np.array(Lx)
                for t in product(labels, labels):
                    selector = np.matmul(np.expand_dims(Lx == t[0], axis=1),
                                         np.expand_dims(Lx == t[1], axis=0))
                    amss[t] = Ax * selector
                out.append((amss, s))

            if i == 0:
                raise ValueError('parsed input is empty')

            return out
    def parse_input(self, X):
        """Parse input for weisfeiler lehman.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        base_kernel : object
        Returns base_kernel.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx = 0
            Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0],
                                      x[1],
                                      x[2],
                                      graph_format=self._graph_format)
                            extra = (x.get_labels(purpose=self._graph_format,
                                                  label_type="edge",
                                                  return_none=True), ) + extra
                        else:
                            x = Graph(x[0],
                                      x[1], {},
                                      graph_format=self._graph_format)
                            extra = tuple()

                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                    el = x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True)
                    if el is None:
                        extra = tuple()
                    else:
                        extra = (el, )

                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")
                extras[nx] = extra
                distinct_values |= set(itervalues(L[nx]))
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # get all the distinct values of current labels
        WL_labels_inverse = dict()

        # assign a number to each label
        label_count = 0
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            label_count += 1

        # Initalize an inverse dictionary of labels for all iterations
        self._inv_labels = dict()
        self._inv_labels[0] = WL_labels_inverse

        def generate_graphs(label_count, WL_labels_inverse):
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for k in L[j].keys():
                    new_labels[k] = WL_labels_inverse[L[j][k]]
                L[j] = new_labels
                # add new labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            yield new_graphs

            for i in range(1, self._n_iter):
                label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
                for j in range(nx):
                    # Find unique labels and sort
                    # them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        label_set.add(credential)

                label_list = sorted(list(label_set))
                for dv in label_list:
                    WL_labels_inverse[dv] = label_count
                    label_count += 1

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for k in L_temp[j].keys():
                        new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                    L[j] = new_labels
                    # relabel
                    new_graphs.append((Gs_ed[j], new_labels) + extras[j])
                self._inv_labels[i] = WL_labels_inverse
                yield new_graphs

        base_kernel = {
            i: self._base_kernel(**self._params)
            for i in range(self._n_iter)
        }
        if self._parallel is None:
            if self._method_calling == 1:
                for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse)):
                    base_kernel[i].fit(g)
            elif self._method_calling == 2:
                K = np.sum(
                    (base_kernel[i].fit_transform(g) for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse))),
                    axis=0)

        else:
            if self._method_calling == 1:
                self._parallel(
                    joblib.delayed(efit)(base_kernel[i], g)
                    for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse)))
            elif self._method_calling == 2:
                K = np.sum(self._parallel(
                    joblib.delayed(efit_transform)(base_kernel[i], g)
                    for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse))),
                           axis=0)

        if self._method_calling == 1:
            return base_kernel
        elif self._method_calling == 2:
            return K, base_kernel
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format). If None the kernel matrix is calculated upon fit data.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        """
        self._method_calling = 3
        # Check is fit had been called
        check_is_fitted(self, ['X', '_nx', '_inv_labels'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            if not isinstance(X, collections.Iterable):
                raise ValueError('input must be an iterable\n')
            else:
                nx = 0
                distinct_values = set()
                Gs_ed, L = dict(), dict()
                for (i, x) in enumerate(iter(X)):
                    is_iter = isinstance(x, collections.Iterable)
                    if is_iter:
                        x = list(x)
                    if is_iter and len(x) in [0, 2, 3]:
                        if len(x) == 0:
                            warnings.warn('Ignoring empty element on index: ' +
                                          str(i))
                            continue

                        elif len(x) in [2, 3]:
                            x = Graph(x[0], x[1], {}, self._graph_format)
                    elif type(x) is Graph:
                        x.desired_format("dictionary")
                    else:
                        raise ValueError('each element of X must have at ' +
                                         'least one and at most 3 elements\n')
                    Gs_ed[nx] = x.get_edge_dictionary()
                    L[nx] = x.get_labels(purpose="dictionary")

                    # Hold all the distinct values
                    distinct_values |= set(v for v in itervalues(L[nx])
                                           if v not in self._inv_labels[0])
                    nx += 1
                if nx == 0:
                    raise ValueError('parsed input is empty')

        nl = len(self._inv_labels[0])
        WL_labels_inverse = {
            dv: idx
            for (idx, dv) in enumerate(sorted(list(distinct_values)), nl)
        }

        def generate_graphs(WL_labels_inverse, nl):
            # calculate the kernel matrix for the 0 iteration
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L[j]):
                    if v in self._inv_labels[0]:
                        new_labels[k] = self._inv_labels[0][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels
                # produce the new graphs
                new_graphs.append([Gs_ed[j], new_labels])
            yield new_graphs

            for i in range(1, self._n_iter):
                new_graphs = list()
                L_temp, label_set = dict(), set()
                nl += len(self._inv_labels[i])
                for j in range(nx):
                    # Find unique labels and sort them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        if credential not in self._inv_labels[i]:
                            label_set.add(credential)

                # Calculate the new label_set
                WL_labels_inverse = dict()
                if len(label_set) > 0:
                    for dv in sorted(list(label_set)):
                        idx = len(WL_labels_inverse) + nl
                        WL_labels_inverse[dv] = idx

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for (k, v) in iteritems(L_temp[j]):
                        if v in self._inv_labels[i]:
                            new_labels[k] = self._inv_labels[i][v]
                        else:
                            new_labels[k] = WL_labels_inverse[v]
                    L[j] = new_labels
                    # Create the new graphs with the new labels.
                    new_graphs.append([Gs_ed[j], new_labels])
                yield new_graphs

        if self._parallel is None:
            # Calculate the kernel matrix without parallelization
            K = np.sum(
                (self.X[i].transform(g)
                 for (i,
                      g) in enumerate(generate_graphs(WL_labels_inverse, nl))),
                axis=0)

        else:
            # Calculate the kernel marix with parallelization
            K = np.sum(self._parallel(
                joblib.delayed(etransform)(self.X[i], g)
                for (i,
                     g) in enumerate(generate_graphs(WL_labels_inverse, nl))),
                       axis=0)

        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            old_settings = np.seterr(divide='ignore')
            K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
            np.seterr(**old_settings)

        return K
Beispiel #5
0
    def parse_input(
        self,
        X,
    ):
        """Parse input for weisfeiler lehman.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        return_embedding_only: bool
            Whether to return the embedding of the graphs only, instead of computing the kernel all
            the way to the end.

        Returns
        -------
        base_graph_kernel : object
        Returns base_graph_kernel.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx = 0
            Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0],
                                      x[1],
                                      x[2],
                                      graph_format=self._graph_format)
                            extra = (x.get_labels(purpose=self._graph_format,
                                                  label_type="edge",
                                                  return_none=True), ) + extra
                        else:
                            x = Graph(x[0],
                                      x[1], {},
                                      graph_format=self._graph_format)
                            extra = tuple()

                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                    el = x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True)
                    if el is None:
                        extra = tuple()
                    else:
                        extra = (el, )

                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")
                extras[nx] = extra
                distinct_values |= set(itervalues(L[nx]))
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx
        WL_labels_inverse = OrderedDict()

        # assign a number to each label
        label_count = 0
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            label_count += 1

        # Initalize an inverse dictionary of labels for all iterations
        self._inv_labels = OrderedDict(
        )  # Inverse dictionary of labels, in term of the *previous layer*
        self._inv_labels[0] = deepcopy(WL_labels_inverse)
        self.feature_dims.append(
            len(WL_labels_inverse))  # Update the zeroth iteration feature dim

        # self._inv_label_node_attr = OrderedDict()  # Inverse dictionary of labels, in term of the *node attribute*
        # self._label_node_attr = OrderedDict()  # Same as above, but with key and value inverted
        # self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label(WL_labels_inverse, 0)

        # if self.node_weights is not None:
        #     self._feature_weight = OrderedDict()
        #     # Ensure the order is the same
        #     self._feature_weight[0] = self._compute_feature_weight(self.node_weights, 0, WL_labels_inverse)[1]
        # else:
        #     self._feature_weight = None

        def generate_graphs(label_count, WL_labels_inverse):
            new_graphs = list()
            for j in range(self._nx):
                new_labels = dict()
                for k in L[j].keys():
                    new_labels[k] = WL_labels_inverse[L[j][k]]
                L[j] = new_labels
                # add new labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            yield new_graphs

            for i in range(1, self._h):
                label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
                for j in range(nx):
                    # Find unique labels and sort
                    # them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                                     str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        label_set.add(credential)

                label_list = sorted(list(label_set))
                for dv in label_list:
                    WL_labels_inverse[dv] = label_count
                    label_count += 1

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for k in L_temp[j].keys():
                        new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                    L[j] = new_labels
                    # relabel
                    new_graphs.append((Gs_ed[j], new_labels) + extras[j])
                self._inv_labels[i] = WL_labels_inverse
                # Compute the translated inverse node label
                # self._label_node_attr[i], self._inv_label_node_attr[i] = self.translate_label(WL_labels_inverse, i, self._label_node_attr[i - 1])
                # self.feature_dims.append(self.feature_dims[-1] + len(self._label_node_attr[i]))
                # Compute the feature weight of the current layer
                # if self.node_weights is not None:
                #     self._feature_weight[i] = self._compute_feature_weight(self.node_weights, i, self._inv_label_node_attr[i])[1]
                # assert len(self._feature_weight[i] == len(WL_labels_inverse))
                yield new_graphs

        # Initialise the base graph kernel.
        base_graph_kernel = {}

        K = []
        for (i, g) in enumerate(generate_graphs(label_count,
                                                WL_labels_inverse)):
            param = self._params
            # if self._feature_weight is not None:
            # print(self._feature_weight)
            # param.update({'mahalanobis_precision': self._feature_weight[i]})
            base_graph_kernel.update({i: self._base_graph_kernel(**param)})
            # if return_embedding_only:
            #     K.append(base_graph_kernel[i].parse_input(
            #         g, label_start_idx=self.feature_dims[i], label_end_idx=self.feature_dims[i + 1]))
            # else:
            if self._method_calling == 1:
                base_graph_kernel[i].fit(g, )
            else:
                K.append(base_graph_kernel[i].fit_transform(g, ))

        # if return_embedding_only:
        #     return K
        if self._method_calling == 1:
            return base_graph_kernel
        elif self._method_calling == 2:
            # if self.as_tensor:
            #     K = torch.stack(K, dim=0).sum(dim=0)
            #     return K, base_graph_kernel
            return np.sum(K, axis=0), base_graph_kernel
Beispiel #6
0
    def parse_input(self, X):
        """Fast ML Graph Kernel.

        See supplementary material :cite:`kondor2016multiscale`, algorithm 1.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            A list of tuples with S matrices inverses
            and their 4th-root determinants.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            ng = 0
            out = list()
            data = dict()
            neighborhoods = dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph or an iterable with at least 1 '
                                    'and at most 3 elements\n')
                phi_d = x.get_labels()
                A = x.get_adjacency_matrix()
                try:
                    phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
                except TypeError:
                    raise TypeError('Features must be iterable and castable '
                                    'in total to a numpy array.')

                Lap = laplacian(A).astype(float)
                _increment_diagonal_(Lap, self.heta)
                data[ng] = {0: A, 1: phi, 2: inv(Lap)}
                neighborhoods[ng] = x
                ng += 1

            if ng == 0:
                raise ValueError('parsed input is empty')

            # Define a function for calculating the S's of subgraphs of each iteration
            def calculate_C(k, j, l):
                if type(neighborhoods[k]) is Graph:
                    neighborhoods[k] = neighborhoods[k].produce_neighborhoods(
                        r=self.L, sort_neighbors=False)

                indexes = neighborhoods[k][l][j]
                L = laplacian(data[k][0][indexes, :][:, indexes]).astype(float)
                _increment_diagonal_(L, self.heta)
                U = data[k][1][indexes, :]
                S = multi_dot((U.T, inv(L), U))
                _increment_diagonal_(S, self.gamma)

                return (inv(S), np.sum(np.log(np.real(eigvals(S)))))

            if self._method_calling == 1:
                V = [(k, j) for k in range(ng)
                     for j in range(data[k][0].shape[0])]

                ns = min(len(V), self.n_samples)

                self.random_state_.shuffle(V)
                vs = V[:ns]
                phi_k = np.array([data[k][1][j, :] for (k, j) in vs])

                # w the eigen vectors, v the eigenvalues
                K = phi_k.dot(phi_k.T)

                # Calculate eigenvalues
                v, w = eig(K)
                v, w = np.real(v), np.real(w.T)

                # keep only the positive
                vpos = np.argpartition(v, -self.P)[-self.P:]
                vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                # ksi.shape = (k, Ns) * (Ns, P)
                ksi = w[vpos].dot(phi_k).T / np.sqrt(v[vpos])
                for j in range(ng):
                    # (n_samples, k) * (k, P)
                    data[j][1] = data[j][1].dot(ksi)
                self._data_level = {0: ksi}
                for l in range(1, self.L + 1):
                    # Take random samples from all the vertices of all graphs
                    self.random_state_.shuffle(V)
                    vs = V[:ns]

                    # Compute the reference subsampled Gram matrix
                    K_proj = {
                        k: np.zeros(shape=(data[k][0].shape[0], ns))
                        for k in range(ng)
                    }
                    K, C = np.zeros(shape=(len(vs), len(vs))), dict()
                    for (m, (k, j)) in enumerate(vs):
                        C[m] = calculate_C(k, j, l)
                        K_proj[k][j, m] = K[m, m] = self.pairwise_operation(
                            C[m], C[m])
                        for (s, (k2, j2)) in enumerate(vs):
                            if s < m:
                                K[s, m] = K[m, s] \
                                        = K_proj[k2][j2, m] \
                                        = K_proj[k][j, s] \
                                        = self.pairwise_operation(C[s], C[m])
                            else:
                                break

                    # Compute the kernels of the relations of the reference to everything else
                    for (k, j) in V[ns:]:
                        for (m, _) in enumerate(vs):
                            K_proj[k][j, m] = self.pairwise_operation(
                                C[m], calculate_C(k, j, l))

                    # w the eigen vectors, v the eigenvalues
                    v, w = eig(K)
                    v, w = np.real(v), np.real(w.T)

                    # keep only the positive
                    vpos = np.argpartition(v, -self.P)[-self.P:]
                    vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                    # Q shape=(k, P)
                    Q = w[vpos].T / np.sqrt(v[vpos])
                    for j in range(ng):
                        # (n, ns) * (ns, P)
                        data[j][1] = K_proj[j].dot(Q)
                    self._data_level[l] = (C, Q)

            elif self._method_calling == 3:
                ksi = self._data_level[0]
                for j in range(ng):
                    # (n, k) * (k, P)
                    data[j][1] = data[j][1].dot(ksi)

                for l in range(1, self.L + 1):
                    C, Q = self._data_level[l]
                    for j in range(ng):
                        K_proj = np.zeros(shape=(data[j][0].shape[0], len(C)))
                        for n in range(data[j][0].shape[0]):
                            for m in range(len(C)):
                                K_proj[n, m] = self.pairwise_operation(
                                    C[m], calculate_C(j, n, l))
                        data[j][1] = K_proj.dot(Q)

            # Apply the final calculation of S.
            for k in range(ng):
                S = multi_dot((data[k][1].T, data[k][2], data[k][1]))
                _increment_diagonal_(S, self.gamma)
                out.append((inv(S), np.sum(np.log(np.real(eigvals(S))))))

            return out
    def parse_input(self, X):
        """Parse and create features for multiscale_laplacian kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            Tuples consisting of the Adjacency matrix, phi, phi_outer
            dictionary of neihborhood indexes and inverse laplacians
            up to level self.L and the inverse Laplacian of A.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            ng = 0
            out = list()
            start = time.time()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is not Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph or an iterable with at least 1 ' +
                                    'and at most 3 elements\n')
                ng += 1
                phi_d = x.get_labels()
                A = x.get_adjacency_matrix()
                N = x.produce_neighborhoods(r=self.L, sort_neighbors=False)
                try:
                    phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
                except TypeError:
                    raise TypeError('Features must be iterable and castable ' +
                                    'in total to a numpy array.')
                phi_outer = np.dot(phi, phi.T)

                Lap = laplacian(A).astype(float)
                _increment_diagonal_(Lap, self.heta)
                L = inv(Lap)

                Q = dict()
                for level in range(1, self.L + 1):
                    Q[level] = dict()
                    for (key, item) in iteritems(N[level]):
                        Q[level][key] = dict()
                        Q[level][key]["n"] = np.array(item)
                        if len(item) < A.shape[0]:
                            laplac = laplacian(A[item, :][:,
                                                          item]).astype(float)
                            _increment_diagonal_(laplac, self.heta)
                            laplac = inv(laplac)
                        else:
                            laplac = L
                        Q[level][key]["l"] = laplac

                out.append((A, phi, phi_outer, Q, L))

            if self.verbose:
                print("Preprocessing took:", time.time() - start, "s.")
            if ng == 0:
                raise ValueError('parsed input is empty')

            return out
Beispiel #8
0
    def parse_input(self, X):
        """Parse input for weisfeiler lehman optimal assignment.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        Hs : numpy array, shape = [n_input_graphs, hierarchy_size]
            An array where the rows contain the histograms of the graphs.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx = 0
            Gs_ed, L, distinct_values = dict(), dict(), set()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: '
                                      + str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0], x[1], x[2], graph_format=self._graph_format)
                            extra = (x.get_labels(purpose=self._graph_format,
                                                  label_type="edge", return_none=True), ) + extra
                        else:
                            x = Graph(x[0], x[1], {}, graph_format=self._graph_format)
                            extra = tuple()

                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")
                distinct_values |= set(itervalues(L[nx]))
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # Initialize hierarchy
        self._hierarchy = dict()
        self._hierarchy['root'] = dict()
        self._hierarchy['root']['parent'] = None
        self._hierarchy['root']['children'] = list()
        self._hierarchy['root']['w'] = 0
        self._hierarchy['root']['omega'] = 0

        # get all the distinct values of current labels
        WL_labels_inverse = dict()

        # assign a number to each label
        label_count = 0
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, 'root')
            label_count += 1

        # Initalize an inverse dictionary of labels for all iterations
        self._inv_labels = dict()
        self._inv_labels[0] = WL_labels_inverse

        for j in range(nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels

        for i in range(1, self._n_iter):
            new_previous_label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # Find unique labels and sort
                # them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    new_previous_label_set.add((credential, L[j][v]))

            label_list = sorted(list(new_previous_label_set), key=lambda tup: tup[0])
            for dv, previous_label in label_list:
                WL_labels_inverse[dv] = label_count
                self._insert_into_hierarchy(label_count, previous_label)
                label_count += 1

            # Recalculate labels
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
            self._inv_labels[i] = WL_labels_inverse

        # Compute the vector representation of each graph
        if self.sparse:
            Hs = lil_matrix((nx, len(self._hierarchy)))
        else:
            Hs = np.zeros((nx, len(self._hierarchy)))
        for j in range(nx):
            for k in L[j].keys():
                current_label = L[j][k]
                while self._hierarchy[current_label]['parent'] is not None:
                    Hs[j, current_label] += self._hierarchy[current_label]['omega']
                    current_label = self._hierarchy[current_label]['parent']

        return Hs
Beispiel #9
0
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format). If None the kernel matrix is calculated upon fit data.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        """
        self._method_calling = 3
        # Check is fit had been called
        check_is_fitted(self, ['X', '_nx', '_hierarchy', '_inv_labels'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            if not isinstance(X, collections.Iterable):
                raise ValueError('input must be an iterable\n')
            else:
                nx = 0
                distinct_values = set()
                Gs_ed, L = dict(), dict()
                for (i, x) in enumerate(iter(X)):
                    is_iter = isinstance(x, collections.Iterable)
                    if is_iter:
                        x = list(x)
                    if is_iter and len(x) in [0, 2, 3]:
                        if len(x) == 0:
                            warnings.warn('Ignoring empty element on index: '
                                          + str(i))
                            continue

                        elif len(x) in [2, 3]:
                            x = Graph(x[0], x[1], {}, self._graph_format)
                    elif type(x) is Graph:
                        x.desired_format("dictionary")
                    else:
                        raise ValueError('each element of X must have at ' +
                                         'least one and at most 3 elements\n')
                    Gs_ed[nx] = x.get_edge_dictionary()
                    L[nx] = x.get_labels(purpose="dictionary")

                    # Hold all the distinct values
                    distinct_values |= set(
                        v for v in itervalues(L[nx])
                        if v not in self._inv_labels[0])
                    nx += 1
                if nx == 0:
                    raise ValueError('parsed input is empty')

        # get all the distinct values of new labels
        WL_labels_inverse = dict()

        # assign a number to each label
        label_count = sum([len(self._inv_labels[i]) for i in range(len(self._inv_labels))])
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, 'root')
            label_count += 1

        for j in range(nx):
            new_labels = dict()
            for (k, v) in iteritems(L[j]):
                if v in self._inv_labels[0]:
                    new_labels[k] = self._inv_labels[0][v]
                else:
                    new_labels[k] = WL_labels_inverse[v]
            L[j] = new_labels

        for i in range(1, self._n_iter):
            L_temp, new_previous_label_set = dict(), set()
            for j in range(nx):
                # Find unique labels and sort them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    if credential not in self._inv_labels[i]:
                        new_previous_label_set.add((credential, L[j][v]))

            # Calculate the new label_set
            WL_labels_inverse = dict()
            if len(new_previous_label_set) > 0:
                for dv, previous_label in sorted(list(new_previous_label_set), key=lambda tup: tup[0]):
                    WL_labels_inverse[dv] = label_count
                    self._insert_into_hierarchy(label_count, previous_label)
                    label_count += 1

            # Recalculate labels
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L_temp[j]):
                    if v in self._inv_labels[i]:
                        new_labels[k] = self._inv_labels[i][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels

        # Compute the vector representation of each graph
        if self.sparse:
            Hs = lil_matrix((nx, len(self._hierarchy)))
        else:
            Hs = np.zeros((nx, len(self._hierarchy)))
        for j in range(nx):
            for k in L[j].keys():
                current_label = L[j][k]
                while self._hierarchy[current_label]['parent'] is not None:
                    Hs[j, current_label] += self._hierarchy[current_label]['omega']
                    current_label = self._hierarchy[current_label]['parent']

        self.Y = Hs

        # Compute the histogram intersection kernel
        K = np.zeros((nx, self._nx))
        if self.sparse:
            for i in range(self._nx):
                for j in range(i, self._nx):
                    K[i, j] = np.sum(Hs[i, :self.X.shape[1]].minimum(self.X[j, :]))
        else:
            for i in range(nx):
                for j in range(self._nx):
                    K[i, j] = np.sum(np.min([Hs[i, :self.X.shape[1]], self.X[j, :]], axis=0))

        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            old_settings = np.seterr(divide='ignore')
            K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
            np.seterr(**old_settings)

        return K