def parse_input(self, X):
        """Parse and create features for the `subgraph_matching` kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : list
            A list of tuples, one per non-empty input graph, holding the
            number of vertices, the set of non-loop edges and the node and
            edge label dictionaries.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            out = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter = True
                    x = list(x)

                if type(x) is Graph:
                    g = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency"),
                        x.get_labels(purpose="adjacency", label_type="edge"),
                        self._graph_format)
                elif is_iter and len(x) in [0, 3]:
                    x = list(x)
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: ' + str(idx))
                        continue
                    elif len(x) == 3:
                        g = Graph(x[0], x[1], x[2], "adjacency")
                        g.change_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or an iterable with a ' +
                                    'graph-like object, node labels and ' +
                                    'edge labels\n')
                n = g.nv()
                E = g.get_edge_dictionary()
                L = g.get_labels(purpose="dictionary",
                                 return_none=(self.kv is None))
                Le = g.get_labels(purpose="dictionary",
                                  label_type="edge",
                                  return_none=(self.ke is None))
                Er = set(
                    (a, b) for a in E.keys() for b in E[a].keys() if a != b)

                i += 1
                out.append((n, Er, L, Le))

            if i == 0:
                raise ValueError('parsed input is empty')
            return out
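A minimal, self-contained sketch of the input format this method expects and of the (n, Er, L, Le) tuples it collects. The toy adjacency matrix and label dictionaries are invented; only the edge-set construction mirrors the code above.

import numpy as np

# Hypothetical input element in the documented format:
# (adjacency matrix, node_labels, edge_labels).
A = np.array([[0, 1, 0],
              [1, 0, 1],
              [0, 1, 0]])
node_labels = {0: 'C', 1: 'O', 2: 'C'}
edge_labels = {(0, 1): 'single', (1, 0): 'single',
               (1, 2): 'double', (2, 1): 'double'}

# What gets appended to `out` per graph: the vertex count and the set of
# non-loop directed edges, plus the two label dictionaries.
n = A.shape[0]
Er = {(a, b) for a in range(n) for b in range(n)
      if a != b and A[a, b] != 0}
print(n, sorted(Er))   # 3 [(0, 1), (1, 0), (1, 2), (2, 1)]
out_element = (n, Er, node_labels, edge_labels)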
    def parse_input(self, X):
        """Parse input for weisfeiler lehman.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        base_kernel : object
            Returns base_kernel. If called from `fit_transform`, the summed
            kernel matrix `K` is returned alongside it.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx = 0
            Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0],
                                      x[1],
                                      x[2],
                                      graph_format=self._graph_format)
                            extra = (x.get_labels(purpose=self._graph_format,
                                                  label_type="edge",
                                                  return_none=True), ) + extra
                        else:
                            x = Graph(x[0],
                                      x[1], {},
                                      graph_format=self._graph_format)
                            extra = tuple()

                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                    el = x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True)
                    if el is None:
                        extra = tuple()
                    else:
                        extra = (el, )

                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")
                extras[nx] = extra
                distinct_values |= set(itervalues(L[nx]))
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # get all the distinct values of current labels
        WL_labels_inverse = dict()

        # assign a number to each label
        label_count = 0
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            label_count += 1

        # Initialize an inverse dictionary of labels for all iterations
        self._inv_labels = dict()
        self._inv_labels[0] = WL_labels_inverse

        def generate_graphs(label_count, WL_labels_inverse):
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for k in L[j].keys():
                    new_labels[k] = WL_labels_inverse[L[j][k]]
                L[j] = new_labels
                # add new labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            yield new_graphs

            for i in range(1, self._n_iter):
                label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
                for j in range(nx):
                    # Find unique labels and sort
                    # them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        label_set.add(credential)

                label_list = sorted(list(label_set))
                for dv in label_list:
                    WL_labels_inverse[dv] = label_count
                    label_count += 1

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for k in L_temp[j].keys():
                        new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                    L[j] = new_labels
                    # relabel
                    new_graphs.append((Gs_ed[j], new_labels) + extras[j])
                self._inv_labels[i] = WL_labels_inverse
                yield new_graphs

        base_kernel = {
            i: self._base_kernel(**self._params)
            for i in range(self._n_iter)
        }
        if self._parallel is None:
            if self._method_calling == 1:
                for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse)):
                    base_kernel[i].fit(g)
            elif self._method_calling == 2:
                K = np.sum(
                    (base_kernel[i].fit_transform(g) for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse))),
                    axis=0)

        else:
            if self._method_calling == 1:
                self._parallel(
                    joblib.delayed(efit)(base_kernel[i], g)
                    for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse)))
            elif self._method_calling == 2:
                K = np.sum(self._parallel(
                    joblib.delayed(efit_transform)(base_kernel[i], g)
                    for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse))),
                           axis=0)

        if self._method_calling == 1:
            return base_kernel
        elif self._method_calling == 2:
            return K, base_kernel
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the
            given graph format). If None the kernel matrix is calculated upon
            fit data.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between targets and features

        """
        self._method_calling = 3
        # Check if fit has been called
        check_is_fitted(self, ['X', '_nx', '_inv_labels'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            if not isinstance(X, collections.Iterable):
                raise ValueError('input must be an iterable\n')
            else:
                nx = 0
                distinct_values = set()
                Gs_ed, L = dict(), dict()
                for (i, x) in enumerate(iter(X)):
                    is_iter = isinstance(x, collections.Iterable)
                    if is_iter:
                        x = list(x)
                    if is_iter and len(x) in [0, 2, 3]:
                        if len(x) == 0:
                            warnings.warn('Ignoring empty element on index: ' +
                                          str(i))
                            continue

                        elif len(x) in [2, 3]:
                            x = Graph(x[0], x[1], {}, self._graph_format)
                    elif type(x) is Graph:
                        x.desired_format("dictionary")
                    else:
                        raise ValueError('each element of X must have at ' +
                                         'least one and at most 3 elements\n')
                    Gs_ed[nx] = x.get_edge_dictionary()
                    L[nx] = x.get_labels(purpose="dictionary")

                    # Hold all the distinct values
                    distinct_values |= set(v for v in itervalues(L[nx])
                                           if v not in self._inv_labels[0])
                    nx += 1
                if nx == 0:
                    raise ValueError('parsed input is empty')

        nl = len(self._inv_labels[0])
        WL_labels_inverse = {
            dv: idx
            for (idx, dv) in enumerate(sorted(list(distinct_values)), nl)
        }

        def generate_graphs(WL_labels_inverse, nl):
            # calculate the kernel matrix for the 0 iteration
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L[j]):
                    if v in self._inv_labels[0]:
                        new_labels[k] = self._inv_labels[0][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels
                # produce the new graphs
                new_graphs.append([Gs_ed[j], new_labels])
            yield new_graphs

            for i in range(1, self._n_iter):
                new_graphs = list()
                L_temp, label_set = dict(), set()
                nl += len(self._inv_labels[i])
                for j in range(nx):
                    # Find unique labels and sort them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        if credential not in self._inv_labels[i]:
                            label_set.add(credential)

                # Calculate the new label_set
                WL_labels_inverse = dict()
                if len(label_set) > 0:
                    for dv in sorted(list(label_set)):
                        idx = len(WL_labels_inverse) + nl
                        WL_labels_inverse[dv] = idx

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for (k, v) in iteritems(L_temp[j]):
                        if v in self._inv_labels[i]:
                            new_labels[k] = self._inv_labels[i][v]
                        else:
                            new_labels[k] = WL_labels_inverse[v]
                    L[j] = new_labels
                    # Create the new graphs with the new labels.
                    new_graphs.append([Gs_ed[j], new_labels])
                yield new_graphs

        if self._parallel is None:
            # Calculate the kernel matrix without parallelization
            K = np.sum(
                (self.X[i].transform(g)
                 for (i,
                      g) in enumerate(generate_graphs(WL_labels_inverse, nl))),
                axis=0)

        else:
            # Calculate the kernel matrix with parallelization
            K = np.sum(self._parallel(
                joblib.delayed(etransform)(self.X[i], g)
                for (i,
                     g) in enumerate(generate_graphs(WL_labels_inverse, nl))),
                       axis=0)

        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            old_settings = np.seterr(divide='ignore')
            K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
            np.seterr(**old_settings)

        return K
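A rough worked example of one Weisfeiler-Lehman refinement step, as performed inside generate_graphs above. The edge dictionary and starting labels are made up; the credential construction and the relabelling mirror the loops in parse_input and transform.

# Toy graph in the same edge-dictionary shape as Gs_ed[j].
Gs_ed_0 = {0: {1: 1.0, 2: 1.0}, 1: {0: 1.0}, 2: {0: 1.0}}
L_0 = {0: 0, 1: 1, 2: 1}          # current integer node labels

# Credential per node: own label plus the sorted multiset of neighbour
# labels, exactly as in the loops above.
L_temp = {v: str(L_0[v]) + "," + str(sorted(L_0[n] for n in Gs_ed_0[v]))
          for v in Gs_ed_0}
# {0: '0,[1, 1]', 1: '1,[0]', 2: '1,[0]'}

# Compress the new credentials to fresh consecutive integers, continuing
# from the running label_count.
label_count = 2
WL_labels_inverse = {c: label_count + i
                     for (i, c) in enumerate(sorted(set(L_temp.values())))}
L_0 = {v: WL_labels_inverse[c] for (v, c) in L_temp.items()}
print(L_0)                         # {0: 2, 1: 3, 2: 3}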
Example #4
    def parse_input(self, X, return_embedding_only=False):
        """Parse input for weisfeiler lehman.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        return_embedding_only : bool
            Whether to return the embedding of the graphs only, instead of
            computing the kernel all the way to the end.

        Returns
        -------
        base_graph_kernel : object
            Returns base_graph_kernel. If called from `fit_transform`, the
            summed kernel matrix `K` is returned alongside it.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx = 0
            Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0],
                                      x[1],
                                      x[2],
                                      graph_format=self._graph_format)
                            extra = (x.get_labels(purpose=self._graph_format,
                                                  label_type="edge",
                                                  return_none=True), ) + extra
                        else:
                            x = Graph(x[0],
                                      x[1], {},
                                      graph_format=self._graph_format)
                            extra = tuple()

                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                    el = x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True)
                    if el is None:
                        extra = tuple()
                    else:
                        extra = (el, )

                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")
                extras[nx] = extra
                distinct_values |= set(itervalues(L[nx]))
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx
        WL_labels_inverse = OrderedDict()

        # assign a number to each label
        label_count = 0
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            label_count += 1

        # Initialize an inverse dictionary of labels for all iterations,
        # keyed in terms of the *previous layer*
        self._inv_labels = OrderedDict()
        self._inv_labels[0] = deepcopy(WL_labels_inverse)
        self.feature_dims.append(
            len(WL_labels_inverse))  # Update the zeroth iteration feature dim

        # self._inv_label_node_attr = OrderedDict()  # Inverse dictionary of labels, in term of the *node attribute*
        # self._label_node_attr = OrderedDict()  # Same as above, but with key and value inverted
        # self._label_node_attr[0], self._inv_label_node_attr[0] = self.translate_label(WL_labels_inverse, 0)

        # if self.node_weights is not None:
        #     self._feature_weight = OrderedDict()
        #     # Ensure the order is the same
        #     self._feature_weight[0] = self._compute_feature_weight(self.node_weights, 0, WL_labels_inverse)[1]
        # else:
        #     self._feature_weight = None

        def generate_graphs(label_count, WL_labels_inverse):
            new_graphs = list()
            for j in range(self._nx):
                new_labels = dict()
                for k in L[j].keys():
                    new_labels[k] = WL_labels_inverse[L[j][k]]
                L[j] = new_labels
                # add new labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            yield new_graphs

            for i in range(1, self._h):
                label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
                for j in range(nx):
                    # Find unique labels and sort
                    # them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                                     str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        label_set.add(credential)

                label_list = sorted(list(label_set))
                for dv in label_list:
                    WL_labels_inverse[dv] = label_count
                    label_count += 1

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for k in L_temp[j].keys():
                        new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                    L[j] = new_labels
                    # relabel
                    new_graphs.append((Gs_ed[j], new_labels) + extras[j])
                self._inv_labels[i] = WL_labels_inverse
                # Compute the translated inverse node label
                # self._label_node_attr[i], self._inv_label_node_attr[i] = self.translate_label(WL_labels_inverse, i, self._label_node_attr[i - 1])
                # self.feature_dims.append(self.feature_dims[-1] + len(self._label_node_attr[i]))
                # Compute the feature weight of the current layer
                # if self.node_weights is not None:
                #     self._feature_weight[i] = self._compute_feature_weight(self.node_weights, i, self._inv_label_node_attr[i])[1]
                # assert len(self._feature_weight[i] == len(WL_labels_inverse))
                yield new_graphs

        # Initialise the base graph kernel.
        base_graph_kernel = {}

        K = []
        for (i, g) in enumerate(generate_graphs(label_count,
                                                WL_labels_inverse)):
            param = self._params
            # if self._feature_weight is not None:
            # print(self._feature_weight)
            # param.update({'mahalanobis_precision': self._feature_weight[i]})
            base_graph_kernel.update({i: self._base_graph_kernel(**param)})
            # if return_embedding_only:
            #     K.append(base_graph_kernel[i].parse_input(
            #         g, label_start_idx=self.feature_dims[i], label_end_idx=self.feature_dims[i + 1]))
            # else:
            if self._method_calling == 1:
                base_graph_kernel[i].fit(g)
            else:
                K.append(base_graph_kernel[i].fit_transform(g))

        # if return_embedding_only:
        #     return K
        if self._method_calling == 1:
            return base_graph_kernel
        elif self._method_calling == 2:
            # if self.as_tensor:
            #     K = torch.stack(K, dim=0).sum(dim=0)
            #     return K, base_graph_kernel
            return np.sum(K, axis=0), base_graph_kernel
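The per-iteration kernels are aggregated by a plain sum. A minimal numpy sketch with invented matrices standing in for the base_graph_kernel[i].fit_transform(g) results:

import numpy as np

# Hypothetical kernel matrices for h = 3 WL iterations over 2 graphs.
K_per_iteration = [np.array([[4., 1.], [1., 3.]]),
                   np.array([[2., 0.], [0., 2.]]),
                   np.array([[1., 1.], [1., 1.]])]

K = np.sum(K_per_iteration, axis=0)   # same aggregation as above
print(K)   # [[7. 2.]
           #  [2. 6.]]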
Example #5
    def parse_input(self, X):
        """Parse and create features for the NSPD kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        M : dict
            A dictionary with keys all the distances from 0 to self.d
            and values the np.arrays with rows corresponding to the
            non-null input graphs and columns to the enumerations of tuples
            consisting of pairs of hash values and radius, from all the given
            graphs of the input (plus the fitted ones on transform).

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            # Hold the number of graphs
            ng = 0

            # Holds all the data for combinations of r, d
            data = collections.defaultdict(dict)

            # Index all keys for combinations of r, d
            all_keys = collections.defaultdict(dict)
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: ' + str(idx))
                        continue
                    else:
                        g = Graph(x[0], x[1], x[2])
                        g.change_format("adjacency")
                elif type(x) is Graph:
                    g = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency", label_type="vertex"),
                        x.get_labels(purpose="adjacency", label_type="edge"))
                else:
                    raise TypeError('each element of X must have either ' +
                                    'a graph with labels for node and edge ' +
                                    'or 3 elements consisting of a graph ' +
                                    'type object, labels for vertices and ' +
                                    'labels for edges.')

                # Bring to the desired format
                g.change_format(self._graph_format)

                # Take the vertices
                vertices = set(g.get_vertices(purpose=self._graph_format))

                # Extract the edge dictionary
                ed = g.get_edge_dictionary()

                # Convert edges to tuples
                edges = {(j, k) for j in ed.keys() for k in ed[j].keys()}

                # Extract labels for nodes
                Lv = g.get_labels(purpose=self._graph_format)
                # and for edges
                Le = g.get_labels(purpose=self._graph_format,
                                  label_type="edge")

                # Produce all the neighborhoods and the distance pairs
                # up to the desired radius and maximum distance
                N, D, D_pair = g.produce_neighborhoods(self.r,
                                                       purpose="dictionary",
                                                       with_distances=True,
                                                       d=self.d)

                # Hash all the neighborhoods
                H = self._hash_neighborhoods(vertices, edges, Lv, Le, N,
                                             D_pair)

                if self._method_calling == 1:
                    for d in filterfalse(lambda x: x not in D,
                                         range(self.d + 1)):
                        for (A, B) in D[d]:
                            for r in range(self.r + 1):
                                key = (H[r, A], H[r, B])
                                keys = all_keys[r, d]
                                idx = keys.get(key, None)
                                if idx is None:
                                    idx = len(keys)
                                    keys[key] = idx
                                data[r, d][ng, idx] = data[r, d].get(
                                    (ng, idx), 0) + 1

                elif self._method_calling == 3:
                    for d in filterfalse(lambda x: x not in D,
                                         range(self.d + 1)):
                        for (A, B) in D[d]:
                            # Based on the edges of the bidirected graph
                            for r in range(self.r + 1):
                                keys = all_keys[r, d]
                                fit_keys = self._fit_keys[r, d]
                                key = (H[r, A], H[r, B])
                                idx = fit_keys.get(key, None)
                                if idx is None:
                                    idx = keys.get(key, None)
                                    if idx is None:
                                        idx = len(keys) + len(fit_keys)
                                        keys[key] = idx
                                data[r, d][ng, idx] = data[r, d].get(
                                    (ng, idx), 0) + 1
                ng += 1
            if ng == 0:
                raise ValueError('parsed input is empty')

            if self._method_calling == 1:
                # A feature matrix for all levels
                M = dict()

                for (key, d) in filterfalse(lambda a: len(a[1]) == 0,
                                            iteritems(data)):
                    indexes, values = zip(*iteritems(d))
                    rows, cols = zip(*indexes)
                    M[key] = csr_matrix((values, (rows, cols)),
                                        shape=(ng, len(all_keys[key])),
                                        dtype=np.int64)
                self._fit_keys = all_keys
                self._ngx = ng

            elif self._method_calling == 3:
                # A feature matrix for all levels
                M = dict()

                for (key, d) in filterfalse(lambda a: len(a[1]) == 0,
                                            iteritems(data)):
                    indexes, values = zip(*iteritems(d))
                    rows, cols = zip(*indexes)
                    M[key] = csr_matrix(
                        (values, (rows, cols)),
                        shape=(ng,
                               len(all_keys[key]) + len(self._fit_keys[key])),
                        dtype=np.int64)

                self._ngy = ng

            return M
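A small sketch of how the {(row, column): count} dictionaries collected above turn into CSR feature matrices. The counts are invented; only the csr_matrix construction mirrors the code.

import numpy as np
from scipy.sparse import csr_matrix

# Hypothetical counts for one (r, d) level: graph 0 saw feature 0 twice
# and feature 1 once; graph 1 saw feature 1 three times.
d = {(0, 0): 2, (0, 1): 1, (1, 1): 3}

indexes, values = zip(*d.items())
rows, cols = zip(*indexes)
M_rd = csr_matrix((values, (rows, cols)), shape=(2, 2), dtype=np.int64)
print(M_rd.toarray())   # [[2 1]
                        #  [0 3]]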
Example #6
    def parse_input(self, X):
        """Parse input and create features, while initializing and/or calculating sub-kernels.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        base_graph_kernel : object
            Returns base_graph_kernel. Only if called from `fit` or `fit_transform`.

        K : np.array
            Returns the kernel matrix. Only if called from `transform` or
            `fit_transform`.

        """
        if self.base_graph_kernel_ is None:
            raise ValueError('User must provide a base_graph_kernel')
        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx, labels = 0, list()
            if self._method_calling in [1, 2]:
                nl, labels_enum, base_graph_kernel = 0, dict(), dict()
                for kidx in range(self.n_iter):
                    base_graph_kernel[kidx] = self.base_graph_kernel_[0](
                        **self.base_graph_kernel_[1])
            elif self._method_calling == 3:
                nl, labels_enum, base_graph_kernel = len(
                    self._labels_enum), dict(self._labels_enum), self.X
            inp = list()
            neighbors = list()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, collections.Iterable):
                    x, is_iter = list(x), True
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0],
                                      x[1],
                                      x[2],
                                      graph_format=self._graph_format)
                            extra = (x.get_labels(purpose='any',
                                                  label_type="edge",
                                                  return_none=True), ) + extra
                        else:
                            x = Graph(x[0],
                                      x[1], {},
                                      graph_format=self._graph_format)
                            extra = tuple()
                elif type(x) is Graph:
                    el = x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True)
                    if el is None:
                        extra = tuple()
                    else:
                        extra = (el, )
                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')

                label = x.get_labels(purpose='any')
                inp.append((x.get_graph_object(), extra))
                neighbors.append(x.get_edge_dictionary())
                labels.append(label)
                for v in set(itervalues(label)):
                    if v not in labels_enum:
                        labels_enum[v] = nl
                        nl += 1
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Calculate the hadamard matrix
        H = hadamard(int(2**(ceil(log2(nl)))))

        def generate_graphs(labels):
            # Initial labeling of vertices based on their corresponding Hadamard code (i-th row of the
            # Hadamard matrix), where i is the label's index in the enumeration
            new_graphs, new_labels = list(), list()
            for ((obj, extra), label) in zip(inp, labels):
                new_label = dict()
                for (k, v) in iteritems(label):
                    new_label[k] = H[labels_enum[v], :]
                new_graphs.append(
                    (obj, {i: tuple(j)
                           for (i, j) in iteritems(new_label)}) + extra)
                new_labels.append(new_label)

            yield new_graphs
            # Main
            for i in range(1, self.n_iter):
                new_graphs, labels, new_labels = list(), new_labels, list()
                for ((obj, extra), neighbor,
                     old_label) in zip(inp, neighbors, labels):
                    # Find unique labels and sort them for both graphs and keep for each node
                    # the temporary
                    new_label = dict()
                    for (k, ns) in iteritems(neighbor):
                        new_label[k] = old_label[k]
                        for q in ns:
                            new_label[k] = np.add(new_label[k], old_label[q])
                    new_labels.append(new_label)
                    new_graphs.append(
                        (obj, {i: tuple(j)
                               for (i, j) in iteritems(new_label)}) + extra)
                yield new_graphs

        if self._method_calling in [1, 2]:
            base_graph_kernel = {
                i: self.base_graph_kernel_[0](**self.base_graph_kernel_[1])
                for i in range(self.n_iter)
            }

        if self._parallel is None:
            # Add the zero iteration element
            if self._method_calling == 1:
                for (i, g) in enumerate(generate_graphs(labels)):
                    base_graph_kernel[i].fit(g)
            elif self._method_calling == 2:
                K = np.sum((base_graph_kernel[i].fit_transform(g)
                            for (i, g) in enumerate(generate_graphs(labels))),
                           axis=0)
            elif self._method_calling == 3:
                # Calculate the kernel matrix without parallelization
                K = np.sum((self.X[i].transform(g)
                            for (i, g) in enumerate(generate_graphs(labels))),
                           axis=0)

        else:
            if self._method_calling == 1:
                self._parallel(
                    joblib.delayed(efit)(base_graph_kernel[i], g)
                    for (i, g) in enumerate(generate_graphs(labels)))
            elif self._method_calling == 2:
                # Calculate the kernel matrix with parallelization
                K = np.sum(self._parallel(
                    joblib.delayed(efit_transform)(base_graph_kernel[i], g)
                    for (i, g) in enumerate(generate_graphs(labels))),
                           axis=0)
            elif self._method_calling == 3:
                # Calculate the kernel matrix with parallelization
                K = np.sum(self._parallel(
                    joblib.delayed(etransform)(self.X[i], g)
                    for (i, g) in enumerate(generate_graphs(labels))),
                           axis=0)

        if self._method_calling == 1:
            self._labels_enum = labels_enum
            return base_graph_kernel
        elif self._method_calling == 2:
            self._labels_enum = labels_enum
            return K, base_graph_kernel
        elif self._method_calling == 3:
            return K
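A standalone sketch of the Hadamard-code labelling used above: each enumerated node label is mapped to a row of a Hadamard matrix, and one refinement step adds the neighbours' codes. labels_enum, label and neighbors are toy values.

import numpy as np
from math import ceil, log2
from scipy.linalg import hadamard

labels_enum = {'C': 0, 'O': 1, 'N': 2}          # hypothetical enumeration
nl = len(labels_enum)
H = hadamard(int(2 ** ceil(log2(nl))))          # 4 x 4 Hadamard matrix

# Initial labelling: every node gets the Hadamard row of its label.
label = {0: 'C', 1: 'O', 2: 'N'}
codes = {v: H[labels_enum[lab], :] for (v, lab) in label.items()}

# One refinement step: add the codes of the neighbours (edge-dictionary form).
neighbors = {0: {1: 1.0}, 1: {0: 1.0, 2: 1.0}, 2: {1: 1.0}}
new_codes = {v: codes[v] + sum(codes[q] for q in neighbors[v])
             for v in neighbors}
print(new_codes[1])   # row 1 + row 0 + row 2 of H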
Example #7
    def parse_input(self, X):
        """Parse input for weisfeiler lehman optimal assignment.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        Hs : numpy array, shape = [n_input_graphs, hierarchy_size]
            An array where the rows contain the histograms of the graphs.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx = 0
            Gs_ed, L, distinct_values = dict(), dict(), set()
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: '
                                      + str(idx))
                        continue
                    else:
                        if len(x) > 2:
                            extra = tuple()
                            if len(x) > 3:
                                extra = tuple(x[3:])
                            x = Graph(x[0], x[1], x[2], graph_format=self._graph_format)
                            extra = (x.get_labels(purpose=self._graph_format,
                                                  label_type="edge", return_none=True), ) + extra
                        else:
                            x = Graph(x[0], x[1], {}, graph_format=self._graph_format)
                            extra = tuple()

                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a ' +
                                    'graph object or a list with at least ' +
                                    'a graph like object and node labels ' +
                                    'dict \n')
                Gs_ed[nx] = x.get_edge_dictionary()
                L[nx] = x.get_labels(purpose="dictionary")
                distinct_values |= set(itervalues(L[nx]))
                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # Initialize hierarchy
        self._hierarchy = dict()
        self._hierarchy['root'] = dict()
        self._hierarchy['root']['parent'] = None
        self._hierarchy['root']['children'] = list()
        self._hierarchy['root']['w'] = 0
        self._hierarchy['root']['omega'] = 0

        # get all the distinct values of current labels
        WL_labels_inverse = dict()

        # assign a number to each label
        label_count = 0
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, 'root')
            label_count += 1

        # Initialize an inverse dictionary of labels for all iterations
        self._inv_labels = dict()
        self._inv_labels[0] = WL_labels_inverse

        for j in range(nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels

        for i in range(1, self._n_iter):
            new_previous_label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # Find unique labels and sort
                # them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    new_previous_label_set.add((credential, L[j][v]))

            label_list = sorted(list(new_previous_label_set), key=lambda tup: tup[0])
            for dv, previous_label in label_list:
                WL_labels_inverse[dv] = label_count
                self._insert_into_hierarchy(label_count, previous_label)
                label_count += 1

            # Recalculate labels
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
            self._inv_labels[i] = WL_labels_inverse

        # Compute the vector representation of each graph
        if self.sparse:
            Hs = lil_matrix((nx, len(self._hierarchy)))
        else:
            Hs = np.zeros((nx, len(self._hierarchy)))
        for j in range(nx):
            for k in L[j].keys():
                current_label = L[j][k]
                while self._hierarchy[current_label]['parent'] is not None:
                    Hs[j, current_label] += self._hierarchy[current_label]['omega']
                    current_label = self._hierarchy[current_label]['parent']

        return Hs
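A toy illustration of the hierarchy walk that fills the histogram Hs: every node label is propagated towards the root, adding the omega weight of each ancestor. The hierarchy and labels are invented; the accumulation loop mirrors the one above.

import numpy as np

# Hypothetical hierarchy in the same shape as self._hierarchy:
# label 2 was refined from label 0, which hangs off the root.
hierarchy = {
    'root': {'parent': None, 'omega': 0},
    0: {'parent': 'root', 'omega': 1},
    1: {'parent': 'root', 'omega': 1},
    2: {'parent': 0, 'omega': 1},
}

# Final node labels of a single toy graph after the WL iterations.
node_labels = {0: 2, 1: 1}

Hs = np.zeros((1, len(hierarchy)))
for current_label in node_labels.values():
    while hierarchy[current_label]['parent'] is not None:
        Hs[0, current_label] += hierarchy[current_label]['omega']
        current_label = hierarchy[current_label]['parent']
print(Hs)   # [[1. 1. 1. 0.]]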
Example #8
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the
            given graph format). If None the kernel matrix is calculated upon
            fit data.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between targets and features

        """
        self._method_calling = 3
        # Check if fit has been called
        check_is_fitted(self, ['X', '_nx', '_hierarchy', '_inv_labels'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            if not isinstance(X, collections.Iterable):
                raise ValueError('input must be an iterable\n')
            else:
                nx = 0
                distinct_values = set()
                Gs_ed, L = dict(), dict()
                for (i, x) in enumerate(iter(X)):
                    is_iter = isinstance(x, collections.Iterable)
                    if is_iter:
                        x = list(x)
                    if is_iter and len(x) in [0, 2, 3]:
                        if len(x) == 0:
                            warnings.warn('Ignoring empty element on index: '
                                          + str(i))
                            continue

                        elif len(x) in [2, 3]:
                            x = Graph(x[0], x[1], {}, self._graph_format)
                    elif type(x) is Graph:
                        x.desired_format("dictionary")
                    else:
                        raise ValueError('each element of X must have at ' +
                                         'least one and at most 3 elements\n')
                    Gs_ed[nx] = x.get_edge_dictionary()
                    L[nx] = x.get_labels(purpose="dictionary")

                    # Hold all the distinct values
                    distinct_values |= set(
                        v for v in itervalues(L[nx])
                        if v not in self._inv_labels[0])
                    nx += 1
                if nx == 0:
                    raise ValueError('parsed input is empty')

        # get all the distinct values of new labels
        WL_labels_inverse = dict()

        # assign a number to each label
        label_count = sum([len(self._inv_labels[i]) for i in range(len(self._inv_labels))])
        for dv in sorted(list(distinct_values)):
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, 'root')
            label_count += 1

        for j in range(nx):
            new_labels = dict()
            for (k, v) in iteritems(L[j]):
                if v in self._inv_labels[0]:
                    new_labels[k] = self._inv_labels[0][v]
                else:
                    new_labels[k] = WL_labels_inverse[v]
            L[j] = new_labels

        for i in range(1, self._n_iter):
            L_temp, new_previous_label_set = dict(), set()
            for j in range(nx):
                # Find unique labels and sort them for both graphs
                # Keep for each node the temporary
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    if credential not in self._inv_labels[i]:
                        new_previous_label_set.add((credential, L[j][v]))

            # Calculate the new label_set
            WL_labels_inverse = dict()
            if len(new_previous_label_set) > 0:
                for dv, previous_label in sorted(list(new_previous_label_set), key=lambda tup: tup[0]):
                    WL_labels_inverse[dv] = label_count
                    self._insert_into_hierarchy(label_count, previous_label)
                    label_count += 1

            # Recalculate labels
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L_temp[j]):
                    if v in self._inv_labels[i]:
                        new_labels[k] = self._inv_labels[i][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels

        # Compute the vector representation of each graph
        if self.sparse:
            Hs = lil_matrix((nx, len(self._hierarchy)))
        else:
            Hs = np.zeros((nx, len(self._hierarchy)))
        for j in range(nx):
            for k in L[j].keys():
                current_label = L[j][k]
                while self._hierarchy[current_label]['parent'] is not None:
                    Hs[j, current_label] += self._hierarchy[current_label]['omega']
                    current_label = self._hierarchy[current_label]['parent']

        self.Y = Hs

        # Compute the histogram intersection kernel
        K = np.zeros((nx, self._nx))
        if self.sparse:
            for i in range(nx):
                for j in range(self._nx):
                    K[i, j] = np.sum(Hs[i, :self.X.shape[1]].minimum(self.X[j, :]))
        else:
            for i in range(nx):
                for j in range(self._nx):
                    K[i, j] = np.sum(np.min([Hs[i, :self.X.shape[1]], self.X[j, :]], axis=0))

        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            old_settings = np.seterr(divide='ignore')
            K = np.nan_to_num(np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))
            np.seterr(**old_settings)

        return K
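A self-contained numpy sketch of the histogram intersection kernel and of the optional normalization applied at the end of transform. The histograms are invented; the double loop mirrors the dense branch above.

import numpy as np

# Hypothetical WL-OA histograms: 2 transform graphs vs 2 fitted graphs.
Hs_y = np.array([[1., 2., 0.],
                 [0., 1., 3.]])   # rows: transform graphs
Hs_x = np.array([[2., 1., 0.],
                 [1., 1., 1.]])   # rows: fitted graphs (self.X)

K = np.zeros((Hs_y.shape[0], Hs_x.shape[0]))
for i in range(Hs_y.shape[0]):
    for j in range(Hs_x.shape[0]):
        K[i, j] = np.sum(np.minimum(Hs_y[i], Hs_x[j]))

# Optional normalization, as in the self.normalize branch.
X_diag = Hs_x.sum(axis=1)   # k(x, x) for histogram intersection
Y_diag = Hs_y.sum(axis=1)
K_norm = K / np.sqrt(np.outer(Y_diag, X_diag))
print(K)        # [[2. 2.]
                #  [1. 2.]]
print(K_norm)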