Beispiel #1
0
def test_graph_edge_dictionary():
    """Verify Graph consistency when it is initialised from an edge dictionary.

    The same edge dictionary is loaded under every supported graph format.
    The label groups and the shortest path matrix (together with its labels)
    are then compared against hand written expectations.  When the module
    level ``verbose`` flag is truthy the results are printed instead of
    being asserted.
    """
    # Edge dictionary: source vertex -> {target vertex: weight}
    edge_dictionary = {
        'a': {'a': 1, 'b': 1, 'd': 3},
        'b': {'a': 1, 'd': 2},
        'c': {'a': 2, 'b': 3, 'd': 1},
        'd': {'a': 1},
    }
    node_labels = {'a': 'banana', 'b': 'cherry', 'c': 'banana', 'd': 'cherry'}

    # One Graph per supported format, keyed by a short tag
    format_by_tag = (("auto", "auto"),
                     ("dict", "dictionary"),
                     ("adjc", "adjacency"),
                     ("all", "all"))
    graphs = {
        tag: Graph(edge_dictionary, node_labels, {}, graph_format)
        for tag, graph_format in format_by_tag
    }

    # Expected label groups, expressed with vertex symbols and with indexes
    groups_by_symbol = {'cherry': {'d', 'b'}, 'banana': {'a', 'c'}}
    groups_by_index = {'banana': {0, 2}, 'cherry': {1, 3}}

    for tag, graph in graphs.items():
        label_group = graph.get_label_group()
        if verbose:
            print(tag)
            print(label_group, '\n')
            continue
        # Only the pure adjacency format is compared against index groups
        expected = groups_by_index if tag == "adjc" else groups_by_symbol
        found = {name: set(label_group[name]) for name in label_group.keys()}
        npt.assert_equal(expected, found)

    # Expected shortest path matrix (vertex 2 / 'c' is unreachable)
    inf = float("Inf")
    expected_spm = [[0., 1., inf, 3.],
                    [1., 0., inf, 2.],
                    [2., 3., 0., 1.],
                    [1., 2., inf, 0.]]
    expected_labels = {0: 'banana', 1: 'cherry', 2: 'banana', 3: 'cherry'}

    for tag, graph in graphs.items():
        spm, spl = graph.build_shortest_path_matrix(algorithm_type="auto")
        if verbose:
            print(tag)
            print(spm, '\n', spl, '\n')
            continue
        npt.assert_array_equal(spm, expected_spm)
        npt.assert_equal(spl, expected_labels)
Beispiel #2
0
    def parse_input(self, X):
        """Parse the input and create the features of the svm_theta kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at least one and at most three entries.  Only the first
            entry (a valid graph structure: adjacency matrix or
            edge dictionary) is used here; any further entries are ignored.
            Empty iterables are skipped with a warning.

        Returns
        -------
        out : list
            For every parsed graph, the value returned by
            ``self._calculate_svm_theta_levels_`` for its adjacency matrix
            and the dual coefficients computed by ``_calculate_svm_theta_``.

        Raises
        ------
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with at most three entries.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        out = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element ' +
                                  'on index: ' + str(idx))
                    continue
                x = Graph(x[0], {}, {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 1 ' +
                                'and at most 3 elements\n')
            A = x.get_adjacency_matrix()
            dual_coeffs = _calculate_svm_theta_(A)
            out.append(self._calculate_svm_theta_levels_(A, dual_coeffs))

        # One entry is appended per parsed graph, so an empty list means
        # that nothing was parsed.
        if not out:
            raise ValueError('parsed input is empty')

        return out
Beispiel #3
0
    def parse_input(self, X):
        """Parse the given input and raise errors if it is invalid.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at least one and at most three entries: a valid graph
            structure (adjacency matrix or edge dictionary), optionally
            followed by node labels and edge labels (that correspond to the
            given graph format).  Empty iterables are skipped with a warning.

        Returns
        -------
        Xp : list
            List of graph type objects.

        Raises
        ------
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with at most three entries.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        Xp = list()
        for (i, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    # The two message fragments used to be joined without a
                    # separating space ("elementon index").
                    warnings.warn('Ignoring empty element on index: ' +
                                  str(i) + '..')
                    continue
                # Missing node / edge labels default to empty dictionaries
                node_labels = x[1] if len(x) >= 2 else {}
                edge_labels = x[2] if len(x) == 3 else {}
                Xp.append(Graph(x[0], node_labels, edge_labels,
                                self._graph_format))
            elif type(x) is Graph:
                Xp.append(x)
            else:
                raise TypeError('Each element of X must have at least ' +
                                'one and at most 3 elements.\n')
        if len(Xp) == 0:
            raise ValueError('Parsed input is empty.')
        return Xp
Beispiel #4
0
    def parse_input(self, X):
        """Parse and create features for the propagation kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at least two entries: a valid graph structure (adjacency
            matrix or edge dictionary) and the node labels (that correspond
            to the given graph format).  Further entries are ignored.
            Empty iterables are skipped with a warning.

        Returns
        -------
        out : tuple
            A tuple corresponding to the calculated bigDAG, as accumulated
            by ``big_dag_append``.  When called from transform
            (``self._method_calling == 3``) the accumulation starts from a
            deep copy of ``self.X``.

        Raises
        ------
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with at least two entries.
        ValueError
            If no graph could be parsed from `X`.

        """
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        i = 0
        out = None
        if self._method_calling == 3:
            out = copy.deepcopy(self.X)
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element' +
                                  ' on index: ' + str(idx))
                    continue
                x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must have either ' +
                                'a graph with labels for node and edge ' +
                                'or 3 elements consisting of a graph ' +
                                'type object, labels for vertices and ' +
                                'labels for edges.')
            out = big_dag_append(make_big_dag(x, self.h_),
                                 out,
                                 merge_features=False)
            i += 1

        # Remember how many graphs were parsed for the current call type
        if self._method_calling == 1:
            self._nx = i
        elif self._method_calling == 3:
            self._ny = i
        if i == 0:
            # The message used to sit on its own line after a bare
            # ``raise ValueError`` and therefore never reached the exception.
            raise ValueError('parsed input is empty')
        return out
Beispiel #5
0
    def parse_input(self, X):
        """Parse and create features for the `shortest_path` kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with two or three entries: a valid graph structure (adjacency
            matrix or edge dictionary), the node labels and optionally the
            edge labels (that correspond to the given graph format).  The
            edge labels are not used here.  Empty iterables are skipped with
            a warning.

        Returns
        -------
        sp_attr_tup : list
            A list of tuples of shortest path matrices and their attributes,
            as returned by ``Graph.build_shortest_path_matrix``.

        Raises
        ------
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with two or three entries.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        sp_attr_tup = list()
        for (i, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element' +
                                  ' on index: '+str(i))
                    continue
                graph = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is Graph:
                graph = x
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 2 ' +
                                'and at most 3 elements\n')

            S, L = graph.build_shortest_path_matrix(self.algorithm_type)
            sp_attr_tup.append((S, L))

        # One tuple is appended per parsed graph
        if not sp_attr_tup:
            raise ValueError('parsed input is empty')

        return sp_attr_tup
Beispiel #6
0
def test_graph_adjacency():
    """Verify Graph consistency when it is initialised from an adjacency matrix.

    The same adjacency matrix is loaded under every supported graph format.
    The label groups and the shortest path matrix (together with its labels)
    are then compared against hand written expectations.  When the module
    level ``verbose`` flag is truthy the results are printed instead of
    being asserted.
    """
    # Weighted adjacency matrix and the labels of its four vertices
    adjacency = np.array([[1, 1, 0, 3],
                          [1, 0, 0, 2],
                          [2, 3, 0, 1],
                          [1, 0, 0, 0]])
    node_labels = {0: 'banana', 1: 'cherry', 2: 'banana', 3: 'cherry'}

    # One Graph per supported format, keyed by a short tag
    format_by_tag = (("auto", "auto"),
                     ("dict", "dictionary"),
                     ("adjc", "adjacency"),
                     ("all", "all"))
    graphs = {
        tag: Graph(adjacency, node_labels, {}, graph_format)
        for tag, graph_format in format_by_tag
    }

    # Expected label groups (vertex indexes per label)
    expected_groups = {'cherry': [1, 3], 'banana': [0, 2]}

    for tag, graph in graphs.items():
        label_group = graph.get_label_group()
        if verbose:
            print(tag)
            print(label_group, '\n')
            continue
        npt.assert_equal(expected_groups, label_group)

    # Expected shortest path matrix (vertex 2 is unreachable)
    inf = float("Inf")
    expected_spm = [[0., 1., inf, 3.],
                    [1., 0., inf, 2.],
                    [2., 3., 0., 1.],
                    [1., 2., inf, 0.]]

    for tag, graph in graphs.items():
        spm, spl = graph.build_shortest_path_matrix(algorithm_type="auto")
        if verbose:
            print(tag)
            print(spm, '\n', spl, '\n')
            continue
        npt.assert_array_equal(spm, expected_spm)
        # The shortest path labels must equal the input node labels
        npt.assert_equal(spl, node_labels)
Beispiel #7
0
    def parse_input(self, X):
        """Parse and create features for random_walk kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at least one and at most three entries.  Only the first
            entry (a valid graph structure: adjacency matrix or
            edge dictionary) is used here; any further entries are ignored.
            Empty iterables are skipped with a warning.

        Returns
        -------
        out : list
            For every parsed graph, the value returned by
            ``self.add_input_`` for its adjacency matrix.

        Raises
        ------
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with at most three entries.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        out = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    # An empty element has no graph structure to read;
                    # indexing ``x[0]`` would raise IndexError, so the
                    # element is skipped.
                    warnings.warn(
                        f'Ignoring empty element on index: {idx}'
                    )
                    continue
                A = Graph(x[0], {}, {},
                          self._graph_format).get_adjacency_matrix()
            elif type(x) is Graph:
                A = x.get_adjacency_matrix()
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 1 ' +
                                'and at most 3 elements\n')
            out.append(self.add_input_(A))

        # One entry is appended per parsed graph
        if not out:
            raise ValueError('parsed input is empty')

        return out
Beispiel #8
0
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            The test samples.  Each element must be either a ``Graph``
            object or an iterable with two or three entries: a valid graph
            structure (adjacency matrix or edge dictionary), the node labels
            and optionally the edge labels (fitting the given graph format).
            The edge labels are not used here.  Empty elements and elements
            without node labels are skipped with a warning.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target and features, as returned by
            ``self._calculate_kernel_matrix``.

        Raises
        ------
        ValueError
            If `X` is None or no graph could be parsed from it.
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with at most three entries.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        self._method_calling = 3
        # Check if fit had been called
        check_is_fitted(self, ['X'])

        # Input validation and parsing
        if X is None:
            raise ValueError('`transform` input cannot be None')
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        out = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                elif len(x) == 1:
                    # Without labels nothing can be hashed: skip the element
                    # instead of reusing the vertices / labels of a previous
                    # one, and report its position rather than the number of
                    # graphs parsed so far.
                    warnings.warn(
                        'Ignoring empty element on index: '
                        + str(idx) + '\nLabels must be provided.')
                    continue
                else:
                    x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must be either '
                                'a graph object or a list with at '
                                'least a graph like object and '
                                'node labels dict \n')

            vertices = list(x.get_vertices(purpose="any"))
            Labels = x.get_labels(purpose="any")

            # Hash based on the labels of fit; labels that were not seen
            # during fit are mapped to None
            new_labels = {v: self._labels_hash_dict.get(l, None)
                          for v, l in iteritems(Labels)}

            # Radix sort the other
            g = ((vertices, new_labels) +
                 ({n: x.neighbors(n, purpose="any")
                   for n in vertices},))

            gr = {0: self.NH_(g)}
            for r in range(1, self.R):
                gr[r] = self.NH_(gr[r-1])

            # save the output for all levels
            out.append(gr)

        # This check has to run after the loop: inside the loop it was
        # evaluated only after a graph had been counted and never fired.
        if not out:
            raise ValueError('parsed input is empty')

        # Transform - calculate kernel matrix
        # Output is always normalized
        km = self._calculate_kernel_matrix(out)
        self._is_transformed = True
        return km
Beispiel #9
0
    def fit(self, X, y=None):
        """Fit a dataset, for a transformer.

        Parameters
        ----------
        X : iterable
            The train samples.  Each element must be either a ``Graph``
            object or an iterable with two or three entries: a valid graph
            structure (adjacency matrix or edge dictionary), the node labels
            and optionally the edge labels (fitting the given graph format).
            The edge labels are not used here.  Empty elements and elements
            without node labels are skipped with a warning.

        y : None
            There is no need of a target in a transformer, yet the pipeline API
            requires this parameter.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If `X` is None or no graph could be parsed from it.
        TypeError
            If `X` is not an iterable, or one of its elements is neither a
            ``Graph`` nor an iterable with at most three entries.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        self._method_calling = 1
        self._is_transformed = False
        # Input validation and parsing
        self.initialize()
        if X is None:
            raise ValueError('`fit` input cannot be None')
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        gs = list()
        self._labels_hash_dict, labels_hash_set = dict(), set()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                elif len(x) == 1:
                    # Without labels nothing can be hashed: skip the element
                    # instead of reusing the vertices / labels of a previous
                    # one, and report its position rather than the number of
                    # graphs parsed so far.
                    warnings.warn(
                        'Ignoring empty element on index: '
                        + str(idx) + '\nLabels must be provided.')
                    continue
                else:
                    x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must be either '
                                'a graph object or a list with at '
                                'least a graph like object and '
                                'node labels dict \n')

            vertices = list(x.get_vertices(purpose="any"))
            Labels = x.get_labels(purpose="any")
            g = (vertices, Labels,
                 {n: x.neighbors(n, purpose="any") for n in vertices})

            # collect all the labels
            labels_hash_set |= set(itervalues(Labels))
            gs.append(g)

        # One tuple is stored per parsed graph
        if not gs:
            raise ValueError('parsed input is empty')

        # Hash labels
        if len(labels_hash_set) > self._max_number:
            # This branch is taken when there are MORE labels than available
            # hash values, so the message says "bigger".
            warnings.warn('Number of labels is bigger than '
                          'the biggest possible.. '
                          'Collisions will appear on the '
                          'new labels.')

            # If labels exceed the biggest possible size, draw full
            # permutations of the hash values until the remainder fits
            nl, nrl = list(), len(labels_hash_set)
            while nrl > self._max_number:
                nl += self.random_state_.choice(self._max_number,
                                                self._max_number,
                                                replace=False).tolist()
                nrl -= self._max_number
            if nrl > 0:
                nl += self.random_state_.choice(self._max_number,
                                                nrl,
                                                replace=False).tolist()
        else:
            # else draw n random numbers.
            nl = self.random_state_.choice(self._max_number,
                                           len(labels_hash_set),
                                           replace=False).tolist()

        self._labels_hash_dict = dict(zip(labels_hash_set, nl))

        # for all graphs
        out = list()
        for vertices, labels, neighbors in gs:
            new_labels = {v: self._labels_hash_dict[l]
                          for v, l in iteritems(labels)}
            g = (vertices, new_labels, neighbors,)
            gr = {0: self.NH_(g)}
            for r in range(1, self.R):
                gr[r] = self.NH_(gr[r-1])

            # save the output for all levels
            out.append(gr)

        self.X = out

        # Return the transformer
        return self
Beispiel #10
0
    def parse_input(self, X):
        """Parse and create features for the attributed propagation kernel.

        Parameters
        ----------
        X : iterable
            Each element must be one of:

            * a ``Graph`` object,
            * a pair ``(Graph, T)`` where ``T`` is a transition matrix
              or None,
            * an iterable with two to four entries: a valid graph structure
              (adjacency matrix or edge dictionary), the node attributes,
              the edge labels (unused here) and, as fourth entry, a
              transition matrix.

            When no transition matrix is given the adjacency matrix of the
            graph is used.  Empty iterables are skipped with a warning.

        Returns
        -------
        phi : list
            One dictionary per parsed graph, mapping every iteration
            ``t in range(self.t_max)`` to a ``Counter`` of the hash indexes
            of its vertices.  None is returned if ``self._method_calling``
            is neither 1 (fit) nor 3 (transform).

        Raises
        ------
        ValueError
            If `X` is not an iterable, an element has an invalid form, no
            graph could be parsed, or the attribute dimensions disagree
            (between graphs, or between fit and transform).
        TypeError
            If a transition matrix is not square or does not match the
            number of vertices, or the attributes of a single graph do not
            share the same dimension.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise ValueError('input must be an iterable\n')

        # The number of parsed graphs
        n = 0
        transition_matrix = dict()
        # indexes[k]:indexes[k + 1] are the rows of graph k inside P
        indexes = [0]
        Attr = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 2, 3, 4]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on ' +
                                  'index: ' + str(idx))
                    continue
                if len(x) == 2 and type(x[0]) is Graph:
                    g, T = x
                else:
                    g = Graph(x[0], x[1], {}, self._graph_format)
                    if len(x) == 4:
                        T = x[3]
                    else:
                        T = None
            elif type(x) is Graph:
                g, T = x, None
            else:
                raise ValueError('Each element of X must be either a ' +
                                 'Graph or an iterable with at least 2 ' +
                                 'and at most 4 elements\n')

            if T is not None:
                if T.shape[0] != T.shape[1]:
                    raise TypeError('Transition matrix on index' + ' ' +
                                    str(idx) + ' must be ' +
                                    'a square matrix.')
                if T.shape[0] != g.nv():
                    raise TypeError('Propagation matrix must ' +
                                    'have the same dimension ' +
                                    'as the number of vertices.')
            else:
                T = g.get_adjacency_matrix()

            nv = g.nv()
            # Divide every row of T by its row sum
            transition_matrix[n] = (T.T / np.sum(T, axis=1)).T
            attr = g.get_labels(purpose="adjacency")
            try:
                attributes = np.array([attr[j] for j in range(nv)])
            except TypeError:
                raise TypeError(
                    'All attributes of a single graph should have the same dimension.'
                )

            Attr.append(attributes)
            indexes.append(indexes[-1] + nv)
            n += 1

        # This check must precede ``np.vstack``: stacking an empty list
        # raises a ValueError that would be reported below as a dimension
        # mismatch.
        if n == 0:
            raise ValueError('Parsed input is empty')

        try:
            P = np.vstack(Attr)
        except ValueError:
            raise ValueError(
                'Attribute dimensions should be the same, for all graphs')

        if self._method_calling == 1:
            self._dim = P.shape[1]
        else:
            if self._dim != P.shape[1]:
                raise ValueError('transform attribute vectors should '
                                 'have the same dimension as in fit')

        # feature vectors
        if self._method_calling == 1:
            # simple normal
            self._u, self._b, self._hd = list(), list(), list()
            for t in range(self.t_max):
                u = self.random_state_.randn(self._dim)
                if self.take_cauchy_:
                    # cauchy
                    u = np.divide(u, self.random_state_.randn(self._dim))

                self._u.append(u)
                # random offset
                self._b.append(self.w *
                               self.random_state_.randn(self._dim))

            phi = {k: dict() for k in range(n)}
            for t in range(self.t_max):
                # hash all graphs inside P and produce the feature vectors
                hashes = self.calculate_LSH(P, self._u[t],
                                            self._b[t]).tolist()

                # Enumerate the distinct hashes of this iteration
                hd = {
                    j: i
                    for i, j in enumerate({tuple(l)
                                           for l in hashes})
                }
                self._hd.append(hd)

                features = np.array([hd[tuple(l)] for l in hashes])

                # Accumulate the results.
                for k in range(n):
                    phi[k][t] = Counter(
                        features[indexes[k]:indexes[k + 1]].flat)

                # calculate the Propagation matrix if needed
                if t < self.t_max - 1:
                    for k in range(n):
                        start, end = indexes[k:k + 2]
                        P[start:end, :] = np.dot(transition_matrix[k],
                                                 P[start:end, :])

            return [phi[k] for k in range(n)]

        if self._method_calling == 3:
            phi = {k: dict() for k in range(n)}
            for t in range(self.t_max):
                # hash all graphs inside P and produce the feature vectors
                hashes = self.calculate_LSH(P, self._u[t],
                                            self._b[t]).tolist()

                # Keep the enumeration stored at fit and give the hashes
                # that were not seen at fit new, consecutive indexes
                hd = dict(
                    chain(
                        iteritems(self._hd[t]),
                        iter((j, i)
                             for i, j in enumerate(
                                 filterfalse(lambda x: x in self._hd[t],
                                             {tuple(l)
                                              for l in hashes}),
                                 len(self._hd[t])))))

                features = np.array([hd[tuple(l)] for l in hashes])

                # Accumulate the results.
                for k in range(n):
                    phi[k][t] = Counter(features[indexes[k]:indexes[k +
                                                                    1]])

                # calculate the Propagation matrix if needed
                if t < self.t_max - 1:
                    for k in range(n):
                        start, end = indexes[k:k + 2]
                        P[start:end, :] = np.dot(transition_matrix[k],
                                                 P[start:end, :])

            return [phi[k] for k in range(n)]
Beispiel #11
0
    def parse_input(self, X):
        """Parse and create features for "shortest path" kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at most three entries: a valid graph structure (adjacency
            matrix or edge dictionary), the node labels and the edge labels
            (that correspond to the given graph format).  The edge labels
            are not used here.  An iterable holding only the graph
            structure is accepted only when ``self.with_labels`` is false.
            Empty iterables are skipped with a warning.

        Returns
        -------
        sp_counts : dict
            A dictionary that maps the position of every parsed graph to a
            dictionary of counts, keyed by the index of each shortest path
            tuple.

        Raises
        ------
        TypeError
            If `X` is not an iterable, or one of its elements has an
            invalid form.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC has to
        # be taken from ``collections.abc``.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        # Position of the last parsed graph (-1: nothing parsed yet)
        i = -1
        sp_counts = dict()
        if self._method_calling == 1:
            self._enum = dict()
        elif self._method_calling == 3:
            self._Y_enum = dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or
                            (len(x) == 1 and not self.with_labels) or
                            len(x) in [2, 3]):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                # A lone graph structure gets empty node labels
                node_labels = {} if len(x) == 1 else x[1]
                graph = Graph(x[0], node_labels, {}, self._graph_format)
            elif type(x) is Graph:
                graph = x
            else:
                raise TypeError('each element of X must have at least' +
                                ' one and at most 3 elements\n')
            spm_data = graph.build_shortest_path_matrix(
                self.algorithm_type, labels=self._lt)
            i += 1

            S, L = self._decompose_input(spm_data)
            counts = sp_counts[i] = dict()
            for u in range(S.shape[0]):
                for v in range(S.shape[1]):
                    if u == v or S[u, v] == float("Inf"):
                        continue
                    label = self._lhash(S, u, v, *L)
                    # ``label_idx`` is the feature index of the tuple; it is
                    # deliberately kept apart from the enumeration index
                    # ``idx`` of the outer loop.
                    if label not in self._enum:
                        if self._method_calling == 1:
                            label_idx = len(self._enum)
                            self._enum[label] = label_idx
                        elif self._method_calling == 3:
                            # Tuples unseen at fit are indexed after the
                            # ones stored in ``self._enum``
                            if label not in self._Y_enum:
                                label_idx = (len(self._enum) +
                                             len(self._Y_enum))
                                self._Y_enum[label] = label_idx
                            else:
                                label_idx = self._Y_enum[label]
                    else:
                        label_idx = self._enum[label]
                    counts[label_idx] = counts.get(label_idx, 0) + 1

        if i == -1:
            raise ValueError('parsed input is empty')

        if self._method_calling == 1:
            self._nx = i+1
        elif self._method_calling == 3:
            self._ny = i+1
        return sp_counts
Beispiel #12
0
def read_data(name,
              with_classes=True,
              prefer_attr_nodes=False,
              prefer_attr_edges=False,
              produce_labels_nodes=False,
              as_graphs=False,
              is_symmetric=symmetric_dataset):
    """Create a dataset iterable for GraphKernel.

    The dataset is read from the text files located under ``./<name>/``:
    ``<name>_graph_indicator.txt`` and ``<name>_A.txt`` are always read,
    while the node/edge label and attribute files and
    ``<name>_graph_labels.txt`` are read depending on the arguments.

    Parameters
    ----------
    name : str
        The dataset name. It is also used as a key of `dataset_metadata`.

    with_classes : bool, default=True
        Also read the graph classes and return them as ``target``.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes
        set as labels for the graph object for *nodes* the attributes.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes
        set as labels for the graph object for *edge* the attributes.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found. Each node is labeled with
        the number of edges leaving it (self loops excluded).
        This operation is applied only if node labels are non existent.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    is_symmetric : bool, default=symmetric_dataset
        Defines if the graph data describe a symmetric graph. When true,
        every edge (and its label) is also stored in the reverse direction.

    Returns
    -------
    bunch : Bunch
        ``bunch.data`` is a list with one entry per graph: a `Graph` object
        if `as_graphs` is set, otherwise a list
        ``[edges, node_labels, edge_labels]``.
        ``bunch.target`` (only when `with_classes` is set) is a one
        dimensional integer np.array of graph classes aligned with
        ``bunch.data``. Useful for classification.

    """
    # Every file of the dataset shares the "./<name>/<name>" prefix.
    prefix = "./" + str(name) + "/" + str(name)
    indicator_path = prefix + "_graph_indicator.txt"
    edges_path = prefix + "_A.txt"
    node_labels_path = prefix + "_node_labels.txt"
    node_attributes_path = prefix + "_node_attributes.txt"
    edge_labels_path = prefix + "_edge_labels.txt"
    edge_attributes_path = prefix + "_edge_attributes.txt"
    graph_classes_path = prefix + "_graph_labels.txt"

    # node graph correspondence
    ngc = dict()
    # edge line correspondence
    elc = dict()
    # dictionary that keeps sets of edges
    Graphs = dict()
    # dictionary of labels for nodes
    node_labels = dict()
    # dictionary of labels for edges
    edge_labels = dict()

    def parse_floats(line):
        # `strip()` instead of `line[:-1]`: the latter eats a digit when the
        # last line of the file has no trailing newline.
        return [float(num) for num in line.strip().replace(' ', '').split(",")]

    # Associate graphs nodes with indexes
    with open(indicator_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            graph_id = int(line.strip())
            ngc[i] = graph_id
            Graphs.setdefault(graph_id, set())
            node_labels.setdefault(graph_id, dict())
            edge_labels.setdefault(graph_id, dict())

    # Extract graph edges
    with open(edges_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            edge = line.strip().replace(' ', '').split(",")
            src, dst = int(edge[0]), int(edge[1])
            elc[i] = (src, dst)
            Graphs[ngc[src]].add((src, dst))
            if is_symmetric:
                Graphs[ngc[dst]].add((dst, src))

    # Extract node attributes
    if (prefer_attr_nodes and dataset_metadata[name].get(
            "na", os.path.exists(node_attributes_path))):
        with open(node_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = parse_floats(line)
    # Extract node labels
    elif dataset_metadata[name].get("nl", os.path.exists(node_labels_path)):
        with open(node_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = int(line.strip())
    elif produce_labels_nodes:
        # Graph ids are expected to be exactly 1..len(Graphs).
        for i in range(1, len(Graphs) + 1):
            node_labels[i] = dict(Counter(s for (s, d) in Graphs[i] if s != d))

    # Extract edge attributes
    if (prefer_attr_edges and dataset_metadata[name].get(
            "ea", os.path.exists(edge_attributes_path))):
        with open(edge_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                attrs = parse_floats(line)
                edge_labels[ngc[elc[i][0]]][elc[i]] = attrs
                if is_symmetric:
                    edge_labels[ngc[elc[i][1]]][(elc[i][1], elc[i][0])] = attrs

    # Extract edge labels
    elif dataset_metadata[name].get("el", os.path.exists(edge_labels_path)):
        with open(edge_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                label = int(line.strip())
                edge_labels[ngc[elc[i][0]]][elc[i]] = label
                if is_symmetric:
                    edge_labels[ngc[elc[i][1]]][(elc[i][1], elc[i][0])] = label

    # Graph ids are expected to be exactly 1..len(Graphs).
    Gs = list()
    if as_graphs:
        for i in range(1, len(Graphs) + 1):
            Gs.append(Graph(Graphs[i], node_labels[i], edge_labels[i]))
    else:
        for i in range(1, len(Graphs) + 1):
            Gs.append([Graphs[i], node_labels[i], edge_labels[i]])

    if with_classes:
        classes = []
        with open(graph_classes_path, "r") as f:
            for line in f:
                classes.append(int(line.strip()))

        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy; the builtin yields the same dtype.
        classes = np.array(classes, dtype=int)
        return Bunch(data=Gs, target=classes)
    else:
        return Bunch(data=Gs)
Beispiel #13
0
    def parse_input(self, X):
        """Fast ML Graph Kernel.

        See supplementary material :cite:`kondor2016multiscale`, algorithm 1.

        When ``self._method_calling == 1`` the projection data of every level
        is computed and stored in ``self._data_level``; when it equals 3 the
        stored ``self._data_level`` is reused to project the new input.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be either a Graph object or an iterable with
            two or three elements: a valid graph structure (adjacency matrix
            or edge_dictionary) followed by the node_labels (a third element,
            if given, is ignored). Empty elements are skipped with a warning.
            The node labels must be iterables (feature vectors).

        Returns
        -------
        out : list
            A list with one tuple per parsed graph, holding the inverse of
            its S matrix and the sum of the logarithms of the (real parts of
            the) eigenvalues of S.

        Raises
        ------
        TypeError
            If `X` is not iterable, an element has an unsupported form or
            the node features cannot be cast to a numpy array.
        ValueError
            If no graph was parsed from `X`.

        """
        # `collections.Iterable` was removed in Python 3.10; the abstract
        # base class lives in `collections.abc`.
        from collections.abc import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            ng = 0
            out = list()
            data = dict()
            neighborhoods = dict()
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                if isinstance(x, Iterable):
                    is_iter, x = True, list(x)
                if is_iter and len(x) in [0, 2, 3]:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element ' +
                                      'on index: ' + str(idx))
                        continue
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is Graph:
                    x.desired_format(self._graph_format)
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph or an iterable with at least 1 '
                                    'and at most 3 elements\n')
                phi_d = x.get_labels()
                A = x.get_adjacency_matrix()
                try:
                    # One feature row per vertex, ordered by vertex index.
                    phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
                except TypeError:
                    raise TypeError('Features must be iterable and castable '
                                    'in total to a numpy array.')

                Lap = laplacian(A).astype(float)
                _increment_diagonal_(Lap, self.heta)
                # 0: adjacency matrix, 1: vertex features (re-projected at
                # every level below), 2: inverse of the regularised Laplacian.
                data[ng] = {0: A, 1: phi, 2: inv(Lap)}
                neighborhoods[ng] = x
                ng += 1

            if ng == 0:
                raise ValueError('parsed input is empty')

            # Define a function for calculating the S's of subgraphs of each iteration
            def calculate_C(k, j, l):
                # Neighborhoods are produced lazily, the first time a graph
                # is needed, and replace the Graph object in the cache.
                if type(neighborhoods[k]) is Graph:
                    neighborhoods[k] = neighborhoods[k].produce_neighborhoods(
                        r=self.L, sort_neighbors=False)

                indexes = neighborhoods[k][l][j]
                L = laplacian(data[k][0][indexes, :][:, indexes]).astype(float)
                _increment_diagonal_(L, self.heta)
                U = data[k][1][indexes, :]
                S = multi_dot((U.T, inv(L), U))
                _increment_diagonal_(S, self.gamma)

                return (inv(S), np.sum(np.log(np.real(eigvals(S)))))

            if self._method_calling == 1:
                # All (graph index, vertex index) pairs of the input.
                V = [(k, j) for k in range(ng)
                     for j in range(data[k][0].shape[0])]

                ns = min(len(V), self.n_samples)

                self.random_state_.shuffle(V)
                vs = V[:ns]
                phi_k = np.array([data[k][1][j, :] for (k, j) in vs])

                # w the eigen vectors, v the eigenvalues
                K = phi_k.dot(phi_k.T)

                # Calculate eigenvalues
                v, w = eig(K)
                v, w = np.real(v), np.real(w.T)

                # keep only the positive
                vpos = np.argpartition(v, -self.P)[-self.P:]
                vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                # ksi.shape = (k, Ns) * (Ns, P)
                ksi = w[vpos].dot(phi_k).T / np.sqrt(v[vpos])
                for j in range(ng):
                    # (n_samples, k) * (k, P)
                    data[j][1] = data[j][1].dot(ksi)
                self._data_level = {0: ksi}
                for l in range(1, self.L + 1):
                    # Take random samples from all the vertices of all graphs
                    self.random_state_.shuffle(V)
                    vs = V[:ns]

                    # Compute the reference subsampled Gram matrix
                    K_proj = {
                        k: np.zeros(shape=(data[k][0].shape[0], ns))
                        for k in range(ng)
                    }
                    K, C = np.zeros(shape=(len(vs), len(vs))), dict()
                    for (m, (k, j)) in enumerate(vs):
                        C[m] = calculate_C(k, j, l)
                        K_proj[k][j, m] = K[m, m] = self.pairwise_operation(
                            C[m], C[m])
                        # Fill the symmetric entries against the samples
                        # already processed (s < m).
                        for (s, (k2, j2)) in enumerate(vs):
                            if s < m:
                                K[s, m] = K[m, s] \
                                        = K_proj[k2][j2, m] \
                                        = K_proj[k][j, s] \
                                        = self.pairwise_operation(C[s], C[m])
                            else:
                                break

                    # Compute the kernels of the relations of the reference to everything else
                    for (k, j) in V[ns:]:
                        for (m, _) in enumerate(vs):
                            K_proj[k][j, m] = self.pairwise_operation(
                                C[m], calculate_C(k, j, l))

                    # w the eigen vectors, v the eigenvalues
                    v, w = eig(K)
                    v, w = np.real(v), np.real(w.T)

                    # keep only the positive
                    vpos = np.argpartition(v, -self.P)[-self.P:]
                    vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

                    # Q shape=(k, P)
                    Q = w[vpos].T / np.sqrt(v[vpos])
                    for j in range(ng):
                        # (n, ns) * (ns, P)
                        data[j][1] = K_proj[j].dot(Q)
                    # Stored so that `_method_calling == 3` can project new
                    # graphs against the same reference samples.
                    self._data_level[l] = (C, Q)

            elif self._method_calling == 3:
                ksi = self._data_level[0]
                for j in range(ng):
                    # (n, k) * (k, P)
                    data[j][1] = data[j][1].dot(ksi)

                for l in range(1, self.L + 1):
                    C, Q = self._data_level[l]
                    for j in range(ng):
                        K_proj = np.zeros(shape=(data[j][0].shape[0], len(C)))
                        for n in range(data[j][0].shape[0]):
                            for m in range(len(C)):
                                K_proj[n, m] = self.pairwise_operation(
                                    C[m], calculate_C(j, n, l))
                        data[j][1] = K_proj.dot(Q)

            # Apply the final calculation of S.
            for k in range(ng):
                S = multi_dot((data[k][1].T, data[k][2], data[k][1]))
                _increment_diagonal_(S, self.gamma)
                out.append((inv(S), np.sum(np.log(np.real(eigvals(S))))))

            return out
Beispiel #14
0
def read_data(name,
              path='',
              with_classes=True,
              prefer_attr_nodes=False,
              prefer_attr_edges=False,
              produce_labels_nodes=False,
              as_graphs=False,
              is_symmetric=symmetric_dataset,
              fopen=open):
    """Create a dataset iterable for GraphKernel.

    The dataset files are looked up as ``<name>/<name>_<component>.txt``
    either below a folder or inside an already opened zip archive.

    Parameters
    ----------
    name : str
        The dataset name. It is also used as a key of `dataset_metadata`.

    path : str or zipfile.ZipFile, default=''
        The folder that contains the ``<name>`` directory, or an open
        `zipfile.ZipFile` holding the dataset files as members.

    with_classes : bool, default=True
        Also read the graph classes and return them as ``target``.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes
        set as labels for the graph object for *nodes* the attributes.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes
        set as labels for the graph object for *edge* the attributes.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found. Each node is labeled with
        the number of edges leaving it (self loops excluded).
        This operation is applied only if node labels are non existent.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    is_symmetric : bool, default=symmetric_dataset
        Defines if the graph data describe a symmetric graph. When true,
        every edge (and its label) is also stored in the reverse direction.

    fopen : callable, default=open
        Called as ``fopen(file_path, "r")``; must return a context manager
        yielding an iterable of text lines. Ignored when `path` is a
        `zipfile.ZipFile` (archive members are then opened instead).

    Returns
    -------
    bunch : Bunch
        ``bunch.data`` is a list with one entry per graph: a `Graph` object
        if `as_graphs` is set, otherwise a list
        ``[edges, node_labels, edge_labels]``.
        ``bunch.target`` (only when `with_classes` is set) is a one
        dimensional integer np.array of graph classes aligned with
        ``bunch.data``. Useful for classification.

    """
    is_zip = isinstance(path, zipfile.ZipFile)
    if is_zip:
        zip_ref = path

        class ZipOpen:
            """Open a member of the archive as a text-mode context manager."""

            def __init__(self, *args, **kwargs):
                self.fid = zip_ref.open(*args, **kwargs)

            def __enter__(self):
                self.tio = TextIOWrapper(self.fid)
                return self.tio.__enter__()

            def __exit__(self, exc_type, exc_val, exc_tb):
                return self.tio.__exit__(exc_type, exc_val, exc_tb)

        fopen = ZipOpen
        folder = ''
    else:
        # Keep the caller supplied `fopen` (it used to be overwritten here).
        folder = path

    def get_component_path(cmp):
        file_name = f'{name}_{cmp}.txt'
        if is_zip:
            # Zip member names always use forward slashes, whatever the OS.
            return f'{name}/{file_name}'
        return os.path.join(folder, f'{name}', file_name)

    indicator_path = get_component_path("graph_indicator")
    edges_path = get_component_path('A')
    node_labels_path = get_component_path('node_labels')
    node_attributes_path = get_component_path("node_attributes")
    edge_labels_path = get_component_path("edge_labels")
    edge_attributes_path = get_component_path("edge_attributes")
    graph_classes_path = get_component_path("graph_labels")

    # node graph correspondence
    ngc = dict()
    # edge line correspondence
    elc = dict()
    # dictionary that keeps sets of edges
    Graphs = dict()
    # dictionary of labels for nodes
    node_labels = dict()
    # dictionary of labels for edges
    edge_labels = dict()

    def parse_floats(line):
        # `strip()` instead of `line[:-1]`: the latter eats a digit when the
        # last line of the file has no trailing newline.
        return [float(num) for num in line.strip().replace(' ', '').split(",")]

    # Associate graphs nodes with indexes
    with fopen(indicator_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            graph_id = int(line.strip())
            ngc[i] = graph_id
            Graphs.setdefault(graph_id, set())
            node_labels.setdefault(graph_id, dict())
            edge_labels.setdefault(graph_id, dict())

    # Extract graph edges
    with fopen(edges_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            edge = line.strip().replace(' ', '').split(",")
            src, dst = int(edge[0]), int(edge[1])
            elc[i] = (src, dst)
            Graphs[ngc[src]].add((src, dst))
            if is_symmetric:
                Graphs[ngc[dst]].add((dst, src))

    # The three node steps below are independent `if`s (not an `elif`
    # chain) so that `has_attrs` really acts as a fallback switch: when a
    # preferred file turns out to be missing, the next source is tried.
    # A missing archive member surfaces as KeyError from `ZipFile.open`.

    # Extract node attributes
    has_attrs = False
    if prefer_attr_nodes and dataset_metadata[name].get("na", False):
        try:
            with fopen(node_attributes_path, "r") as f:
                for (i, line) in enumerate(f, 1):
                    node_labels[ngc[i]][i] = parse_floats(line)
            has_attrs = True
        except KeyError:
            pass
    # Extract node labels
    if not has_attrs and dataset_metadata[name].get("nl", False):
        try:
            with fopen(node_labels_path, "r") as f:
                for (i, line) in enumerate(f, 1):
                    node_labels[ngc[i]][i] = int(line.strip())
            has_attrs = True
        except KeyError:
            pass
    if not has_attrs and produce_labels_nodes:
        # Graph ids are expected to be exactly 1..len(Graphs).
        for i in range(1, len(Graphs) + 1):
            node_labels[i] = dict(Counter(s for (s, d) in Graphs[i] if s != d))

    # Extract edge attributes
    has_attrs = False
    if prefer_attr_edges and dataset_metadata[name].get("ea", False):
        try:
            with fopen(edge_attributes_path, "r") as f:
                for (i, line) in enumerate(f, 1):
                    attrs = parse_floats(line)
                    edge_labels[ngc[elc[i][0]]][elc[i]] = attrs
                    if is_symmetric:
                        edge_labels[ngc[elc[i][1]]][(elc[i][1],
                                                     elc[i][0])] = attrs
            has_attrs = True
        except KeyError:
            pass
    # Extract edge labels (also the fallback when the attributes are missing)
    if not has_attrs and dataset_metadata[name].get("el", False):
        try:
            with fopen(edge_labels_path, "r") as f:
                for (i, line) in enumerate(f, 1):
                    label = int(line.strip())
                    edge_labels[ngc[elc[i][0]]][elc[i]] = label
                    if is_symmetric:
                        edge_labels[ngc[elc[i][1]]][(elc[i][1],
                                                     elc[i][0])] = label
            has_attrs = True
        except KeyError:
            pass

    # Graph ids are expected to be exactly 1..len(Graphs).
    Gs = list()
    if as_graphs:
        for i in range(1, len(Graphs) + 1):
            Gs.append(Graph(Graphs[i], node_labels[i], edge_labels[i]))
    else:
        for i in range(1, len(Graphs) + 1):
            Gs.append([Graphs[i], node_labels[i], edge_labels[i]])

    if with_classes:
        classes = []
        with fopen(graph_classes_path, "r") as f:
            for line in f:
                classes.append(int(line.strip()))

        # `np.int` was only an alias of the builtin `int` and has been
        # removed from NumPy; the builtin yields the same dtype.
        classes = np.array(classes, dtype=int)
        return Bunch(data=Gs, target=classes)
    else:
        return Bunch(data=Gs)
Beispiel #15
0
    def parse_input(self, X):
        """Parse input and create features, while initializing and/or calculating sub-kernels.

        Every graph is decomposed by core number; for each core level above
        ``self.min_core`` the induced subgraphs are handed to a fresh base
        graph kernel (or, when ``self._method_calling == 3``, to the kernel
        stored in ``self.X`` for that level).

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be either a Graph object or an iterable whose
            first element is a valid graph structure (adjacency matrix or
            edge_dictionary), optionally followed by node_labels and
            edge_labels. Any element after the third is kept as an "extra"
            and forwarded to the base kernel together with each subgraph.
            Empty elements are skipped with a warning.

        Returns
        -------
        base_graph_kernel : dict
            The fitted base kernels indexed by core level. Returned alone
            when ``self._method_calling == 1`` and as second element of a
            tuple ``(K, base_graph_kernel)`` when it equals 2.

        K : np.array
            The kernel matrix. Returned when ``self._method_calling`` is
            2 (first element of the tuple) or 3 (alone).

        Raises
        ------
        TypeError
            If `X` is not iterable or an element has an unsupported form.
        ValueError
            If no graph was parsed from `X` or the maximum core number does
            not exceed ``self.min_core``.

        """
        # `collections.Iterable` was removed in Python 3.10; the abstract
        # base class lives in `collections.abc`.
        from collections.abc import Iterable

        # Input validation and parsing
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            nx, max_core_number, core_numbers, graphs = 0, 0, [], []
            for (idx, x) in enumerate(iter(X)):
                is_iter = False
                extra = tuple()
                if isinstance(x, Iterable):
                    x, is_iter = list(x), True
                if is_iter:
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    elif len(x) == 1:
                        x = Graph(x[0], {}, {}, graph_format="adjacency")
                    elif len(x) == 2:
                        x = Graph(x[0], x[1], {}, graph_format="adjacency")
                    elif len(x) >= 3:
                        if len(x) > 3:
                            extra += tuple(x[3:])
                        x = Graph(x[0], x[1], x[2], graph_format="adjacency")
                elif type(x) is Graph:
                    x.desired_format("adjacency")
                    x = Graph(
                        x.get_adjacency_matrix(),
                        x.get_labels(purpose="adjacency",
                                     label_type="vertex",
                                     return_none=True),
                        x.get_labels(purpose="adjacency",
                                     label_type="edge",
                                     return_none=True))
                else:
                    raise TypeError('each element of X must be either a '
                                    'graph object or a list with at least '
                                    'a graph like object and node labels '
                                    'dict \n')
                # workaround for leaving a sparse representation for x
                x.change_format(self._graph_format)
                c = core_number(x)
                max_core_number = max(max_core_number, max(c.values()))
                core_numbers.append(c)
                graphs.append((x, extra))

                nx += 1
            if nx == 0:
                raise ValueError('parsed input is empty')

        if max_core_number <= self.min_core:
            raise ValueError(
                'The maximum core equals the min_core boundary set in init.')

        # Add the zero iteration element
        if self._method_calling == 2:
            K = np.zeros(shape=(nx, nx))
        elif self._method_calling == 3:
            self._dummy_kernel = dict()
            K = np.zeros(shape=(nx, self._nx))

        # Main
        base_graph_kernel, indexes_list = dict(), dict()
        for i in range(max_core_number, self.min_core, -1):
            subgraphs, indexes = list(), list()
            for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers,
                                                         graphs)):
                vertices = [k for k, v in iteritems(cn) if v >= i]
                if len(vertices) > 0:
                    # Calculate subgraph and store the index of the non-empty vertices
                    sg = g.get_subgraph(vertices)
                    sub_extra = list()
                    indexes.append(idx)
                    if len(extra) > 0:
                        vs = np.array(sg.get_vertices(purpose='any'))
                        for e in extra:
                            # This case will only be reached by now if the user add the propagation
                            # kernel as subkernel with a custom propagation matrix. This is a workaround!
                            # `np.array` is a function, not a type, so the
                            # former `type(e) is np.array` test never matched.
                            # NOTE(review): assumes the vertices reported by
                            # the subgraph index into the rows/columns of the
                            # full-graph matrix `e` - confirm.
                            if isinstance(e, np.ndarray) and len(e.shape) == 2:
                                e = e[vs, :][:, vs]
                            sub_extra.append(e)
                        subgraphs.append((sg, ) + tuple(sub_extra))
                    else:
                        subgraphs.append(sg)
            indexes = np.array(indexes)
            indexes_list[i] = indexes

            # calculate kernel
            if self._method_calling == 1 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                base_graph_kernel[i].fit(subgraphs)
            elif self._method_calling == 2 and indexes.shape[0] > 0:
                base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
                ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs)
                # Scatter the sub-kernel values back to the positions of the
                # graphs that have a non-empty core at this level.
                for j in range(indexes.shape[0]):
                    K[indexes[j], indexes] += ft_subgraph_mat[j, :]
            elif self._method_calling == 3:
                if self._max_core_number < i or self._fit_indexes[i].shape[
                        0] == 0:
                    if len(indexes) > 0:
                        # add a dummy kernel for calculating the diagonal
                        self._dummy_kernel[i] = self.base_graph_kernel_(
                            **self.params_)
                        self._dummy_kernel[i].fit(subgraphs)
                else:
                    if indexes.shape[0] > 0:
                        subgraph_tmat = self.X[i].transform(subgraphs)
                        for j in range(indexes.shape[0]):
                            K[indexes[j],
                              self._fit_indexes[i]] += subgraph_tmat[j, :]

        if self._method_calling == 1:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return base_graph_kernel
        elif self._method_calling == 2:
            self._nx = nx
            self._max_core_number = max_core_number
            self._fit_indexes = indexes_list
            return K, base_graph_kernel
        elif self._method_calling == 3:
            self._t_nx = nx
            self._max_core_number_trans = max_core_number
            self._transform_indexes = indexes_list
            return K
Beispiel #16
0
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        The labels of the input graphs are refined for ``self._n_iter``
        iterations using the label dictionaries stored in
        ``self._inv_labels``; labels not seen while fitting receive fresh
        indexes. The kernel matrices produced by the base kernels of
        ``self.X`` for every iteration are summed.

        Parameters
        ----------
        X : iterable
            Each element must be either a Graph object or an iterable with
            two or three elements: a valid graph structure (adjacency matrix
            or edge_dictionary) followed by the node_labels (a third
            element, if given, is ignored). Empty elements are skipped with
            a warning. Cannot be None.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        Raises
        ------
        ValueError
            If `X` is None, is not iterable, contains an element of an
            unsupported form or yields no graph.

        """
        # `collections.Iterable` was removed in Python 3.10; the abstract
        # base class lives in `collections.abc`.
        from collections.abc import Iterable

        self._method_calling = 3
        # Check is fit had been called
        check_is_fitted(self, ['X', '_nx', '_inv_labels'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            if not isinstance(X, Iterable):
                raise ValueError('input must be an iterable\n')
            else:
                nx = 0
                distinct_values = set()
                Gs_ed, L = dict(), dict()
                for (i, x) in enumerate(iter(X)):
                    is_iter = isinstance(x, Iterable)
                    if is_iter:
                        x = list(x)
                    if is_iter and len(x) in [0, 2, 3]:
                        if len(x) == 0:
                            warnings.warn('Ignoring empty element on index: ' +
                                          str(i))
                            continue

                        elif len(x) in [2, 3]:
                            x = Graph(x[0], x[1], {}, self._graph_format)
                    elif type(x) is Graph:
                        x.desired_format("dictionary")
                    else:
                        raise ValueError('each element of X must have at ' +
                                         'least one and at most 3 elements\n')
                    Gs_ed[nx] = x.get_edge_dictionary()
                    L[nx] = x.get_labels(purpose="dictionary")

                    # Hold all the distinct values
                    distinct_values |= set(v for v in itervalues(L[nx])
                                           if v not in self._inv_labels[0])
                    nx += 1
                if nx == 0:
                    raise ValueError('parsed input is empty')

        # Labels unseen during fit are numbered after the fitted ones.
        nl = len(self._inv_labels[0])
        WL_labels_inverse = {
            dv: idx
            for (idx, dv) in enumerate(sorted(distinct_values), nl)
        }

        def generate_graphs(WL_labels_inverse, nl):
            # calculate the kernel matrix for the 0 iteration
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L[j]):
                    if v in self._inv_labels[0]:
                        new_labels[k] = self._inv_labels[0][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels
                # produce the new graphs
                new_graphs.append([Gs_ed[j], new_labels])
            yield new_graphs

            for i in range(1, self._n_iter):
                new_graphs = list()
                L_temp, label_set = dict(), set()
                nl += len(self._inv_labels[i])
                for j in range(nx):
                    # Find unique labels and sort them for both graphs
                    # Keep for each node the temporary
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        # Own label followed by the sorted neighbor labels.
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        if credential not in self._inv_labels[i]:
                            label_set.add(credential)

                # Calculate the new label_set
                WL_labels_inverse = dict()
                if len(label_set) > 0:
                    for dv in sorted(label_set):
                        idx = len(WL_labels_inverse) + nl
                        WL_labels_inverse[dv] = idx

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = dict()
                    for (k, v) in iteritems(L_temp[j]):
                        if v in self._inv_labels[i]:
                            new_labels[k] = self._inv_labels[i][v]
                        else:
                            new_labels[k] = WL_labels_inverse[v]
                    L[j] = new_labels
                    # Create the new graphs with the new labels.
                    new_graphs.append([Gs_ed[j], new_labels])
                yield new_graphs

        if self._parallel is None:
            # Calculate the kernel matrix without parallelization.
            # The per-iteration matrices are collected in a list first:
            # passing a generator to np.sum is deprecated and ignores `axis`.
            K = np.sum(
                [self.X[i].transform(g)
                 for (i, g) in enumerate(
                     generate_graphs(WL_labels_inverse, nl))],
                axis=0)

        else:
            # Calculate the kernel matrix with parallelization
            K = np.sum(self._parallel(
                joblib.delayed(etransform)(self.X[i], g)
                for (i,
                     g) in enumerate(generate_graphs(WL_labels_inverse, nl))),
                       axis=0)

        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            # Division by zero is silenced only for this expression; the
            # context manager restores the settings even if it raises.
            with np.errstate(divide='ignore'):
                K = np.nan_to_num(
                    np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))

        return K
Beispiel #17
0
    def parse_input(self, X):
        """Parse the input and compute the lovasz_theta kernel features.

        Parameters
        ----------
        X : iterable
            Each element must be either a Graph object or an iterable with
            one to three entries. Only the first entry (a valid graph
            structure, i.e. an adjacency matrix or an edge dictionary) is
            used; any node or edge labels that follow are ignored. Empty
            elements are skipped with a warning.

        Returns
        -------
        out : list
            The lovasz metrics for the given input, one entry per parsed
            graph.

        Raises
        ------
        TypeError
            If X is not an iterable or one of its elements is neither a
            Graph nor an iterable of at most three entries.
        ValueError
            If no graph could be parsed, or if the biggest parsed graph does
            not fit the dimension stored in ``self.d_``.

        """
        # `collections.Iterable` was removed in Python 3.10; the ABCs live in
        # `collections.abc` (Python 2 only offers the old location).
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        adjm = list()
        max_dim = 0
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, Iterable):
                x, is_iter = list(x), True
            if is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element ' +
                                  'on index: ' + str(idx))
                    continue
                # Only the graph structure is needed, labels are dropped.
                x = Graph(x[0], {}, {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 1 ' +
                                'and at most 3 elements\n')
            A = x.get_adjacency_matrix()
            adjm.append(A)
            max_dim = max(max_dim, A.shape[0])

        # Reject an empty input before any fitted state (self.d_) is touched.
        if not adjm:
            raise ValueError('parsed input is empty')

        # On fit, derive the dimension from the data unless it was given.
        if self._method_calling == 1 and self.d_ is None:
            self.d_ = max_dim + 1

        if self.d_ < max_dim + 1:
            if self.max_dim is None and self._method_calling == 3:
                raise ValueError(
                    'Maximum dimension of a graph in transform is bigger '
                    'than the one found in fit. To avoid that use max_dim parameter.'
                )
            else:
                raise ValueError('max_dim should correspond to the '
                                 'biggest graph inside the dataset')

        out = list()
        for A in adjm:
            # Distinct local name: the original rebound the parameter X here.
            embeddings, t = _calculate_lovasz_embeddings_(A)
            U = _calculate_lovasz_labelling_(embeddings, t, self.d_)
            out.append(self._calculate_MEC_(U))

        return out
Beispiel #18
0
    def parse_input(self, X):
        """Parse input for weisfeiler lehman.

        Parameters
        ----------
        X : iterable
            Each element must be either a Graph object or an iterable with
            at least two entries: a valid graph structure (adjacency matrix
            or edge_dictionary) and its node_labels. An optional third entry
            holds the edge_labels (that correspond to the given graph
            format); any further entries are forwarded untouched to the base
            kernel. Empty elements are skipped with a warning.

        Returns
        -------
        base_kernel : dict
            Maps each iteration index to its base kernel object. When called
            from fit-transform (``self._method_calling == 2``) the tuple
            ``(K, base_kernel)`` is returned instead, where K is the sum of
            the base-kernel matrices over all iterations.

        Raises
        ------
        ValueError
            If the method is called outside fit / fit-transform, or if no
            graph could be parsed.
        TypeError
            If X is not an iterable or one of its elements is invalid.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # `collections.Iterable` was removed in Python 3.10; the ABCs live in
        # `collections.abc` (Python 2 only offers the old location).
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        # Input validation and parsing
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        nx = 0
        Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: ' +
                                  str(idx))
                    continue
                elif len(x) > 2:
                    # Entries past the third one are carried along as-is.
                    extra = tuple(x[3:])
                    x = Graph(x[0],
                              x[1],
                              x[2],
                              graph_format=self._graph_format)
                    extra = (x.get_labels(purpose=self._graph_format,
                                          label_type="edge",
                                          return_none=True), ) + extra
                else:
                    x = Graph(x[0],
                              x[1], {},
                              graph_format=self._graph_format)
                    extra = tuple()

            elif type(x) is Graph:
                x.desired_format(self._graph_format)
                el = x.get_labels(purpose=self._graph_format,
                                  label_type="edge",
                                  return_none=True)
                extra = tuple() if el is None else (el, )

            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node labels ' +
                                'dict \n')
            Gs_ed[nx] = x.get_edge_dictionary()
            L[nx] = x.get_labels(purpose="dictionary")
            extras[nx] = extra
            distinct_values |= set(itervalues(L[nx]))
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # Assign a consecutive number to each distinct original label.
        WL_labels_inverse = dict()
        label_count = 0
        for dv in sorted(distinct_values):
            WL_labels_inverse[dv] = label_count
            label_count += 1

        # Initialize an inverse dictionary of labels for all iterations
        self._inv_labels = dict()
        self._inv_labels[0] = WL_labels_inverse

        def generate_graphs(label_count, WL_labels_inverse):
            """Yield the relabeled graphs of every iteration.

            Mutates the enclosing ``L`` and fills ``self._inv_labels`` as a
            side effect, so it must be consumed exactly once.
            """
            # Iteration 0: replace the original labels by their indexes.
            new_graphs = list()
            for j in range(nx):
                new_labels = {k: WL_labels_inverse[L[j][k]] for k in L[j]}
                L[j] = new_labels
                # add new labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            yield new_graphs

            for i in range(1, self._n_iter):
                label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
                for j in range(nx):
                    # Build for each node a temporary "credential" string:
                    # its own label followed by the sorted neighbor labels.
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        label_set.add(credential)

                # Compress the credentials to fresh consecutive indexes.
                for dv in sorted(label_set):
                    WL_labels_inverse[dv] = label_count
                    label_count += 1

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = {k: WL_labels_inverse[L_temp[j][k]]
                                  for k in L_temp[j]}
                    L[j] = new_labels
                    # relabel
                    new_graphs.append((Gs_ed[j], new_labels) + extras[j])
                self._inv_labels[i] = WL_labels_inverse
                yield new_graphs

        base_kernel = {
            i: self._base_kernel(**self._params)
            for i in range(self._n_iter)
        }
        if self._parallel is None:
            if self._method_calling == 1:
                for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse)):
                    base_kernel[i].fit(g)
            elif self._method_calling == 2:
                K = np.sum(
                    (base_kernel[i].fit_transform(g) for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse))),
                    axis=0)

        else:
            # NOTE(review): with a process-based joblib backend the kernels
            # fitted inside the workers may not be reflected in
            # `base_kernel` -- confirm against `efit` / `efit_transform`.
            if self._method_calling == 1:
                self._parallel(
                    joblib.delayed(efit)(base_kernel[i], g)
                    for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse)))
            elif self._method_calling == 2:
                K = np.sum(self._parallel(
                    joblib.delayed(efit_transform)(base_kernel[i], g)
                    for (i, g) in enumerate(
                        generate_graphs(label_count, WL_labels_inverse))),
                           axis=0)

        if self._method_calling == 1:
            return base_kernel
        elif self._method_calling == 2:
            return K, base_kernel
Beispiel #19
0
    def parse_input(self, X):
        """Parse input for weisfeiler lehman.

        Parameters
        ----------
        X : iterable
            Each element must be either a Graph object or an iterable with
            at least two entries: a valid graph structure (adjacency matrix
            or edge_dictionary) and its node_labels. An optional third entry
            holds the edge_labels (that correspond to the given graph
            format); any further entries are forwarded untouched to the base
            graph kernel. Empty elements are skipped with a warning.

        Returns
        -------
        base_graph_kernel : dict
            Maps each iteration index to its base graph kernel object. When
            called from fit-transform (``self._method_calling == 2``) the
            tuple ``(K, base_graph_kernel)`` is returned instead, where K is
            the sum of the base-kernel matrices over all iterations.

        Raises
        ------
        ValueError
            If the method is called outside fit / fit-transform, or if no
            graph could be parsed.
        TypeError
            If X is not an iterable or one of its elements is invalid.

        """
        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # `collections.Iterable` was removed in Python 3.10; the ABCs live in
        # `collections.abc` (Python 2 only offers the old location).
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        # Input validation and parsing
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        nx = 0
        Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, Iterable)
            if is_iter:
                x = list(x)
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: ' +
                                  str(idx))
                    continue
                elif len(x) > 2:
                    # Entries past the third one are carried along as-is.
                    extra = tuple(x[3:])
                    x = Graph(x[0],
                              x[1],
                              x[2],
                              graph_format=self._graph_format)
                    extra = (x.get_labels(purpose=self._graph_format,
                                          label_type="edge",
                                          return_none=True), ) + extra
                else:
                    x = Graph(x[0],
                              x[1], {},
                              graph_format=self._graph_format)
                    extra = tuple()

            elif type(x) is Graph:
                x.desired_format(self._graph_format)
                el = x.get_labels(purpose=self._graph_format,
                                  label_type="edge",
                                  return_none=True)
                extra = tuple() if el is None else (el, )

            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node labels ' +
                                'dict \n')
            Gs_ed[nx] = x.get_edge_dictionary()
            L[nx] = x.get_labels(purpose="dictionary")
            extras[nx] = extra
            distinct_values |= set(itervalues(L[nx]))
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # Assign a consecutive number to each distinct original label.
        WL_labels_inverse = OrderedDict()
        label_count = 0
        for dv in sorted(distinct_values):
            WL_labels_inverse[dv] = label_count
            label_count += 1

        # Inverse dictionary of labels for all iterations, each one expressed
        # in terms of the labels of the *previous* iteration.
        self._inv_labels = OrderedDict()
        self._inv_labels[0] = deepcopy(WL_labels_inverse)
        # Update the zeroth iteration feature dim
        self.feature_dims.append(len(WL_labels_inverse))

        def generate_graphs(label_count, WL_labels_inverse):
            """Yield the relabeled graphs of every iteration.

            Mutates the enclosing ``L`` and fills ``self._inv_labels`` as a
            side effect, so it must be consumed exactly once.
            """
            # Iteration 0: replace the original labels by their indexes.
            new_graphs = list()
            for j in range(nx):
                new_labels = {k: WL_labels_inverse[L[j][k]] for k in L[j]}
                L[j] = new_labels
                # add new labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            yield new_graphs

            for i in range(1, self._h):
                label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
                for j in range(nx):
                    # Build for each node a temporary "credential" string:
                    # its own label followed by the sorted neighbor labels.
                    L_temp[j] = dict()
                    for v in Gs_ed[j].keys():
                        credential = str(L[j][v]) + "," + \
                            str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                        L_temp[j][v] = credential
                        label_set.add(credential)

                # Compress the credentials to fresh consecutive indexes.
                for dv in sorted(label_set):
                    WL_labels_inverse[dv] = label_count
                    label_count += 1

                # Recalculate labels
                new_graphs = list()
                for j in range(nx):
                    new_labels = {k: WL_labels_inverse[L_temp[j][k]]
                                  for k in L_temp[j]}
                    L[j] = new_labels
                    # relabel
                    new_graphs.append((Gs_ed[j], new_labels) + extras[j])
                self._inv_labels[i] = WL_labels_inverse
                yield new_graphs

        # One base graph kernel per iteration, created lazily as the
        # relabeled graphs are generated.
        base_graph_kernel = {}

        K = []
        for (i, g) in enumerate(generate_graphs(label_count,
                                                WL_labels_inverse)):
            base_graph_kernel[i] = self._base_graph_kernel(**self._params)
            if self._method_calling == 1:
                base_graph_kernel[i].fit(g)
            else:
                K.append(base_graph_kernel[i].fit_transform(g))

        if self._method_calling == 1:
            return base_graph_kernel
        elif self._method_calling == 2:
            return np.sum(K, axis=0), base_graph_kernel
Beispiel #20
0
    def parse_input(self, X):
        """Parse and create features for the NSPD kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a Graph object or an iterable with
            exactly three entries: a valid graph structure (adjacency matrix
            or edge_dictionary), the node_labels and the edge_labels (that
            correspond to the given graph format). Empty elements are
            skipped with a warning.

        Returns
        -------
        M : dict
            A dictionary keyed by ``(r, d)`` tuples (radius and distance),
            whose values are sparse matrices with one row per non-null
            input graph and one column per enumerated pair of neighborhood
            hash values (on transform the columns of the fitted keys come
            first, followed by the keys first seen in this input).

        Raises
        ------
        TypeError
            If X is not an iterable or one of its elements is invalid.
        ValueError
            If no graph could be parsed.

        """
        # `collections.Iterable` was removed in Python 3.10; the ABCs live in
        # `collections.abc` (Python 2 only offers the old location).
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        # Hold the number of graphs
        ng = 0

        # Holds all the data for combinations of r, d
        data = collections.defaultdict(dict)

        # Index all keys for combinations of r, d
        all_keys = collections.defaultdict(dict)

        fitting = self._method_calling == 1
        transforming = self._method_calling == 3
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, Iterable):
                is_iter, x = True, list(x)
            if is_iter and len(x) in [0, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element' +
                                  ' on index: ' + str(idx))
                    continue
                else:
                    g = Graph(x[0], x[1], x[2])
                    g.change_format("adjacency")
            elif type(x) is Graph:
                # Work on a fresh Graph built from the adjacency data of x.
                g = Graph(
                    x.get_adjacency_matrix(),
                    x.get_labels(purpose="adjacency", label_type="vertex"),
                    x.get_labels(purpose="adjacency", label_type="edge"))
            else:
                raise TypeError('each element of X must have either ' +
                                'a graph with labels for node and edge ' +
                                'or 3 elements consisting of a graph ' +
                                'type object, labels for vertices and ' +
                                'labels for edges.')

            # Bring to the desired format
            g.change_format(self._graph_format)

            # Take the vertices
            vertices = set(g.get_vertices(purpose=self._graph_format))

            # Extract the dictionary
            ed = g.get_edge_dictionary()

            # Convert edges to tuples
            edges = {(j, k) for j in ed.keys() for k in ed[j].keys()}

            # Extract labels for nodes
            Lv = g.get_labels(purpose=self._graph_format)
            # and for edges
            Le = g.get_labels(purpose=self._graph_format,
                              label_type="edge")

            # Produce all the neighborhoods and the distance pairs
            # up to the desired radius and maximum distance
            N, D, D_pair = g.produce_neighborhoods(self.r,
                                                   purpose="dictionary",
                                                   with_distances=True,
                                                   d=self.d)

            # Hash all the neighborhoods
            H = self._hash_neighborhoods(vertices, edges, Lv, Le, N,
                                         D_pair)

            if fitting or transforming:
                for dist in range(self.d + 1):
                    if dist not in D:
                        continue
                    for (A, B) in D[dist]:
                        for r in range(self.r + 1):
                            keys = all_keys[r, dist]
                            key = (H[r, A], H[r, B])
                            if fitting:
                                col = keys.get(key, None)
                                if col is None:
                                    col = len(keys)
                                    keys[key] = col
                            else:
                                # Prefer the column assigned on fit; unseen
                                # keys are appended after the fitted ones.
                                fit_keys = self._fit_keys[r, dist]
                                col = fit_keys.get(key, None)
                                if col is None:
                                    col = keys.get(key, None)
                                    if col is None:
                                        col = len(keys) + len(fit_keys)
                                        keys[key] = col
                            counts = data[r, dist]
                            counts[ng, col] = counts.get((ng, col), 0) + 1
            ng += 1
        if ng == 0:
            raise ValueError('parsed input is empty')

        if fitting:
            # A feature matrix for all levels
            M = dict()

            for (key, counts) in iteritems(data):
                if len(counts) == 0:
                    continue
                indexes, values = zip(*iteritems(counts))
                rows, cols = zip(*indexes)
                M[key] = csr_matrix((values, (rows, cols)),
                                    shape=(ng, len(all_keys[key])),
                                    dtype=np.int64)
            self._fit_keys = all_keys
            self._ngx = ng

        elif transforming:
            # A feature matrix for all levels
            M = dict()

            for (key, counts) in iteritems(data):
                if len(counts) == 0:
                    continue
                indexes, values = zip(*iteritems(counts))
                rows, cols = zip(*indexes)
                M[key] = csr_matrix(
                    (values, (rows, cols)),
                    shape=(ng,
                           len(all_keys[key]) + len(self._fit_keys[key])),
                    dtype=np.int64)

            self._ngy = ng

        return M
Beispiel #21
0
    def parse_input(self, X):
        """Parse and check the given input for the Graph Hopper kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a Graph object or an iterable with
            at least two entries: a valid graph structure (adjacency matrix
            or edge_dictionary) and the node_labels (that fit the given
            graph format). Further entries are ignored and empty elements
            are skipped with a warning.

        Returns
        -------
        out : list
            One tuple per parsed graph: ``(M, attributes)`` or, when
            ``self.calculate_norm_`` is set,
            ``(M, attributes, squared_norms)``. M is an array of shape
            ``(n_nodes, max_diam, max_diam)`` and attributes holds the node
            attributes ordered by node index.

        Raises
        ------
        TypeError
            If X is not an iterable, one of its elements is invalid or the
            attributes of a graph cannot be stacked in one array.
        ValueError
            If no graph could be parsed.

        """
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        ni = 0
        diam = list()
        graphs = list()
        for (i, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, Iterable):
                is_iter = True
                x = list(x)

            if type(x) is Graph:
                g = Graph(x.get_adjacency_matrix(),
                          x.get_labels(purpose="adjacency"), {},
                          self._graph_format)
            # The parentheses matter: without them `len(x)` was evaluated
            # for non-iterable elements as well.
            elif is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warn('Ignoring empty element on index: ' + str(i))
                    continue
                else:
                    g = Graph(x[0], x[1], {}, "adjacency")
                    g.change_format(self._graph_format)
            else:
                raise TypeError('each element of X must be either a '
                                'graph object or a list with at least '
                                'a graph like object and node, ')

            spm, attr = g.build_shortest_path_matrix(labels="vertex")
            nv = g.nv()
            try:
                attributes = np.array([attr[j] for j in range(nv)])
            except TypeError:
                raise TypeError(
                    'All attributes of a single graph should have the same dimension.'
                )
            # Longest finite shortest-path length of the graph.
            diam.append(int(np.max(spm[spm < float("Inf")])))
            graphs.append((g.get_adjacency_matrix(), nv, attributes))
            ni += 1

        # Without this check `max(diam)` below fails with an unrelated message.
        if ni == 0:
            raise ValueError('parsed input is empty')

        if self._method_calling == 1:
            max_diam = self._max_diam = max(diam) + 1
        else:
            max_diam = max(self._max_diam, max(diam) + 1)

        out = list()
        for i in range(ni):
            AM, node_nr, attributes = graphs[i]
            des = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)
            occ = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)

            # Convert adjacency matrix to dictionary
            idx_i, idx_j = np.where(AM > 0)
            ed = defaultdict(dict)
            for (a, b) in filterfalse(lambda a: a[0] == a[1],
                                      zip(idx_i, idx_j)):
                ed[a][b] = AM[a, b]

            for j in range(node_nr):
                A = np.zeros(shape=AM.shape)

                # Single-source shortest path from node j
                D, p = dijkstra(ed, j)

                D = np.array(
                    list(D.get(k, float("Inf")) for k in range(node_nr)))
                # Mark the root: -1 maps to slot 0 of the converter below.
                p[j] = -1

                # Restrict to the connected component of node j
                conn_comp = np.where(D < float("Inf"))[0]

                # To-be DAG adjacency matrix of connected component of node j
                A_cc = A[conn_comp, :][:, conn_comp]

                # Adjacency matrix of connected component of node j
                AM_cc = AM[conn_comp, :][:, conn_comp]
                D_cc = D[conn_comp]
                # Maps (original node index + 1) to the index inside the
                # connected component.
                conn_comp_converter = np.zeros(shape=(A.shape[0], 1),
                                               dtype=int)
                for k in range(conn_comp.shape[0]):
                    conn_comp_converter[conn_comp[k]] = k
                conn_comp_converter = np.vstack([0, conn_comp_converter])
                p_cc = conn_comp_converter[
                    np.array(list(p[k] for k in conn_comp)) + 1]

                # Number of nodes in connected component of node j
                conncomp_node_nr = A_cc.shape[0]
                for v in range(conncomp_node_nr):
                    if p_cc[v] > 0:
                        # Generate A_cc by adding directed edges of form (parent(v), v)
                        A_cc[p_cc[v], v] = 1

                    # Distance from v to j
                    v_dist = D_cc[v]

                    # All neighbors of v in the undirected graph
                    v_nbs = np.where(AM_cc[v, :] > 0)[0]

                    # Distances of neighbors of v to j
                    v_nbs_dists = D_cc[v_nbs]

                    # All neighbors of v in undirected graph who are
                    # one step closer to j than v is; i.e. SP-DAG parents
                    v_parents = v_nbs[v_nbs_dists == (v_dist - 1)]

                    # Add SP-DAG parents to A_cc
                    A_cc[v_parents, v] = 1

                # Computes the descendants & occurence vectors o_j(v), d_j(v)
                # for all v in the connected component
                occ_p, des_p = od_vectors_dag(A_cc, D_cc)

                if des_p.shape[0] == 1 and j == 0:
                    des[j, 0, 0] = des_p
                    occ[j, 0, 0] = occ_p
                else:
                    # Convert back to the indices of the original graph
                    for v in range(des_p.shape[0]):
                        for l in range(des_p.shape[1]):
                            des[j, conn_comp[v], l] = des_p[v, l]
                    # Convert back to the indices of the original graph
                    for v in range(occ_p.shape[0]):
                        for l in range(occ_p.shape[1]):
                            occ[j, conn_comp[v], l] = occ_p[v, l]

            M = np.zeros(shape=(node_nr, max_diam, max_diam))
            # j loops through choices of root
            for j in range(node_nr):
                # Plain 2-D slices: np.squeeze would drop an axis whenever
                # node_nr or max_diam equals 1 and break the indexing below.
                des_mat_j_root = des[j, :, :]
                occ_mat_j_root = occ[j, :, :]
                # v loops through nodes
                for v in range(node_nr):
                    for a in range(max_diam):
                        for b in range(a, max_diam):
                            # M[v,:,:] is M[v]; a = node coordinate in path, b = path length
                            M[v, a,
                              b] += des_mat_j_root[v, b -
                                                   a] * occ_mat_j_root[v, a]

            if self.calculate_norm_:
                out.append((M, attributes, np.sum(attributes**2, axis=1)))
            else:
                out.append((M, attributes))
        return out
Beispiel #22
0
    def parse_input(self, X):
        """Parse input, build iterated vertex codes and fit/apply the base kernels.

        Every distinct vertex label met in the input is enumerated and encoded
        as the corresponding row of a Hadamard matrix. In each of the
        following ``self.n_iter - 1`` iterations the code of a vertex is
        replaced by the sum of its own code and the codes of the vertices
        listed for it in the graph's edge dictionary. The relabeled graphs of
        iteration ``i`` are passed to the ``i``-th base kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable of
            at least two entries: a valid graph structure (adjacency matrix or
            edge dictionary) and the node labels. An optional third entry
            holds the edge labels; any further entries are forwarded
            unchanged to the base kernel. Empty elements are ignored with a
            warning.

        Returns
        -------
        base_graph_kernel : dict
            Maps the iteration index to its base kernel object. Returned
            alone when ``self._method_calling == 1`` (the kernels are only
            fitted) and as second item of a tuple ``(K, base_graph_kernel)``
            when ``self._method_calling == 2`` (``fit_transform``).

        K : np.array
            The sum over all iterations of the base kernel matrices. Returned
            when ``self._method_calling`` is 2 or 3 (``transform``).

        Raises
        ------
        ValueError
            If no base graph kernel was provided or nothing could be parsed.
        TypeError
            If ``X`` or one of its elements has an unsupported type.

        """
        if self.base_graph_kernel_ is None:
            raise ValueError('User must provide a base_graph_kernel')
        # Input validation and parsing
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')

        nx, labels = 0, list()
        if self._method_calling in [1, 2]:
            # Fresh label enumeration and one new base kernel per iteration.
            nl, labels_enum = 0, dict()
            base_graph_kernel = {
                kidx: self.base_graph_kernel_[0](**self.base_graph_kernel_[1])
                for kidx in range(self.n_iter)
            }
        elif self._method_calling == 3:
            # Extend a copy of the enumeration found on fit; reuse the
            # already fitted kernels stored in self.X.
            nl = len(self._labels_enum)
            labels_enum = dict(self._labels_enum)
            base_graph_kernel = self.X

        inp = list()
        neighbors = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, collections.Iterable):
                x, is_iter = list(x), True
            if is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: ' +
                                  str(idx))
                    continue
                if len(x) > 2:
                    # Entries after the edge labels are passed through.
                    extra = tuple(x[3:]) if len(x) > 3 else tuple()
                    x = Graph(x[0],
                              x[1],
                              x[2],
                              graph_format=self._graph_format)
                    extra = (x.get_labels(purpose='any',
                                          label_type="edge",
                                          return_none=True), ) + extra
                else:
                    x = Graph(x[0],
                              x[1], {},
                              graph_format=self._graph_format)
                    extra = tuple()
            elif type(x) is Graph:
                el = x.get_labels(purpose=self._graph_format,
                                  label_type="edge",
                                  return_none=True)
                extra = tuple() if el is None else (el, )
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph object or a list with at least ' +
                                'a graph like object and node labels ' +
                                'dict \n')

            label = x.get_labels(purpose='any')
            inp.append((x.get_graph_object(), extra))
            neighbors.append(x.get_edge_dictionary())
            labels.append(label)
            # Enumerate labels in order of first appearance.
            for v in set(itervalues(label)):
                if v not in labels_enum:
                    labels_enum[v] = nl
                    nl += 1
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

        # Calculate the hadamard matrix: its order is the smallest power of
        # two that is not smaller than the number of enumerated labels.
        H = hadamard(int(2**(ceil(log2(nl)))))

        def generate_graphs(labels):
            # Initial labeling of vertices based on their corresponding
            # Hadamard code (i-th row of the Hadamard matrix) where i is the
            # index of the label in the enumeration.
            new_graphs, new_labels = list(), list()
            for ((obj, extra), label) in zip(inp, labels):
                new_label = dict()
                for (k, v) in iteritems(label):
                    new_label[k] = H[labels_enum[v], :]
                new_graphs.append(
                    (obj, {node: tuple(code)
                           for (node, code) in iteritems(new_label)}) + extra)
                new_labels.append(new_label)

            yield new_graphs
            # Main: every further iteration adds to each vertex code the
            # codes its edge-dictionary neighbors had on the previous round.
            for _ in range(1, self.n_iter):
                new_graphs, labels, new_labels = list(), new_labels, list()
                for ((obj, extra), neighbor,
                     old_label) in zip(inp, neighbors, labels):
                    new_label = dict()
                    for (k, ns) in iteritems(neighbor):
                        new_label[k] = old_label[k]
                        for q in ns:
                            new_label[k] = np.add(new_label[k], old_label[q])
                    new_labels.append(new_label)
                    new_graphs.append(
                        (obj, {node: tuple(code)
                               for (node, code) in iteritems(new_label)}) +
                        extra)
                yield new_graphs

        if self._parallel is None:
            if self._method_calling == 1:
                for (i, g) in enumerate(generate_graphs(labels)):
                    base_graph_kernel[i].fit(g)
            elif self._method_calling == 2:
                # A list (not a generator) is summed: np.sum on a generator
                # is deprecated and ignores the ``axis`` argument.
                K = np.sum([base_graph_kernel[i].fit_transform(g)
                            for (i, g) in enumerate(generate_graphs(labels))],
                           axis=0)
            elif self._method_calling == 3:
                # Calculate the kernel matrix without parallelization
                K = np.sum([self.X[i].transform(g)
                            for (i, g) in enumerate(generate_graphs(labels))],
                           axis=0)

        else:
            if self._method_calling == 1:
                self._parallel(
                    joblib.delayed(efit)(base_graph_kernel[i], g)
                    for (i, g) in enumerate(generate_graphs(labels)))
            elif self._method_calling == 2:
                # Calculate the kernel matrix with parallelization
                K = np.sum(self._parallel(
                    joblib.delayed(efit_transform)(base_graph_kernel[i], g)
                    for (i, g) in enumerate(generate_graphs(labels))),
                           axis=0)
            elif self._method_calling == 3:
                # Calculate the kernel matrix with parallelization
                K = np.sum(self._parallel(
                    joblib.delayed(etransform)(self.X[i], g)
                    for (i, g) in enumerate(generate_graphs(labels))),
                           axis=0)

        if self._method_calling == 1:
            self._labels_enum = labels_enum
            return base_graph_kernel
        elif self._method_calling == 2:
            self._labels_enum = labels_enum
            return K, base_graph_kernel
        elif self._method_calling == 3:
            return K
Beispiel #23
0
    def parse_input(self, X):
        """Parse and create features for graphlet_sampling kernel.

        Graphlets are sampled from the binarized adjacency matrix of every
        graph and each sample is assigned to a bin of mutually isomorphic
        graphlets. Bins found on fit are stored in ``self._graph_bins``;
        graphlets met on transform that match none of those are collected in
        ``self._Y_graph_bins`` and indexed after the fitted bins.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable of
            one to three entries whose first entry is a valid graph structure
            (adjacency matrix or edge dictionary). Only the structure is
            used; labels are ignored. Empty elements are skipped with a
            warning.

        Returns
        -------
        local_values : dict
            Maps ``(graph index, bin index)`` to the number of graphlets
            sampled from that graph that fell into that bin.

        Raises
        ------
        TypeError
            If ``X`` or one of its elements has an unsupported type.
        ValueError
            If no graph could be parsed.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')

        i = -1
        if self._method_calling == 1:
            self._graph_bins = dict()
        elif self._method_calling == 3:
            self._Y_graph_bins = dict()
        local_values = dict()
        for (idx, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, collections.Iterable):
                is_iter = True
                x = list(x)
            if type(x) is Graph:
                A = x.get_adjacency_matrix()
            elif is_iter and len(x) in [0, 1, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on ' +
                                  'index: ' + str(idx))
                    continue
                A = Graph(x[0], {}, {},
                          self._graph_format).get_adjacency_matrix()
            else:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 1 ' +
                                'and at most 3 elements\n')
            # Drop edge weights: only the presence of an edge matters.
            A = (A > 0).astype(int)
            i += 1
            # sample graphlets based on the initialized method
            samples = self.sample_graphlets_(A, self.k_, self.n_samples_,
                                             self.random_state_)

            if self._method_calling == 1:
                for sg in samples:
                    # add the graph to an isomorphism class
                    for k in range(len(self._graph_bins)):
                        if self._graph_bins[k].isomorphic(sg):
                            # Every hit counts exactly once, also the first
                            # hit of this graph on an existing bin.
                            key = (i, k)
                            local_values[key] = local_values.get(key, 0) + 1
                            break
                    else:
                        # No known bin matched (or none exists yet).
                        new_bin = len(self._graph_bins)
                        local_values[(i, new_bin)] = 1
                        self._graph_bins[new_bin] = sg
            elif self._method_calling == 3:
                for sg in samples:
                    # First try the bins that were found on fit.
                    matched = False
                    for k in range(len(self._graph_bins)):
                        if self._graph_bins[k].isomorphic(sg):
                            matched = True
                            key = (i, k)
                            local_values[key] = local_values.get(key, 0) + 1
                            break
                    if matched:
                        continue

                    # Then the bins created during this transform, whose
                    # indices continue after those of the fitted bins.
                    start = len(self._graph_bins)
                    for l in range(len(self._Y_graph_bins)):
                        if self._Y_graph_bins[l].isomorphic(sg):
                            key = (i, l + start)
                            local_values[key] = local_values.get(key, 0) + 1
                            break
                    else:
                        new_bin = len(self._Y_graph_bins)
                        local_values[(i, start + new_bin)] = 1
                        self._Y_graph_bins[new_bin] = sg

        if i == -1:
            raise ValueError('parsed input is empty')

        # Remember how many graphs were parsed.
        if self._method_calling == 1:
            self._nx = i + 1
        elif self._method_calling == 3:
            self._ny = i + 1
        return local_values
Beispiel #24
0
    def parse_input(self, X):
        """Parse and create features for pyramid_match kernel.

        Every graph is embedded through the eigenvectors of its adjacency
        matrix: the absolute values of (at most) ``self.d`` eigenvectors,
        ordered by decreasing eigenvalue, give one row per vertex. The
        embeddings (and the vertex labels, if ``self.with_labels`` is set)
        are then passed to ``self._histogram_calculation``.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            whose first entry is a valid graph structure (adjacency matrix or
            edge dictionary). When ``self.with_labels`` is set a second entry
            with the node labels is required. Empty elements are skipped with
            a warning.

        Returns
        -------
        H : list
            A list of lists of Histograms for all levels for each graph, as
            produced by ``self._histogram_calculation``.

        Raises
        ------
        TypeError
            If ``X`` or one of its elements has an unsupported type.
        ValueError
            If no graph could be parsed.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            Us = []
            if self.with_labels:
                Ls = []
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, collections.Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or
                                (len(x) >= 1 and not self.with_labels) or
                                (len(x) >= 2 and self.with_labels)):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element on index: ' +
                                      str(idx))
                        continue
                    elif not self.with_labels:
                        x = Graph(x[0], {}, {}, self._graph_format)
                    else:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif not type(x) is Graph:
                    raise TypeError(
                        'each element of X must be either a graph object or a list with '
                        'at least a graph like object and node labels dict \n')
                A = x.get_adjacency_matrix()
                if self.with_labels:
                    L = x.get_labels(purpose="adjacency")
                i += 1
                if A.shape[0] == 0:
                    # NOTE(review): unlike the non-empty case this stores a
                    # bare array instead of a (size, embedding) tuple -
                    # confirm against _histogram_calculation.
                    Us.append(np.zeros((1, self.d)))
                else:
                    # Perform eigenvalue decomposition.
                    # Rows of matrix U correspond to vertex representations
                    # Embed vertices into the d-dimensional space
                    if A.shape[0] > self.d + 1:
                        # Graph is large enough for the sparse solver to
                        # extract d eigenpairs. The builtin ``float`` is used
                        # as dtype because the ``np.float`` alias was removed
                        # from NumPy.
                        Lambda, U = eigs(csr_matrix(A, dtype=float),
                                         k=self.d,
                                         ncv=10 * self.d)
                        order = Lambda.argsort()[::-1]
                        U = U[:, order]
                    else:
                        # Small graph: full dense decomposition, keeping at
                        # most the d leading eigenvectors.
                        Lambda, U = np.linalg.eig(A)
                        order = Lambda.argsort()[::-1]
                        U = U[:, order]
                        U = U[:, :self.d]
                    # Replace all components by their absolute values
                    U = np.absolute(U)
                    Us.append((A.shape[0], U))
                if self.with_labels:
                    Ls.append(L)

        if i == 0:
            raise ValueError('parsed input is empty')

        if self.with_labels:
            # Map labels to values between 0 and |L|-1
            # where |L| is the number of distinct labels
            if self._method_calling in [1, 2]:
                self._num_labels = 0
                self._labels = set()
                for L in Ls:
                    self._labels |= set(itervalues(L))
                self._num_labels = len(self._labels)
                self._labels = {l: i for (i, l) in enumerate(self._labels)}
                return self._histogram_calculation(Us, Ls, self._labels)

            elif self._method_calling == 3:
                labels = set()
                for L in Ls:
                    labels |= set(itervalues(L))
                # Labels unseen on fit get fresh indices after the fitted
                # ones; the fitted mapping itself is left untouched.
                rest_labels = labels - set(self._labels.keys())
                nouveau_labels = dict(
                    chain(iteritems(self._labels), ((j, i) for (
                        i, j) in enumerate(rest_labels, len(self._labels)))))
                return self._histogram_calculation(Us, Ls, nouveau_labels)
        else:
            return self._histogram_calculation(Us)
Beispiel #25
0
    def parse_input(self, X):
        """Parse the input into adjacency matrices split by vertex-label pairs.

        For every graph and every ordered pair ``(a, b)`` of labels occurring
        in it, the adjacency matrix is masked so that only the entries
        ``[u, v]`` with ``label(u) == a`` and ``label(v) == b`` are kept.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable of
            two or three entries: a valid graph structure (adjacency matrix
            or edge dictionary) followed by the node labels; a third entry is
            accepted but not used. Empty elements are skipped with a warning.

        Returns
        -------
        out : list
            One tuple ``(amss, s)`` per parsed graph, where ``amss`` maps
            every ordered label pair to the masked adjacency matrix and ``s``
            is the number of vertices.

        Raises
        ------
        TypeError
            If ``X`` or one of its elements has an unsupported type.
        ValueError
            If no graph could be parsed.

        """
        if not isinstance(X, collections.Iterable):
            raise TypeError('input must be an iterable\n')

        i = 0
        proc = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, collections.Iterable)
            if is_iter:
                x = list(x)
            # Length 0 is accepted here so that empty elements are skipped
            # with a warning; length 1 carries no labels and is rejected
            # below with the TypeError.
            if is_iter and len(x) in [0, 2, 3]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element' +
                                  ' on index: ' + str(idx))
                    continue
                x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is not Graph:
                raise TypeError('each element of X must be either a ' +
                                'graph or an iterable with at least 2 ' +
                                'and at most 3 elements\n')
            i += 1
            x.desired_format("adjacency")
            Ax = x.get_adjacency_matrix()
            Lx = x.get_labels(purpose="adjacency")
            # Labels as a list ordered like the rows of the adjacency matrix.
            Lx = [Lx[v] for v in range(Ax.shape[0])]
            proc.append((Ax, Lx, Ax.shape[0]))

        if i == 0:
            raise ValueError('parsed input is empty')

        out = list()
        for Ax, Lx, s in proc:
            amss = dict()
            labels = set(Lx)
            Lx = np.array(Lx)
            for t in product(labels, labels):
                # Outer product of the two label indicators: True exactly
                # where the row vertex has label t[0] and the column vertex
                # has label t[1].
                selector = np.matmul(np.expand_dims(Lx == t[0], axis=1),
                                     np.expand_dims(Lx == t[1], axis=0))
                amss[t] = Ax * selector
            out.append((amss, s))

        return out
Beispiel #26
0
    def parse_input(self, X):
        """Parse and create features for the propagation kernel.

        All vertices of all graphs are stacked in one matrix ``P`` holding a
        one-hot encoding of their labels. For ``self.t_max`` rounds the rows
        of ``P`` are hashed with ``self.calculate_LSH``, the hash values of
        each graph are counted, and ``P`` is propagated by multiplying each
        graph's slice with its row-normalized transition matrix.

        Parameters
        ----------
        X : iterable
            Each element must be one of:

            * a ``Graph`` object,
            * a pair ``(Graph, T)`` with a transition matrix ``T``,
            * an iterable of two to four entries: a valid graph structure
              (adjacency matrix or edge dictionary), the node labels, an
              unused third entry and, as fourth entry, a transition matrix.

            When no transition matrix is given the adjacency matrix is used.
            Empty elements are skipped with a warning.

        Returns
        -------
        phi : list
            One dictionary per parsed graph, mapping the round index ``t`` to
            a ``Counter`` of the hash-bin indices of the graph's vertices on
            that round.

        Raises
        ------
        ValueError
            If ``X`` or one of its elements has an unsupported type, or if
            nothing could be parsed.
        TypeError
            If a transition matrix is not square or does not match the number
            of vertices, or if a label is not hashable.

        Notes
        -----
        Only ``self._method_calling`` values 1 (fit) and 3 (transform) set up
        the label enumeration used below.

        """
        if not isinstance(X, collections.Iterable):
            raise ValueError('input must be an iterable\n')

        i = -1
        transition_matrix = dict()
        labels = set()
        L = list()
        for (idx, x) in enumerate(iter(X)):
            is_iter = isinstance(x, collections.Iterable)
            if is_iter:
                x = list(x)
            if is_iter and len(x) in [0, 2, 3, 4]:
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on ' +
                                  'index: ' + str(idx))
                    continue
                if len(x) == 2 and type(x[0]) is Graph:
                    g, T = x
                else:
                    g = Graph(x[0], x[1], {}, self._graph_format)
                    if len(x) == 4:
                        T = x[3]
                    else:
                        T = None
            elif type(x) is Graph:
                g, T = x, None
            else:
                raise ValueError('Each element of X must be either a ' +
                                 'Graph or an iterable with at least 2 ' +
                                 'and at most 4 elements\n')

            if T is not None:
                if T.shape[0] != T.shape[1]:
                    raise TypeError('Transition matrix on index ' +
                                    str(idx) + ' must be ' +
                                    'a square matrix.')
                if T.shape[0] != g.nv():
                    raise TypeError('Propagation matrix must ' +
                                    'have the same dimension ' +
                                    'as the number of vertices.')
            else:
                T = g.get_adjacency_matrix()

            i += 1
            # Divide every row of T by its row sum.
            transition_matrix[i] = (T.T / np.sum(T, axis=1)).T
            label = g.get_labels(purpose='adjacency')
            try:
                labels |= set(itervalues(label))
            except TypeError:
                raise TypeError(
                    'For a non attributed kernel, labels should be hashable.'
                )
            L.append((g.nv(), label))

        if i == -1:
            raise ValueError('Parsed input is empty')

        # The number of parsed graphs
        n = i + 1

        # enumerate labels
        if self._method_calling == 1:
            enum_labels = {l: i for (i, l) in enumerate(list(labels))}
            self._enum_labels = enum_labels
            self._parent_labels = labels
        elif self._method_calling == 3:
            # Labels unseen on fit are enumerated after the fitted ones.
            new_elements = labels - self._parent_labels
            if len(new_elements) > 0:
                new_enum_labels = iter((l, i) for (i, l) in enumerate(
                    list(new_elements), len(self._enum_labels)))
                enum_labels = dict(
                    chain(iteritems(self._enum_labels), new_enum_labels))
            else:
                enum_labels = self._enum_labels

        # make a matrix for all graphs that contains label vectors;
        # indexes[k]:indexes[k + 1] is the row range of graph k
        data, indexes = list(), [0]
        for (nv, label) in L:
            data += [(indexes[-1] + j, enum_labels[label[j]])
                     for j in range(nv)]
            indexes.append(indexes[-1] + nv)

        # Initialise the one hot vector
        rows, cols = zip(*data)
        P = np.zeros(shape=(indexes[-1], len(enum_labels)))
        P[rows, cols] = 1
        dim_orig = len(self._enum_labels)

        def propagate():
            # One propagation step, applied in place graph by graph.
            for k in range(n):
                start, end = indexes[k:k + 2]
                P[start:end, :] = np.dot(transition_matrix[k],
                                         P[start:end, :])

        # feature vectors
        if self._method_calling == 1:
            # Draw and store the random projections and offsets of every
            # round so that transform can reuse them.
            self._u, self._b, self._hd = list(), list(), list()
            for t in range(self.t_max):
                # simple normal
                u = self.random_state_.randn(len(enum_labels))

                if self.take_cauchy_:
                    # cauchy
                    u = np.divide(
                        u, self.random_state_.randn(len(enum_labels)))

                self._u.append(u)
                # random offset
                self._b.append(self.w * self.random_state_.rand())

            phi = {k: dict() for k in range(n)}
            for t in range(self.t_max):
                # hash all graphs inside P and produce the feature vectors
                hashes = self.calculate_LSH(P, self._u[t], self._b[t])
                # Enumerate the distinct hash values of this round.
                hd = dict(
                    (j, i) for i, j in enumerate(set(np.unique(hashes))))
                self._hd.append(hd)
                features = np.vectorize(lambda i: hd[i])(hashes)

                # Accumulate the results.
                for k in range(n):
                    phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]])

                # calculate the Propagation matrix if needed
                if t < self.t_max - 1:
                    propagate()

            return [phi[k] for k in range(n)]

        elif (self._method_calling == 3 and dim_orig >= len(enum_labels)):
            # Transform without unseen labels: reuse the fitted projections.
            phi = {k: dict() for k in range(n)}
            for t in range(self.t_max):
                # hash all graphs inside P and produce the feature vectors
                hashes = self.calculate_LSH(P, self._u[t], self._b[t])
                # Fitted hash enumeration, extended by the unseen hashes.
                hd = dict(
                    chain(
                        iteritems(self._hd[t]),
                        iter((j, i)
                             for i, j in enumerate(
                                 filterfalse(lambda x: x in self._hd[t],
                                             np.unique(hashes)),
                                 len(self._hd[t])))))

                features = np.vectorize(lambda i: hd[i])(hashes)

                # Accumulate the results.
                for k in range(n):
                    phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]])

                # calculate the Propagation matrix if needed
                if t < self.t_max - 1:
                    propagate()

            return [phi[k] for k in range(n)]

        else:
            # Unseen labels are present: rows restricted to the fitted label
            # columns are hashed with the fitted projection, the remaining
            # rows with a projection extended by fresh random components.
            cols = np.array(cols)
            vertices = np.where(cols < dim_orig)[0]
            vertices_p = np.where(cols >= dim_orig)[0]
            nnv = len(enum_labels) - dim_orig
            phi = {k: dict() for k in range(n)}
            for t in range(self.t_max):
                # hash all graphs inside P and produce the feature vectors
                hashes = self.calculate_LSH(P[vertices, :dim_orig],
                                            self._u[t], self._b[t])

                hd = dict(
                    chain(
                        iteritems(self._hd[t]),
                        iter((j, i)
                             for i, j in enumerate(
                                 filterfalse(lambda x: x in self._hd[t],
                                             np.unique(hashes)),
                                 len(self._hd[t])))))

                features = np.vectorize(lambda i: hd[i],
                                        otypes=[int])(hashes)

                # for the new labels hash P and produce the feature vectors
                u = self.random_state_.randn(nnv)
                if self.take_cauchy_:
                    # cauchy
                    u = np.divide(u, self.random_state_.randn(nnv))

                u = np.hstack((self._u[t], u))

                # calculate hashes for the remaining
                hashes = self.calculate_LSH(P[vertices_p, :], u,
                                            self._b[t])
                hd = dict(
                    chain(
                        iteritems(hd),
                        iter((j, i)
                             for i, j in enumerate(hashes, len(hd)))))

                features_p = np.vectorize(lambda i: hd[i],
                                          otypes=[int])(hashes)

                # Accumulate the results. indexes[k + 1] is the first row
                # of the next graph, hence the strict upper bound.
                for k in range(n):
                    in_graph = np.logical_and(indexes[k] <= vertices,
                                              vertices < indexes[k + 1])
                    in_graph_p = np.logical_and(indexes[k] <= vertices_p,
                                                vertices_p < indexes[k + 1])
                    phi[k][t] = (Counter(features[in_graph]) +
                                 Counter(features_p[in_graph_p]))

                # calculate the Propagation matrix if needed
                if t < self.t_max - 1:
                    propagate()

                    # Re-split the rows: those with mass on every new label
                    # column go to the extended projection.
                    Q = np.all(P[:, dim_orig:] > 0, axis=1)
                    vertices = np.where(~Q)[0]
                    vertices_p = np.where(Q)[0]

            return [phi[k] for k in range(n)]
    def parse_input(self, X):
        """Parse and create features for the `subgraph_matching` kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            holding exactly three items: a valid graph structure (adjacency
            matrix or edge dictionary), the node labels and the edge labels
            (in the format that corresponds to the graph structure).
            Empty iterables are skipped with a warning.

        Returns
        -------
        out : list
            One tuple ``(n, Er, L, Le)`` per parsed graph, where ``n`` is the
            value of ``Graph.nv()``, ``Er`` is the set of ``(a, b)`` pairs of
            the edge dictionary with ``a != b`` (self loops dropped), ``L``
            are the node labels and ``Le`` the edge labels.

        Raises
        ------
        TypeError
            If `X` is not iterable, or one of its elements is neither a
            ``Graph`` nor an iterable of zero or three items.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
        # ``collections.abc``. The fallback keeps Python 2 working.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        bad_element_msg = ('each element of X must be either a ' +
                           'graph object or a list with at least ' +
                           'a graph like object and node, ' +
                           'edge labels dict \n')

        out = list()
        for (idx, x) in enumerate(iter(X)):
            if isinstance(x, Iterable):
                # Materialise once, so generators can be measured and indexed.
                x = list(x)
                if len(x) == 0:
                    warnings.warn('Ignoring empty element' +
                                  ' on index: ' + str(idx))
                    continue
                if len(x) != 3:
                    raise TypeError(bad_element_msg)
                g = Graph(x[0], x[1], x[2], "adjacency")
                g.change_format(self._graph_format)
            elif type(x) is Graph:
                # Build a fresh Graph from the adjacency view of the input
                # object instead of reusing the caller's instance.
                g = Graph(
                    x.get_adjacency_matrix(),
                    x.get_labels(purpose="adjacency"),
                    x.get_labels(purpose="adjacency", label_type="edge"),
                    self._graph_format)
            else:
                raise TypeError(bad_element_msg)

            n = g.nv()
            E = g.get_edge_dictionary()
            # Labels are only mandatory when the matching sub-kernel is set.
            L = g.get_labels(purpose="dictionary",
                             return_none=(self.kv is None))
            Le = g.get_labels(purpose="dictionary",
                              label_type="edge",
                              return_none=(self.ke is None))
            Er = set(
                (a, b) for a in E.keys() for b in E[a].keys() if a != b)

            out.append((n, Er, L, Le))

        if not out:
            raise ValueError('parsed input is empty')
        return out
    def parse_input(self, X):
        """Parse and create features for multiscale_laplacian kernel.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with two or three items: a valid graph structure (adjacency
            matrix or edge dictionary), the node labels/attributes and,
            optionally, edge labels. Edge labels are not used by this
            kernel and are discarded. Empty iterables are skipped with a
            warning.

        Returns
        -------
        out : list
            One tuple ``(A, phi, phi_outer, Q, L)`` per parsed graph:
            the adjacency matrix, the node feature matrix, its outer product
            ``phi . phi^T``, a dictionary ``Q[level][key]`` holding the
            neighborhood indexes (``"n"``) and the inverse of the
            corresponding sub-graph Laplacian (``"l"``) for every level up
            to ``self.L``, and the inverse Laplacian of ``A``.

        Raises
        ------
        TypeError
            If `X` is not iterable, if an element is neither a ``Graph`` nor
            an iterable of zero, two or three items, or if node features
            cannot be turned into a numpy array.
        ValueError
            If no graph could be parsed from `X`.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
        # ``collections.abc``. The fallback keeps Python 2 working.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        bad_element_msg = ('each element of X must be either a ' +
                           'graph or an iterable with at least 1 ' +
                           'and at most 3 elements\n')

        out = list()
        start = time.time()
        for (idx, x) in enumerate(iter(X)):
            if isinstance(x, Iterable):
                x = list(x)
                if len(x) == 0:
                    warnings.warn('Ignoring empty element ' +
                                  'on index: ' + str(idx))
                    continue
                if len(x) not in (2, 3):
                    raise TypeError(bad_element_msg)
                # Edge labels (x[2]) are intentionally dropped.
                x = Graph(x[0], x[1], {}, self._graph_format)
            elif type(x) is Graph:
                # The original test was inverted (`is not Graph`), which
                # rejected Graph objects and crashed on everything else.
                x.desired_format(self._graph_format)
            else:
                raise TypeError(bad_element_msg)

            phi_d = x.get_labels()
            A = x.get_adjacency_matrix()
            N = x.produce_neighborhoods(r=self.L, sort_neighbors=False)
            try:
                phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
            except TypeError:
                raise TypeError('Features must be iterable and castable ' +
                                'in total to a numpy array.')
            phi_outer = np.dot(phi, phi.T)

            # Inverse of the graph Laplacian after `_increment_diagonal_`
            # has been applied with `self.heta` (presumably a diagonal
            # regulariser that keeps the matrix invertible -- confirm).
            Lap = laplacian(A).astype(float)
            _increment_diagonal_(Lap, self.heta)
            L = inv(Lap)

            Q = dict()
            for level in range(1, self.L + 1):
                Q[level] = dict()
                for (key, item) in iteritems(N[level]):
                    Q[level][key] = dict()
                    Q[level][key]["n"] = np.array(item)
                    if len(item) < A.shape[0]:
                        # Proper sub-graph: invert the Laplacian of the
                        # induced adjacency sub-matrix.
                        laplac = laplacian(A[item, :][:,
                                                      item]).astype(float)
                        _increment_diagonal_(laplac, self.heta)
                        laplac = inv(laplac)
                    else:
                        # Neighborhood spans the whole graph: reuse L.
                        laplac = L
                    Q[level][key]["l"] = laplac

            out.append((A, phi, phi_outer, Q, L))

        if self.verbose:
            print("Preprocessing took:", time.time() - start, "s.")
        if not out:
            raise ValueError('parsed input is empty')

        return out
Beispiel #29
0
def read_data(name,
              with_classes=True,
              prefer_attr_nodes=False,
              prefer_attr_edges=False,
              produce_labels_nodes=False,
              as_graphs=False,
              is_symmetric=symmetric_dataset,
              label_pro_dir='D:/Projects/PyProjects/Motif_Entropy/data/'):
    """Create a dataset iterable for GraphKernel.

    The dataset files are read from ``./<name>/<name>_*.txt``. Node labels
    are afterwards re-weighted with the per-node probabilities found in
    ``<label_pro_dir><name>/<name>_label_pro.txt``.

    Parameters
    ----------
    name : str
        The dataset name. It must be a key of ``dataset_metadata``.

    with_classes : bool, default=True
        Return an iterable of class labels based on the enumeration.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes
        set as labels for the graph object for *nodes* the attributes.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes
        set as labels for the graph object for *edge* the attributes.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found.
        Currently this means labeling its node by its degree inside the Graph.
        This operation is applied only if node labels are non existent.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    is_symmetric : bool, default=symmetric_dataset
        Defines if the graph data describe a symmetric graph.

    label_pro_dir : str, default='D:/Projects/PyProjects/Motif_Entropy/data/'
        Directory prefix (including the trailing separator) under which
        ``<name>/<name>_label_pro.txt`` is looked up. Each line of that file
        holds a node label and a probability separated by a single space.

    Returns
    -------
    bunch : Bunch
        ``bunch.data`` is a list with one entry per graph. Each entry is
        either a ``Graph`` object (``as_graphs=True``) or a list
        ``[edges, node_labels, edge_labels, node_probabilities]``.
        ``bunch.target`` (only if ``with_classes`` is true) is a one
        dimensional integer array of graph classes aligned with
        ``bunch.data``.

    """
    base = "./" + str(name) + "/" + str(name)
    indicator_path = base + "_graph_indicator.txt"
    edges_path = base + "_A.txt"
    node_labels_path = base + "_node_labels.txt"
    node_attributes_path = base + "_node_attributes.txt"
    edge_labels_path = base + "_edge_labels.txt"
    edge_attributes_path = base + "_edge_attributes.txt"
    graph_classes_path = base + "_graph_labels.txt"

    # node graph correspondence
    ngc = dict()
    # edge line correspondence
    elc = dict()
    # dictionary that keeps sets of edges
    Graphs = dict()
    # dictionary of labels for nodes
    node_labels = dict()
    # dictionary of labels for edges
    edge_labels = dict()

    # Associate graphs nodes with indexes
    with open(indicator_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            graph_id = int(line[:-1])
            ngc[i] = graph_id
            if graph_id not in Graphs:
                Graphs[graph_id] = set()
            if graph_id not in node_labels:
                node_labels[graph_id] = dict()
            if graph_id not in edge_labels:
                edge_labels[graph_id] = dict()

    # Extract graph edges
    with open(edges_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            edge = line[:-1].replace(' ', '').split(",")
            src, dst = int(edge[0]), int(edge[1])
            elc[i] = (src, dst)
            Graphs[ngc[src]].add((src, dst))
            if is_symmetric:
                Graphs[ngc[dst]].add((dst, src))

    # Extract node attributes
    if (prefer_attr_nodes and dataset_metadata[name].get(
            "na", os.path.exists(node_attributes_path))):
        with open(node_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = \
                    [float(num) for num in
                     line[:-1].replace(' ', '').split(",")]
    # Extract node labels
    elif dataset_metadata[name].get("nl", os.path.exists(node_labels_path)):
        with open(node_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                # The trailing character of the first space-separated token
                # is dropped (a newline, or the comma of a "a, b" row).
                node_labels[ngc[i]][i] = int(line.split(' ')[0][:-1])
    elif produce_labels_nodes:
        for i in range(1, len(Graphs) + 1):
            node_labels[i] = dict(Counter(s for (s, d) in Graphs[i] if s != d))

    # Extract edge attributes
    if (prefer_attr_edges and dataset_metadata[name].get(
            "ea", os.path.exists(edge_attributes_path))):
        with open(edge_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                attrs = [
                    float(num) for num in line[:-1].replace(' ', '').split(",")
                ]
                src, dst = elc[i]
                edge_labels[ngc[src]][elc[i]] = attrs
                if is_symmetric:
                    edge_labels[ngc[dst]][(dst, src)] = attrs

    # Extract edge labels
    elif dataset_metadata[name].get("el", os.path.exists(edge_labels_path)):
        with open(edge_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                src, dst = elc[i]
                edge_labels[ngc[src]][elc[i]] = int(line[:-1])
                if is_symmetric:
                    edge_labels[ngc[dst]][(dst, src)] = int(line[:-1])

    # Read the per-node probabilities. The file is opened in a `with` block
    # so that the handle is always released (the previous code closed an
    # unrelated, already closed handle and leaked this one).
    label_pro_path = (label_pro_dir + str(name) + "/" + str(name) +
                      '_label_pro.txt')
    node_label_pro_list = []
    with open(label_pro_path, 'r') as label_pro_file:
        for line in label_pro_file:
            fields = line.strip().split(' ')
            # The first column (node label) is parsed only to validate the
            # row; just the probability in the second column is kept.
            int(fields[0])
            node_label_pro_list.append(float(fields[1]))

    # Number of labelled nodes of every graph, graph ids being 1..len(Graphs).
    count_len = [len(node_labels[i]) for i in range(1, len(Graphs) + 1)]

    # Global (1-based) node index -> probability.
    node_label_pro_dict = dict(enumerate(node_label_pro_list, 1))

    # Split the dictionary: the node probabilities that belong to each graph.
    keys = iter(node_label_pro_dict)
    label_pro_list = [
        {k: node_label_pro_dict[k] for k in islice(keys, size)}
        for size in count_len
    ]

    # Scale every node label by (1 + probability rounded to two decimals).
    # Node ids are assumed to be consecutive across graphs.
    count = 0
    for i in range(1, len(Graphs) + 1):
        for j in range(count + 1, count + count_len[i - 1] + 1):
            node_labels[i][j] += node_labels[i][j] * round(
                label_pro_list[i - 1][j], 2)
        count += count_len[i - 1]

    Gs = list()
    if as_graphs:
        for i in range(1, len(Graphs) + 1):
            # NOTE(review): the probabilities are handed to Graph as its
            # fourth positional argument -- confirm against Graph.__init__.
            Gs.append(
                Graph(Graphs[i], node_labels[i], edge_labels[i],
                      label_pro_list[i - 1]))
    else:
        for i in range(1, len(Graphs) + 1):
            Gs.append([
                Graphs[i], node_labels[i], edge_labels[i],
                label_pro_list[i - 1]
            ])

    if with_classes:
        classes = []
        with open(graph_classes_path, "r") as f:
            for line in f:
                classes.append(int(line[:-1]))

        # `np.int` was only an alias of the builtin and was removed in
        # NumPy 1.24.
        classes = np.array(classes, dtype=int)
        return Bunch(data=Gs, target=classes)
    else:
        return Bunch(data=Gs)
Beispiel #30
0
    def parse_input(self, X):
        """Parse input for weisfeiler lehman optimal assignment.

        Parameters
        ----------
        X : iterable
            Each element must be either a ``Graph`` object or an iterable
            with at least two items: a valid graph structure (adjacency
            matrix or edge dictionary) and the node labels, optionally
            followed by edge labels. Items past the third are ignored.
            Empty iterables are skipped with a warning.

        Returns
        -------
        Hs : numpy array, shape = [n_input_graphs, hierarchy_size]
            An array where the rows contain the histograms of the graphs.
            A ``scipy.sparse.lil_matrix`` is returned instead if
            ``self.sparse`` is set.

        Raises
        ------
        ValueError
            If the method is not called from fit or fit-transform, or if no
            graph could be parsed from `X`.
        TypeError
            If `X` is not iterable or one of its elements is neither a
            ``Graph`` nor an iterable of zero or at least two items.

        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
        # ``collections.abc``. The fallback keeps Python 2 working.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if self._method_calling not in [1, 2]:
            raise ValueError('method call must be called either from fit ' +
                             'or fit-transform')
        elif hasattr(self, '_X_diag'):
            # Clean _X_diag value
            delattr(self, '_X_diag')

        # Input validation and parsing
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        bad_element_msg = ('each element of X must be either a ' +
                           'graph object or a list with at least ' +
                           'a graph like object and node labels ' +
                           'dict \n')

        nx = 0
        Gs_ed, L, distinct_values = dict(), dict(), set()
        for (idx, x) in enumerate(iter(X)):
            if isinstance(x, Iterable):
                x = list(x)
                if len(x) == 0:
                    warnings.warn('Ignoring empty element on index: '
                                  + str(idx))
                    continue
                if len(x) < 2:
                    raise TypeError(bad_element_msg)
                edge_labels = x[2] if len(x) > 2 else {}
                x = Graph(x[0], x[1], edge_labels,
                          graph_format=self._graph_format)
            elif type(x) is Graph:
                x.desired_format(self._graph_format)
            else:
                raise TypeError(bad_element_msg)
            Gs_ed[nx] = x.get_edge_dictionary()
            L[nx] = x.get_labels(purpose="dictionary")
            distinct_values |= set(itervalues(L[nx]))
            nx += 1
        if nx == 0:
            raise ValueError('parsed input is empty')

        # Save the number of "fitted" graphs.
        self._nx = nx

        # Initialize hierarchy
        self._hierarchy = dict()
        self._hierarchy['root'] = dict()
        self._hierarchy['root']['parent'] = None
        self._hierarchy['root']['children'] = list()
        self._hierarchy['root']['w'] = 0
        self._hierarchy['root']['omega'] = 0

        # Map every distinct original label to a consecutive integer and
        # hang it directly under the root of the hierarchy.
        WL_labels_inverse = dict()
        label_count = 0
        for dv in sorted(distinct_values):
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, 'root')
            label_count += 1

        # Initalize an inverse dictionary of labels for all iterations
        self._inv_labels = dict()
        self._inv_labels[0] = WL_labels_inverse

        # Relabel every graph with the integer labels.
        for j in range(nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels

        for i in range(1, self._n_iter):
            new_previous_label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # Build for each node a signature made of its own label and
                # the sorted labels of its neighbors; remember which label
                # the signature was derived from.
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    new_previous_label_set.add((credential, L[j][v]))

            # Every new signature becomes a child of its previous label.
            label_list = sorted(new_previous_label_set, key=lambda tup: tup[0])
            for dv, previous_label in label_list:
                WL_labels_inverse[dv] = label_count
                self._insert_into_hierarchy(label_count, previous_label)
                label_count += 1

            # Recalculate labels
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
            self._inv_labels[i] = WL_labels_inverse

        # Compute the vector representation of each graph
        if self.sparse:
            Hs = lil_matrix((nx, len(self._hierarchy)))
        else:
            Hs = np.zeros((nx, len(self._hierarchy)))
        for j in range(nx):
            for k in L[j].keys():
                # Walk from the final label up to (excluding) the root and
                # accumulate the weight of every ancestor.
                current_label = L[j][k]
                while self._hierarchy[current_label]['parent'] is not None:
                    Hs[j, current_label] += self._hierarchy[current_label]['omega']
                    current_label = self._hierarchy[current_label]['parent']

        return Hs