def parse_input(self, X): """Parse and create features for svm_theta kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- out : list The lovasz metrics for the given input. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: i = 0 out = list() for (idx, x) in enumerate(iter(X)): is_iter = False if isinstance(x, collections.Iterable): x, is_iter = list(x), True if is_iter and len(x) in [0, 1, 2, 3]: if len(x) == 0: warnings.warn('Ignoring empty element ' + 'on index: ' + str(idx)) continue else: x = Graph(x[0], {}, {}, self._graph_format) elif type(x) is not Graph: raise TypeError('each element of X must be either a ' + 'graph or an iterable with at least 1 ' + 'and at most 3 elements\n') i += 1 A = x.get_adjacency_matrix() dual_coeffs = _calculate_svm_theta_(A) out.append(self._calculate_svm_theta_levels_(A, dual_coeffs)) if i == 0: raise ValueError('parsed input is empty') return out
def parse_input(self, X): """Parse and create features for graphlet_sampling kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- out : list The extracted adjacency matrices for any given input. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: i = 0 proc = list() for (idx, x) in enumerate(iter(X)): is_iter = isinstance(x, collections.Iterable) if is_iter: x = list(x) if is_iter and len(x) in [1, 2, 3]: if len(x) == 0: warnings.warn('Ignoring empty element' + ' on index: ' + str(idx)) continue else: x = Graph(x[0], x[1], {}, self._graph_format) elif type(x) is not Graph: raise TypeError('each element of X must be either a ' + 'graph or an iterable with at least 2 ' + 'and at most 3 elements\n') i += 1 x.desired_format("adjacency") Ax = x.get_adjacency_matrix() Lx = x.get_labels(purpose="adjacency") Lx = [Lx[idx] for idx in range(Ax.shape[0])] proc.append((Ax, Lx, Ax.shape[0])) out = list() for Ax, Lx, s in proc: amss = dict() labels = set(Lx) Lx = np.array(Lx) for t in product(labels, labels): selector = np.matmul(np.expand_dims(Lx == t[0], axis=1), np.expand_dims(Lx == t[1], axis=0)) amss[t] = Ax * selector out.append((amss, s)) if i == 0: raise ValueError('parsed input is empty') return out
def parse_input(self, X): """Parse input and create features, while initializing and/or calculating sub-kernels. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- base_graph_kernel : object Returns base_graph_kernel. Only if called from `fit` or `fit_transform`. K : np.array Returns the kernel matrix. Only if called from `transform` or `fit_transform`. """ # Input validation and parsing if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: nx, max_core_number, core_numbers, graphs = 0, 0, [], [] for (idx, x) in enumerate(iter(X)): is_iter = False extra = tuple() if isinstance(x, collections.Iterable): x, is_iter = list(x), True if is_iter and len(x) >= 0: if len(x) == 0: warnings.warn('Ignoring empty element on index: ' + str(idx)) continue elif len(x) == 1: x = Graph(x[0], {}, {}, graph_format="adjacency") elif len(x) == 2: x = Graph(x[0], x[1], {}, graph_format="adjacency") elif len(x) >= 3: if len(x) > 3: extra += tuple(x[3:]) x = Graph(x[0], x[1], x[2], graph_format="adjacency") elif type(x) is Graph: x.desired_format("adjacency") x = Graph( x.get_adjacency_matrix(), x.get_labels(purpose="adjacency", label_type="vertex", return_none=True), x.get_labels(purpose="adjacency", label_type="edge", return_none=True)) else: raise TypeError('each element of X must be either a ' 'graph object or a list with at least ' 'a graph like object and node labels ' 'dict \n') # workaround for leaving a sparse representation for x x.change_format(self._graph_format) c = core_number(x) max_core_number = max(max_core_number, max(c.values())) core_numbers.append(c) graphs.append((x, extra)) nx += 1 if nx == 0: raise ValueError('parsed input is empty') if max_core_number <= self.min_core: raise ValueError( 'The maximum core equals the min_core boundary set in init.') # Add the zero iteration element if self._method_calling == 2: K = np.zeros(shape=(nx, nx)) elif self._method_calling == 3: self._dummy_kernel = dict() K = np.zeros(shape=(nx, self._nx)) # Main base_graph_kernel, indexes_list = dict(), dict() for i in range(max_core_number, self.min_core, -1): subgraphs, indexes = list(), list() for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers, graphs)): vertices = [k for k, v in iteritems(cn) if v >= i] if len(vertices) > 0: # Calculate subgraph and store the index of the non-empty vertices sg = g.get_subgraph(vertices) sub_extra = list() indexes.append(idx) if len(extra) > 0: vs = np.array(sg.get_vertices(purpose='any')) for e in extra: # This case will only be reached by now if the user add the propagation # kernel as subkernel with a custom propagation matrix. This is a workaround! 
if type(e) is np.array and len(e.shape) == 2: e = e[vs, :][:, vs] sub_extra.append(e) subgraphs.append((sg, ) + tuple(sub_extra)) else: subgraphs.append(sg) indexes = np.array(indexes) indexes_list[i] = indexes # calculate kernel if self._method_calling == 1 and indexes.shape[0] > 0: base_graph_kernel[i] = self.base_graph_kernel_(**self.params_) base_graph_kernel[i].fit(subgraphs) elif self._method_calling == 2 and indexes.shape[0] > 0: base_graph_kernel[i] = self.base_graph_kernel_(**self.params_) ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs) for j in range(indexes.shape[0]): K[indexes[j], indexes] += ft_subgraph_mat[j, :] elif self._method_calling == 3: if self._max_core_number < i or self._fit_indexes[i].shape[ 0] == 0: if len(indexes) > 0: # add a dummy kernel for calculating the diagonal self._dummy_kernel[i] = self.base_graph_kernel_( **self.params_) self._dummy_kernel[i].fit(subgraphs) else: if indexes.shape[0] > 0: subgraph_tmat = self.X[i].transform(subgraphs) for j in range(indexes.shape[0]): K[indexes[j], self._fit_indexes[i]] += subgraph_tmat[j, :] if self._method_calling == 1: self._nx = nx self._max_core_number = max_core_number self._fit_indexes = indexes_list return base_graph_kernel elif self._method_calling == 2: self._nx = nx self._max_core_number = max_core_number self._fit_indexes = indexes_list return K, base_graph_kernel elif self._method_calling == 3: self._t_nx = nx self._max_core_number_trans = max_core_number self._transform_indexes = indexes_list return K
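# A hedged sketch of the k-core iteration driving the main loop above, using
# networkx for the core numbers (an assumption for illustration; the library
# computes them with its own `core_number` helper on its Graph type).
import networkx as nx

G = nx.Graph([(0, 1), (1, 2), (2, 0), (2, 3)])
cn = nx.core_number(G)                     # {0: 2, 1: 2, 2: 2, 3: 1}
for i in range(max(cn.values()), 0, -1):
    vertices = [v for v, c in cn.items() if c >= i]
    sg = G.subgraph(vertices)              # the i-core a base kernel would see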
def parse_input(self, X): """Parse and check the given input for the Graph Hopper kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that fitting the given graph format). Returns ------- out : np.array, shape=(len(X), n_labels) A np array for frequency (cols) histograms for all Graphs (rows). """ if not isinstance(X, Iterable): raise TypeError('input must be an iterable\n') else: ni = 0 diam = list() graphs = list() for (i, x) in enumerate(iter(X)): is_iter = False if isinstance(x, Iterable): is_iter = True x = list(x) if type(x) is Graph: g = Graph(x.get_adjacency_matrix(), x.get_labels(purpose="adjacency"), {}, self._graph_format) elif is_iter and len(x) == 0 or len(x) >= 2: if len(x) == 0: warn('Ignoring empty element on index: ' + str(i)) continue elif len(x) >= 2: g = Graph(x[0], x[1], {}, "adjacency") g.change_format(self._graph_format) else: raise TypeError('each element of X must be either a ' 'graph object or a list with at least ' 'a graph like object and node, ') spm, attr = g.build_shortest_path_matrix(labels="vertex") nv = g.nv() try: attributes = np.array([attr[j] for j in range(nv)]) except TypeError: raise TypeError( 'All attributes of a single graph should have the same dimension.' ) diam.append(int(np.max(spm[spm < float("Inf")]))) graphs.append((g.get_adjacency_matrix(), nv, attributes)) ni += 1 if self._method_calling == 1: max_diam = self._max_diam = max(diam) + 1 else: max_diam = max(self._max_diam, max(diam) + 1) out = list() for i in range(ni): AM, node_nr, attributes = graphs[i] des = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int) occ = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int) # Convert adjacency matrix to dictionary idx_i, idx_j = np.where(AM > 0) ed = defaultdict(dict) for (a, b) in filterfalse(lambda a: a[0] == a[1], zip(idx_i, idx_j)): ed[a][b] = AM[a, b] for j in range(node_nr): A = np.zeros(shape=AM.shape) # Single-source shortest path from node j D, p = dijkstra(ed, j) D = np.array( list(D.get(k, float("Inf")) for k in range(node_nr))) p[j] = -1 # Restrict to the connected component of node j conn_comp = np.where(D < float("Inf"))[0] # To-be DAG adjacency matrix of connected component of node j A_cc = A[conn_comp, :][:, conn_comp] # Adjacency matrix of connected component of node j AM_cc = AM[conn_comp, :][:, conn_comp] D_cc = D[conn_comp] conn_comp_converter = np.zeros(shape=(A.shape[0], 1), dtype=int) for k in range(conn_comp.shape[0]): conn_comp_converter[conn_comp[k]] = k conn_comp_converter = np.vstack([0, conn_comp_converter]) p_cc = conn_comp_converter[ np.array(list(p[k] for k in conn_comp)) + 1] # Number of nodes in connected component of node j conncomp_node_nr = A_cc.shape[0] for v in range(conncomp_node_nr): if p_cc[v] > 0: # Generate A_cc by adding directed edges of form (parent(v), v) A_cc[p_cc[v], v] = 1 # Distance from v to j v_dist = D_cc[v] # All neighbors of v in the undirected graph v_nbs = np.where(AM_cc[v, :] > 0)[0] # Distances of neighbors of v to j v_nbs_dists = D_cc[v_nbs] # All neighbors of v in undirected graph who are # one step closer to j than v is; i.e. 
SP-DAG parents v_parents = v_nbs[v_nbs_dists == (v_dist - 1)] # Add SP-DAG parents to A_cc A_cc[v_parents, v] = 1 # Computes the descendants & occurence vectors o_j(v), d_j(v) # for all v in the connected component occ_p, des_p = od_vectors_dag(A_cc, D_cc) if des_p.shape[0] == 1 and j == 0: des[j, 0, 0] = des_p occ[j, 0, 0] = occ_p else: # Convert back to the indices of the original graph for v in range(des_p.shape[0]): for l in range(des_p.shape[1]): des[j, conn_comp[v], l] = des_p[v, l] # Convert back to the indices of the original graph for v in range(occ_p.shape[0]): for l in range(occ_p.shape[1]): occ[j, conn_comp[v], l] = occ_p[v, l] M = np.zeros(shape=(node_nr, max_diam, max_diam)) # j loops through choices of root for j in range(node_nr): des_mat_j_root = np.squeeze(des[j, :, :]) occ_mat_j_root = np.squeeze(occ[j, :, :]) # v loops through nodes for v in range(node_nr): for a in range(max_diam): for b in range(a, max_diam): # M[v,:,:] is M[v]; a = node coordinate in path, b = path length M[v, a, b] += des_mat_j_root[v, b - a] * occ_mat_j_root[v, a] if self.calculate_norm_: out.append((M, attributes, np.sum(attributes**2, axis=1))) else: out.append((M, attributes)) return out
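# Illustrative invocation, assuming the public GraphHopper API of recent
# GraKeL releases (the class name and `normalize` parameter are assumptions):
# node attributes are real-valued vectors, one per vertex.
import numpy as np
from grakel.kernels import GraphHopper

A = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]])
attributes = {0: np.array([1.0, 0.0]),
              1: np.array([0.0, 1.0]),
              2: np.array([1.0, 1.0])}
K = GraphHopper(normalize=True).fit_transform([[A, attributes]])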
def parse_input(self, X): """Parse and create features for the attributed propation kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- local_values : dict A dictionary of pairs between each input graph and a bins where the sampled graphlets have fallen. """ if not isinstance(X, collections.Iterable): raise ValueError('input must be an iterable\n') else: # The number of parsed graphs n = 0 transition_matrix = dict() indexes = [0] Attr = list() for (idx, x) in enumerate(iter(X)): is_iter = isinstance(x, collections.Iterable) if is_iter: x = list(x) if is_iter and len(x) in [0, 2, 3, 4]: if len(x) == 0: warnings.warn('Ignoring empty element on ' + 'index: ' + str(idx)) continue if len(x) == 2 and type(x[0]) is Graph: g, T = x else: g = Graph(x[0], x[1], {}, self._graph_format) if len(x) == 4: T = x[3] else: T = None elif type(x) is Graph: g, T = x, None else: raise ValueError('Each element of X must be either a ' + 'Graph or an iterable with at least 2 ' + 'and at most 4 elements\n') if T is not None: if T.shape[0] != T.shape[1]: raise TypeError('Transition matrix on index' + ' ' + str(idx) + 'must be ' + 'a square matrix.') if T.shape[0] != g.nv(): raise TypeError('Propagation matrix must ' + 'have the same dimension ' + 'as the number of vertices.') else: T = g.get_adjacency_matrix() nv = g.nv() transition_matrix[n] = (T.T / np.sum(T, axis=1)).T attr = g.get_labels(purpose="adjacency") try: attributes = np.array([attr[j] for j in range(nv)]) except TypeError: raise TypeError( 'All attributes of a single graph should have the same dimension.' ) Attr.append(attributes) indexes.append(indexes[-1] + nv) n += 1 try: P = np.vstack(Attr) except ValueError: raise ValueError( 'Attribute dimensions should be the same, for all graphs') if self._method_calling == 1: self._dim = P.shape[1] else: if self._dim != P.shape[1]: raise ValueError('transform attribute vectors should' 'have the same dimension as in fit') if n == 0: raise ValueError('Parsed input is empty') # feature vectors if self._method_calling == 1: # simple normal self._u, self._b, self._hd = list(), list(), list() for t in range(self.t_max): u = self.random_state_.randn(self._dim) if self.take_cauchy_: # cauchy u = np.divide(u, self.random_state_.randn(self._dim)) self._u.append(u) # random offset self._b.append(self.w * self.random_state_.randn(self._dim)) phi = {k: dict() for k in range(n)} for t in range(self.t_max): # for hash all graphs inside P and produce the feature vectors hashes = self.calculate_LSH(P, self._u[t], self._b[t]).tolist() hd = { j: i for i, j in enumerate({tuple(l) for l in hashes}) } self._hd.append(hd) features = np.array([hd[tuple(l)] for l in hashes]) # Accumulate the results. 
for k in range(n): phi[k][t] = Counter( features[indexes[k]:indexes[k + 1]].flat) # calculate the Propagation matrix if needed if t < self.t_max - 1: for k in range(n): start, end = indexes[k:k + 2] P[start:end, :] = np.dot(transition_matrix[k], P[start:end, :]) return [phi[k] for k in range(n)] if self._method_calling == 3: phi = {k: dict() for k in range(n)} for t in range(self.t_max): # for hash all graphs inside P and produce the feature vectors hashes = self.calculate_LSH(P, self._u[t], self._b[t]).tolist() hd = dict( chain( iteritems(self._hd[t]), iter((j, i) for i, j in enumerate( filterfalse(lambda x: x in self._hd[t], {tuple(l) for l in hashes}), len(self._hd[t]))))) features = np.array([hd[tuple(l)] for l in hashes]) # Accumulate the results. for k in range(n): phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]]) # calculate the Propagation matrix if needed if t < self.t_max - 1: for k in range(n): start, end = indexes[k:k + 2] P[start:end, :] = np.dot(transition_matrix[k], P[start:end, :]) return [phi[k] for k in range(n)]
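# A minimal sketch of the hashing scheme assumed behind calculate_LSH above
# (based on the propagation-kernel literature, not on the library source):
# each propagated attribute vector is discretized per dimension, and the
# resulting integer tuple is the vertex's bin signature.
import numpy as np

rng = np.random.RandomState(0)
w, dim = 0.01, 4
u = rng.randn(dim)                   # random projection directions
b = w * rng.rand(dim)                # per-dimension offsets in [0, w)
P = rng.rand(5, dim)                 # 5 propagated attribute vectors

signatures = [tuple(row) for row in np.floor((P * u + b) / w).astype(int)]
# Equal signatures fall into the same Counter bin in parse_input above.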
def parse_input(self, X): """Parse and create features for the propation kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- local_values : dict A dictionary of pairs between each input graph and a bins where the sampled graphlets have fallen. """ if not isinstance(X, collections.Iterable): raise ValueError('input must be an iterable\n') else: i = -1 transition_matrix = dict() labels = set() L = list() for (idx, x) in enumerate(iter(X)): is_iter = isinstance(x, collections.Iterable) if is_iter: x = list(x) if is_iter and len(x) in [0, 2, 3, 4]: if len(x) == 0: warnings.warn('Ignoring empty element on ' + 'index: ' + str(idx)) continue if len(x) == 2 and type(x[0]) is Graph: g, T = x else: g = Graph(x[0], x[1], {}, self._graph_format) if len(x) == 4: T = x[3] else: T = None elif type(x) is Graph: g, T = x, None else: raise ValueError('Each element of X must be either a ' + 'Graph or an iterable with at least 2 ' + 'and at most 4 elements\n') if T is not None: if T.shape[0] != T.shape[1]: raise TypeError('Transition matrix on index' + ' ' + str(idx) + 'must be ' + 'a square matrix.') if T.shape[0] != g.nv(): raise TypeError('Propagation matrix must ' + 'have the same dimension ' + 'as the number of vertices.') else: T = g.get_adjacency_matrix() i += 1 transition_matrix[i] = (T.T / np.sum(T, axis=1)).T label = g.get_labels(purpose='adjacency') try: labels |= set(itervalues(label)) except TypeError: raise TypeError( 'For a non attributed kernel, labels should be hashable.' 
) L.append((g.nv(), label)) if i == -1: raise ValueError('Parsed input is empty') # The number of parsed graphs n = i + 1 # enumerate labels if self._method_calling == 1: enum_labels = {l: i for (i, l) in enumerate(list(labels))} self._enum_labels = enum_labels self._parent_labels = labels elif self._method_calling == 3: new_elements = labels - self._parent_labels if len(new_elements) > 0: new_enum_labels = iter((l, i) for (i, l) in enumerate( list(new_elements), len(self._enum_labels))) enum_labels = dict( chain(iteritems(self._enum_labels), new_enum_labels)) else: enum_labels = self._enum_labels # make a matrix for all graphs that contains label vectors P, data, indexes = dict(), list(), [0] for (k, (nv, label)) in enumerate(L): data += [(indexes[-1] + j, enum_labels[label[j]]) for j in range(nv)] indexes.append(indexes[-1] + nv) # Initialise the on hot vector rows, cols = zip(*data) P = np.zeros(shape=(indexes[-1], len(enum_labels))) P[rows, cols] = 1 dim_orig = len(self._enum_labels) # feature vectors if self._method_calling == 1: # simple normal self._u, self._b, self._hd = list(), list(), list() for t in range(self.t_max): u = self.random_state_.randn(len(enum_labels)) if self.take_cauchy_: # cauchy u = np.divide( u, self.random_state_.randn(len(enum_labels))) self._u.append(u) # random offset self._b.append(self.w * self.random_state_.rand()) phi = {k: dict() for k in range(n)} for t in range(self.t_max): # for hash all graphs inside P and produce the feature vectors hashes = self.calculate_LSH(P, self._u[t], self._b[t]) hd = dict( (j, i) for i, j in enumerate(set(np.unique(hashes)))) self._hd.append(hd) features = np.vectorize(lambda i: hd[i])(hashes) # Accumulate the results. for k in range(n): phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]]) # calculate the Propagation matrix if needed if t < self.t_max - 1: for k in range(n): start, end = indexes[k:k + 2] P[start:end, :] = np.dot(transition_matrix[k], P[start:end, :]) return [phi[k] for k in range(n)] elif (self._method_calling == 3 and dim_orig >= len(enum_labels)): phi = {k: dict() for k in range(n)} for t in range(self.t_max): # for hash all graphs inside P and produce the feature vectors hashes = self.calculate_LSH(P, self._u[t], self._b[t]) hd = dict( chain( iteritems(self._hd[t]), iter((j, i) for i, j in enumerate( filterfalse(lambda x: x in self._hd[t], np.unique(hashes)), len(self._hd[t]))))) features = np.vectorize(lambda i: hd[i])(hashes) # Accumulate the results. 
for k in range(n): phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]]) # calculate the Propagation matrix if needed if t < self.t_max - 1: for k in range(n): start, end = indexes[k:k + 2] P[start:end, :] = np.dot(transition_matrix[k], P[start:end, :]) return [phi[k] for k in range(n)] else: cols = np.array(cols) vertices = np.where(cols < dim_orig)[0] vertices_p = np.where(cols >= dim_orig)[0] nnv = len(enum_labels) - dim_orig phi = {k: dict() for k in range(n)} for t in range(self.t_max): # hash all graphs inside P and produce the feature vectors hashes = self.calculate_LSH(P[vertices, :dim_orig], self._u[t], self._b[t]) hd = dict( chain( iteritems(self._hd[t]), iter((j, i) for i, j in enumerate( filterfalse(lambda x: x in self._hd[t], np.unique(hashes)), len(self._hd[t]))))) features = np.vectorize(lambda i: hd[i], otypes=[int])(hashes) # for each the new labels graph hash P and produce the feature vectors u = self.random_state_.randn(nnv) if self.take_cauchy_: # cauchy u = np.divide(u, self.random_state_.randn(nnv)) u = np.hstack((self._u[t], u)) # calculate hashes for the remaining hashes = self.calculate_LSH(P[vertices_p, :], u, self._b[t]) hd = dict( chain( iteritems(hd), iter((j, i) for i, j in enumerate(hashes, len(hd))))) features_p = np.vectorize(lambda i: hd[i], otypes=[int])(hashes) # Accumulate the results for k in range(n): A = Counter(features[np.logical_and( indexes[k] <= vertices, vertices <= indexes[k + 1])]) B = Counter(features_p[np.logical_and( indexes[k] <= vertices_p, vertices_p <= indexes[k + 1])]) phi[k][t] = A + B # calculate the Propagation matrix if needed if t < self.t_max - 1: for k in range(n): start, end = indexes[k:k + 2] P[start:end, :] = np.dot(transition_matrix[k], P[start:end, :]) Q = np.all(P[:, dim_orig:] > 0, axis=1) vertices = np.where(~Q)[0] vertices_p = np.where(Q)[0] return [phi[k] for k in range(n)]
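# Standalone sketch of the propagation step used above: a one-hot label
# matrix P is repeatedly smoothed with the row-normalized transition
# matrix. The toy graph and the t_max value are illustrative only.
import numpy as np

A = np.array([[0., 1., 1.],
              [1., 0., 0.],
              [1., 0., 0.]])
T = (A.T / np.sum(A, axis=1)).T      # row-normalized transition matrix
P = np.eye(2)[[0, 1, 0]]             # one-hot labels of the 3 vertices

for t in range(3):                   # t_max = 3 propagation rounds
    # ... hash and count the rows of P here, as parse_input does ...
    P = np.dot(T, P)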
def parse_input(self, X): """Parse and create features for pyramid_match kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- H : list A list of lists of Histograms for all levels for each graph. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: i = 0 Us = [] if self.with_labels: Ls = [] for (idx, x) in enumerate(iter(X)): is_iter = isinstance(x, collections.Iterable) if is_iter: x = list(x) if is_iter and (len(x) == 0 or (len(x) >= 1 and not self.with_labels) or (len(x) >= 2 and self.with_labels)): if len(x) == 0: warnings.warn('Ignoring empty element on index: ' + str(idx)) continue elif not self.with_labels: x = Graph(x[0], {}, {}, self._graph_format) else: x = Graph(x[0], x[1], {}, self._graph_format) elif not type(x) is Graph: raise TypeError( 'each element of X must be either a graph object or a list with ' 'at least a graph like object and node labels dict \n') A = x.get_adjacency_matrix() if self.with_labels: L = x.get_labels(purpose="adjacency") i += 1 if A.shape[0] == 0: Us.append(np.zeros((1, self.d))) else: # Perform eigenvalue decomposition. # Rows of matrix U correspond to vertex representations # Embed vertices into the d-dimensional space if A.shape[0] > self.d + 1: # If size of graph smaller than d, pad with zeros Lambda, U = eigs(csr_matrix(A, dtype=np.float), k=self.d, ncv=10 * self.d) idx = Lambda.argsort()[::-1] U = U[:, idx] else: Lambda, U = np.linalg.eig(A) idx = Lambda.argsort()[::-1] U = U[:, idx] U = U[:, :self.d] # Replace all components by their absolute values U = np.absolute(U) Us.append((A.shape[0], U)) if self.with_labels: Ls.append(L) if i == 0: raise ValueError('parsed input is empty') if self.with_labels: # Map labels to values between 0 and |L|-1 # where |L| is the number of distinct labels if self._method_calling in [1, 2]: self._num_labels = 0 self._labels = set() for L in Ls: self._labels |= set(itervalues(L)) self._num_labels = len(self._labels) self._labels = {l: i for (i, l) in enumerate(self._labels)} return self._histogram_calculation(Us, Ls, self._labels) elif self._method_calling == 3: labels = set() for L in Ls: labels |= set(itervalues(L)) rest_labels = labels - set(self._labels.keys()) nouveau_labels = dict( chain(iteritems(self._labels), ((j, i) for ( i, j) in enumerate(rest_labels, len(self._labels))))) return self._histogram_calculation(Us, Ls, nouveau_labels) else: return self._histogram_calculation(Us)
def parse_input(self, X): """Fast ML Graph Kernel. See supplementary material :cite:`kondor2016multiscale`, algorithm 1. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- out : list A list of tuples with S matrices inverses and their 4th-root determinants. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: ng = 0 out = list() data = dict() neighborhoods = dict() for (idx, x) in enumerate(iter(X)): is_iter = False if isinstance(x, collections.Iterable): is_iter, x = True, list(x) if is_iter and len(x) in [0, 2, 3]: if len(x) == 0: warnings.warn('Ignoring empty element ' + 'on index: ' + str(idx)) continue else: x = Graph(x[0], x[1], {}, self._graph_format) elif type(x) is Graph: x.desired_format(self._graph_format) else: raise TypeError('each element of X must be either a ' 'graph or an iterable with at least 1 ' 'and at most 3 elements\n') phi_d = x.get_labels() A = x.get_adjacency_matrix() try: phi = np.array([list(phi_d[i]) for i in range(A.shape[0])]) except TypeError: raise TypeError('Features must be iterable and castable ' 'in total to a numpy array.') Lap = laplacian(A).astype(float) _increment_diagonal_(Lap, self.heta) data[ng] = {0: A, 1: phi, 2: inv(Lap)} neighborhoods[ng] = x ng += 1 if ng == 0: raise ValueError('parsed input is empty') # Define a function for calculating the S's of subgraphs of each iteration def calculate_C(k, j, l): if type(neighborhoods[k]) is Graph: neighborhoods[k] = neighborhoods[k].produce_neighborhoods( r=self.L, sort_neighbors=False) indexes = neighborhoods[k][l][j] L = laplacian(data[k][0][indexes, :][:, indexes]).astype(float) _increment_diagonal_(L, self.heta) U = data[k][1][indexes, :] S = multi_dot((U.T, inv(L), U)) _increment_diagonal_(S, self.gamma) return (inv(S), np.sum(np.log(np.real(eigvals(S))))) if self._method_calling == 1: V = [(k, j) for k in range(ng) for j in range(data[k][0].shape[0])] ns = min(len(V), self.n_samples) self.random_state_.shuffle(V) vs = V[:ns] phi_k = np.array([data[k][1][j, :] for (k, j) in vs]) # w the eigen vectors, v the eigenvalues K = phi_k.dot(phi_k.T) # Calculate eigenvalues v, w = eig(K) v, w = np.real(v), np.real(w.T) # keep only the positive vpos = np.argpartition(v, -self.P)[-self.P:] vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)] # ksi.shape = (k, Ns) * (Ns, P) ksi = w[vpos].dot(phi_k).T / np.sqrt(v[vpos]) for j in range(ng): # (n_samples, k) * (k, P) data[j][1] = data[j][1].dot(ksi) self._data_level = {0: ksi} for l in range(1, self.L + 1): # Take random samples from all the vertices of all graphs self.random_state_.shuffle(V) vs = V[:ns] # Compute the reference subsampled Gram matrix K_proj = { k: np.zeros(shape=(data[k][0].shape[0], ns)) for k in range(ng) } K, C = np.zeros(shape=(len(vs), len(vs))), dict() for (m, (k, j)) in enumerate(vs): C[m] = calculate_C(k, j, l) K_proj[k][j, m] = K[m, m] = self.pairwise_operation( C[m], C[m]) for (s, (k2, j2)) in enumerate(vs): if s < m: K[s, m] = K[m, s] \ = K_proj[k2][j2, m] \ = K_proj[k][j, s] \ = self.pairwise_operation(C[s], C[m]) else: break # Compute the kernels of the relations of the reference to everything else for (k, j) in 
V[ns:]: for (m, _) in enumerate(vs): K_proj[k][j, m] = self.pairwise_operation( C[m], calculate_C(k, j, l)) # w the eigen vectors, v the eigenvalues v, w = eig(K) v, w = np.real(v), np.real(w.T) # keep only the positive vpos = np.argpartition(v, -self.P)[-self.P:] vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)] # Q shape=(k, P) Q = w[vpos].T / np.sqrt(v[vpos]) for j in range(ng): # (n, ns) * (ns, P) data[j][1] = K_proj[j].dot(Q) self._data_level[l] = (C, Q) elif self._method_calling == 3: ksi = self._data_level[0] for j in range(ng): # (n, k) * (k, P) data[j][1] = data[j][1].dot(ksi) for l in range(1, self.L + 1): C, Q = self._data_level[l] for j in range(ng): K_proj = np.zeros(shape=(data[j][0].shape[0], len(C))) for n in range(data[j][0].shape[0]): for m in range(len(C)): K_proj[n, m] = self.pairwise_operation( C[m], calculate_C(j, n, l)) data[j][1] = K_proj.dot(Q) # Apply the final calculation of S. for k in range(ng): S = multi_dot((data[k][1].T, data[k][2], data[k][1])) _increment_diagonal_(S, self.gamma) out.append((inv(S), np.sum(np.log(np.real(eigvals(S)))))) return out
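# The regularizations above rely on the private helper _increment_diagonal_;
# a plausible minimal sketch of it (an assumption; the real helper lives in
# the library internals):
import numpy as np

def _increment_diagonal_(M, value):
    """Add `value` to the diagonal of the square matrix M, in place."""
    M[np.diag_indices_from(M)] += value

L = np.array([[1., -1.], [-1., 1.]])   # a singular graph Laplacian
_increment_diagonal_(L, 0.1)           # now safely invertible
Linv = np.linalg.inv(L)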
def parse_input(self, X): """Parse and create features for multiscale_laplacian kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- out : list Tuples consisting of the Adjacency matrix, phi, phi_outer dictionary of neihborhood indexes and inverse laplacians up to level self.L and the inverse Laplacian of A. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: ng = 0 out = list() start = time.time() for (idx, x) in enumerate(iter(X)): is_iter = False if isinstance(x, collections.Iterable): is_iter, x = True, list(x) if is_iter and len(x) in [0, 2, 3]: if len(x) == 0: warnings.warn('Ignoring empty element ' + 'on index: ' + str(idx)) continue else: x = Graph(x[0], x[1], {}, self._graph_format) elif type(x) is not Graph: x.desired_format(self._graph_format) else: raise TypeError('each element of X must be either a ' + 'graph or an iterable with at least 1 ' + 'and at most 3 elements\n') ng += 1 phi_d = x.get_labels() A = x.get_adjacency_matrix() N = x.produce_neighborhoods(r=self.L, sort_neighbors=False) try: phi = np.array([list(phi_d[i]) for i in range(A.shape[0])]) except TypeError: raise TypeError('Features must be iterable and castable ' + 'in total to a numpy array.') phi_outer = np.dot(phi, phi.T) Lap = laplacian(A).astype(float) _increment_diagonal_(Lap, self.heta) L = inv(Lap) Q = dict() for level in range(1, self.L + 1): Q[level] = dict() for (key, item) in iteritems(N[level]): Q[level][key] = dict() Q[level][key]["n"] = np.array(item) if len(item) < A.shape[0]: laplac = laplacian(A[item, :][:, item]).astype(float) _increment_diagonal_(laplac, self.heta) laplac = inv(laplac) else: laplac = L Q[level][key]["l"] = laplac out.append((A, phi, phi_outer, Q, L)) if self.verbose: print("Preprocessing took:", time.time() - start, "s.") if ng == 0: raise ValueError('parsed input is empty') return out
def parse_input(self, X): """Parse and create features for lovasz_theta kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- out : list The lovasz metrics for the given input. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: i = 0 adjm = list() max_dim = 0 for (idx, x) in enumerate(iter(X)): is_iter = False if isinstance(x, collections.Iterable): x, is_iter = list(x), True if is_iter and len(x) in [0, 1, 2, 3]: if len(x) == 0: warnings.warn('Ignoring empty element ' + 'on index: ' + str(idx)) continue else: x = Graph(x[0], {}, {}, self._graph_format) elif type(x) is not Graph: raise TypeError('each element of X must be either a ' + 'graph or an iterable with at least 1 ' + 'and at most 3 elements\n') i += 1 A = x.get_adjacency_matrix() adjm.append(A) max_dim = max(max_dim, A.shape[0]) if self._method_calling == 1: if self.d_ is None: self.d_ = max_dim + 1 if self.d_ < max_dim + 1: if self.max_dim is None and self._method_calling == 3: raise ValueError( 'Maximum dimension of a graph in transform is bigger ' 'than the one found in fit. To avoid that use max_dim parameter.' ) else: raise ValueError('max_dim should correspond to the ' 'biggest graph inside the dataset') out = list() for A in adjm: X, t = _calculate_lovasz_embeddings_(A) U = _calculate_lovasz_labelling_(X, t, self.d_) out.append(self._calculate_MEC_(U)) if i == 0: raise ValueError('parsed input is empty') return out