def fit(self, X, y=None):
    """Fit a dataset, for a transformer.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable
        with two or three entries: a valid graph structure (adjacency
        matrix or edge dictionary), the node labels and, optionally,
        the edge labels (the third entry is not used here).
        Empty elements and elements that carry no node labels are
        skipped with a warning. The train samples.

    y : None
        There is no need of a target in a transformer, yet the pipeline
        API requires this parameter.

    Returns
    -------
    self : object
        Returns self.

    Raises
    ------
    ValueError
        If `X` is None or if no graph could be parsed out of `X`.
    TypeError
        If `X` is not iterable or contains an element that is neither a
        ``Graph`` nor an iterable of at most three entries.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    self._method_calling = 1
    self._is_transformed = False

    # Input validation and parsing
    self.initialize()
    if X is None:
        raise ValueError('`fit` input cannot be None')
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    i = 0
    out = list()
    gs = list()
    self._labels_hash_dict, labels_hash_set = dict(), set()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and len(x) in [0, 1, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            elif len(x) == 1:
                # Without node labels there is nothing to hash: skip the
                # element instead of falling through with `vertices` and
                # `Labels` undefined (or left over from a previous graph).
                warnings.warn('Ignoring empty element on index: '
                              + str(idx) + '\nLabels must be provided.')
                continue
            else:
                x = Graph(x[0], x[1], {}, self._graph_format)
                vertices = list(x.get_vertices(purpose="any"))
                Labels = x.get_labels(purpose="any")
        elif type(x) is Graph:
            vertices = list(x.get_vertices(purpose="any"))
            Labels = x.get_labels(purpose="any")
        else:
            raise TypeError('each element of X must be either '
                            'a graph object or a list with at '
                            'least a graph like object and '
                            'node labels dict \n')

        g = (vertices, Labels,
             {n: x.neighbors(n, purpose="any") for n in vertices})

        # collect all the labels
        labels_hash_set |= set(itervalues(Labels))
        gs.append(g)
        i += 1

    if i == 0:
        raise ValueError('parsed input is empty')

    # Hash labels
    if len(labels_hash_set) > self._max_number:
        warnings.warn('Number of labels is larger than '
                      'the biggest possible. '
                      'Collisions will appear on the '
                      'new labels.')
        # More labels than available hash values: draw full permutations
        # of the hash range until fewer than `_max_number` labels remain,
        # then draw the remainder without replacement.
        nl, nrl = list(), len(labels_hash_set)
        while nrl > self._max_number:
            nl += self.random_state_.choice(
                self._max_number, self._max_number,
                replace=False).tolist()
            nrl -= self._max_number
        if nrl > 0:
            nl += self.random_state_.choice(
                self._max_number, nrl, replace=False).tolist()
    else:
        # Enough room: draw one distinct random number per label.
        nl = self.random_state_.choice(
            self._max_number, len(labels_hash_set),
            replace=False).tolist()

    self._labels_hash_dict = dict(zip(labels_hash_set, nl))

    # for all graphs
    for vertices, labels, neighbors in gs:
        new_labels = {v: self._labels_hash_dict[l]
                      for v, l in iteritems(labels)}
        g = (vertices, new_labels, neighbors,)
        # Apply the neighborhood hash repeatedly, keeping every level.
        gr = {0: self.NH_(g)}
        for r in range(1, self.R):
            gr[r] = self.NH_(gr[r - 1])

        # save the output for all levels
        out.append(gr)

    self.X = out

    # Return the transformer
    return self
def parse_input(self, X):
    """Parse input and create features, while initializing and/or calculating sub-kernels.

    Every graph is decomposed by core number; for each core level
    between the maximum core number found and `self.min_core`
    (exclusive) the induced sub-graphs are handed to a base kernel.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable
        with at least one entry: a valid graph structure (adjacency
        matrix or edge dictionary), optionally followed by node labels
        and edge labels. Entries after the third are kept as "extra"
        arguments and forwarded to the base kernel together with the
        sub-graph. Empty elements are skipped with a warning.

    Returns
    -------
    base_graph_kernel : dict
        The fitted base kernels indexed by core level. Returned alone
        when `self._method_calling == 1` and as second item of a tuple
        when `self._method_calling == 2`.

    K : np.array
        The kernel matrix. Returned as first item of a tuple when
        `self._method_calling == 2` and alone when
        `self._method_calling == 3`.

    Raises
    ------
    TypeError
        If `X` is not iterable or contains an unsupported element.
    ValueError
        If no graph could be parsed, or the maximum core number does not
        exceed `self.min_core`.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    # Input validation and parsing
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    nx, max_core_number, core_numbers, graphs = 0, 0, [], []
    for (idx, x) in enumerate(iter(X)):
        is_iter = False
        extra = tuple()
        if isinstance(x, Iterable):
            x, is_iter = list(x), True
        if is_iter:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            elif len(x) == 1:
                x = Graph(x[0], {}, {}, graph_format="adjacency")
            elif len(x) == 2:
                x = Graph(x[0], x[1], {}, graph_format="adjacency")
            else:
                if len(x) > 3:
                    extra += tuple(x[3:])
                x = Graph(x[0], x[1], x[2], graph_format="adjacency")
        elif type(x) is Graph:
            x.desired_format("adjacency")
            x = Graph(
                x.get_adjacency_matrix(),
                x.get_labels(purpose="adjacency", label_type="vertex",
                             return_none=True),
                x.get_labels(purpose="adjacency", label_type="edge",
                             return_none=True))
        else:
            raise TypeError('each element of X must be either a '
                            'graph object or a list with at least '
                            'a graph like object and node labels '
                            'dict \n')

        # workaround for leaving a sparse representation for x
        x.change_format(self._graph_format)
        c = core_number(x)
        max_core_number = max(max_core_number, max(c.values()))
        core_numbers.append(c)
        graphs.append((x, extra))
        nx += 1

    if nx == 0:
        raise ValueError('parsed input is empty')

    if max_core_number <= self.min_core:
        raise ValueError(
            'The maximum core equals the min_core boundary set in init.')

    # Add the zero iteration element
    if self._method_calling == 2:
        K = np.zeros(shape=(nx, nx))
    elif self._method_calling == 3:
        self._dummy_kernel = dict()
        K = np.zeros(shape=(nx, self._nx))

    # Main
    base_graph_kernel, indexes_list = dict(), dict()
    for i in range(max_core_number, self.min_core, -1):
        subgraphs, indexes = list(), list()
        for (idx, (cn, (g, extra))) in enumerate(zip(core_numbers, graphs)):
            vertices = [k for k, v in iteritems(cn) if v >= i]
            if len(vertices) == 0:
                continue

            # Calculate subgraph and store the index of the graphs
            # that still have vertices on this core level
            sg = g.get_subgraph(vertices)
            indexes.append(idx)
            if len(extra) > 0:
                vs = np.array(sg.get_vertices(purpose='any'))
                sub_extra = list()
                for e in extra:
                    # This case will only be reached by now if the user
                    # adds the propagation kernel as subkernel with a
                    # custom propagation matrix. This is a workaround!
                    # `np.array` is a function, not a type, so the former
                    # `type(e) is np.array` test could never succeed.
                    # NOTE(review): assumes `vs` indexes rows/columns of
                    # the original matrix - confirm with get_subgraph.
                    if isinstance(e, np.ndarray) and len(e.shape) == 2:
                        e = e[vs, :][:, vs]
                    sub_extra.append(e)
                subgraphs.append((sg, ) + tuple(sub_extra))
            else:
                subgraphs.append(sg)

        indexes = np.array(indexes)
        indexes_list[i] = indexes

        # calculate kernel
        if self._method_calling == 1 and indexes.shape[0] > 0:
            base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
            base_graph_kernel[i].fit(subgraphs)
        elif self._method_calling == 2 and indexes.shape[0] > 0:
            base_graph_kernel[i] = self.base_graph_kernel_(**self.params_)
            ft_subgraph_mat = base_graph_kernel[i].fit_transform(subgraphs)
            # Scatter the sub-kernel values back to the rows/columns of
            # the graphs that took part on this level.
            for j in range(indexes.shape[0]):
                K[indexes[j], indexes] += ft_subgraph_mat[j, :]
        elif self._method_calling == 3:
            if (self._max_core_number < i or
                    self._fit_indexes[i].shape[0] == 0):
                if len(indexes) > 0:
                    # add a dummy kernel for calculating the diagonal
                    self._dummy_kernel[i] = self.base_graph_kernel_(
                        **self.params_)
                    self._dummy_kernel[i].fit(subgraphs)
            elif indexes.shape[0] > 0:
                subgraph_tmat = self.X[i].transform(subgraphs)
                for j in range(indexes.shape[0]):
                    K[indexes[j], self._fit_indexes[i]] += \
                        subgraph_tmat[j, :]

    if self._method_calling == 1:
        self._nx = nx
        self._max_core_number = max_core_number
        self._fit_indexes = indexes_list
        return base_graph_kernel
    elif self._method_calling == 2:
        self._nx = nx
        self._max_core_number = max_core_number
        self._fit_indexes = indexes_list
        return K, base_graph_kernel
    elif self._method_calling == 3:
        self._t_nx = nx
        self._max_core_number_trans = max_core_number
        self._transform_indexes = indexes_list
        return K
def parse_input(self, X):
    """Parse and create features for the attributed propagation kernel.

    Parameters
    ----------
    X : iterable
        Each element must be one of:

        * a ``Graph`` object,
        * a pair ``(Graph, T)`` where `T` is a transition matrix,
        * an iterable of 2, 3 or 4 entries: a valid graph structure
          (adjacency matrix or edge dictionary), the node attributes,
          an entry that is not used here and, as the fourth entry, a
          transition matrix.

        When no transition matrix is given the adjacency matrix is used.
        Empty elements are skipped with a warning.

    Returns
    -------
    phi : list
        One dictionary per parsed graph, mapping every propagation step
        ``t`` in ``range(self.t_max)`` to a ``Counter`` of the hash bins
        in which the vertices of that graph fell. Only produced when
        `self._method_calling` is 1 or 3.

    Raises
    ------
    ValueError
        If `X` is not iterable, contains an unsupported element, is
        empty, or the attribute dimensions are inconsistent.
    TypeError
        If a transition matrix is not square or does not match the
        number of vertices, or the attributes of a graph cannot be
        stacked.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise ValueError('input must be an iterable\n')

    # The number of parsed graphs
    n = 0
    transition_matrix = dict()
    # indexes[k]:indexes[k + 1] are the rows of graph k inside P
    indexes = [0]
    Attr = list()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and len(x) in [0, 2, 3, 4]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on ' +
                              'index: ' + str(idx))
                continue
            if len(x) == 2 and type(x[0]) is Graph:
                g, T = x
            else:
                g = Graph(x[0], x[1], {}, self._graph_format)
                T = x[3] if len(x) == 4 else None
        elif type(x) is Graph:
            g, T = x, None
        else:
            raise ValueError('Each element of X must be either a ' +
                             'Graph or an iterable with at least 2 ' +
                             'and at most 4 elements\n')

        if T is not None:
            if T.shape[0] != T.shape[1]:
                raise TypeError('Transition matrix on index ' +
                                str(idx) + ' must be ' +
                                'a square matrix.')
            if T.shape[0] != g.nv():
                raise TypeError('Propagation matrix must ' +
                                'have the same dimension ' +
                                'as the number of vertices.')
        else:
            T = g.get_adjacency_matrix()

        nv = g.nv()
        # Row-normalise T
        transition_matrix[n] = (T.T / np.sum(T, axis=1)).T
        attr = g.get_labels(purpose="adjacency")
        try:
            attributes = np.array([attr[j] for j in range(nv)])
        except TypeError:
            raise TypeError(
                'All attributes of a single graph should have the '
                'same dimension.')
        Attr.append(attributes)
        indexes.append(indexes[-1] + nv)
        n += 1

    # Check emptiness before stacking: `np.vstack([])` raises a
    # ValueError that would otherwise be reported as a dimension mismatch.
    if n == 0:
        raise ValueError('Parsed input is empty')

    try:
        P = np.vstack(Attr)
    except ValueError:
        raise ValueError(
            'Attribute dimensions should be the same, for all graphs')

    if self._method_calling == 1:
        self._dim = P.shape[1]
    elif self._dim != P.shape[1]:
        raise ValueError('transform attribute vectors should '
                         'have the same dimension as in fit')

    # feature vectors
    if self._method_calling == 1:
        # simple normal
        self._u, self._b, self._hd = list(), list(), list()
        for t in range(self.t_max):
            u = self.random_state_.randn(self._dim)
            if self.take_cauchy_:
                # cauchy
                u = np.divide(u, self.random_state_.randn(self._dim))

            self._u.append(u)
            # random offset
            self._b.append(self.w * self.random_state_.randn(self._dim))

        phi = {k: dict() for k in range(n)}
        for t in range(self.t_max):
            # hash all graphs inside P and produce the feature vectors
            hashes = self.calculate_LSH(P, self._u[t], self._b[t]).tolist()

            # Enumerate the distinct hash rows seen at fit time
            hd = {j: i for i, j in enumerate({tuple(l) for l in hashes})}
            self._hd.append(hd)

            features = np.array([hd[tuple(l)] for l in hashes])

            # Accumulate the results.
            for k in range(n):
                phi[k][t] = Counter(
                    features[indexes[k]:indexes[k + 1]].flat)

            # calculate the Propagation matrix if needed
            if t < self.t_max - 1:
                for k in range(n):
                    start, end = indexes[k:k + 2]
                    P[start:end, :] = np.dot(transition_matrix[k],
                                             P[start:end, :])

        return [phi[k] for k in range(n)]

    if self._method_calling == 3:
        phi = {k: dict() for k in range(n)}
        for t in range(self.t_max):
            # hash all graphs inside P and produce the feature vectors
            hashes = self.calculate_LSH(P, self._u[t], self._b[t]).tolist()

            # Keep the enumeration made at fit time and append fresh
            # indexes for the hash rows that were not seen back then.
            known = self._hd[t]
            unseen = filterfalse(lambda h: h in known,
                                 {tuple(l) for l in hashes})
            hd = dict(chain(
                iteritems(known),
                ((h, c) for c, h in enumerate(unseen, len(known)))))

            features = np.array([hd[tuple(l)] for l in hashes])

            # Accumulate the results.
            for k in range(n):
                phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]])

            # calculate the Propagation matrix if needed
            if t < self.t_max - 1:
                for k in range(n):
                    start, end = indexes[k:k + 2]
                    P[start:end, :] = np.dot(transition_matrix[k],
                                             P[start:end, :])

        return [phi[k] for k in range(n)]
def transform(self, X):
    """Calculate the kernel matrix, between given and fitted dataset.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable
        with two or three entries: a valid graph structure (adjacency
        matrix or edge dictionary), the node labels and, optionally,
        the edge labels (the third entry is not used here).
        Empty elements and elements that carry no node labels are
        skipped with a warning. The test samples.

    Returns
    -------
    K : numpy array, shape = [n_targets, n_input_graphs]
        corresponding to the kernel matrix, a calculation between
        all pairs of graphs between target an features

    Raises
    ------
    ValueError
        If `X` is None or if no graph could be parsed out of `X`.
    TypeError
        If `X` is not iterable or contains an element that is neither a
        ``Graph`` nor an iterable of at most three entries.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    self._method_calling = 3
    # Check if fit had been called
    check_is_fitted(self, ['X'])

    # Input validation and parsing
    if X is None:
        raise ValueError('`transform` input cannot be None')
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    i = 0
    out = list()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and len(x) in [0, 1, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            elif len(x) == 1:
                # Without node labels there is nothing to hash: skip the
                # element instead of falling through with `vertices` and
                # `Labels` undefined (or left over from a previous graph).
                warnings.warn('Ignoring empty element on index: '
                              + str(idx) + '\nLabels must be provided.')
                continue
            else:
                x = Graph(x[0], x[1], {}, self._graph_format)
                vertices = list(x.get_vertices(purpose="any"))
                Labels = x.get_labels(purpose="any")
        elif type(x) is Graph:
            vertices = list(x.get_vertices(purpose="any"))
            Labels = x.get_labels(purpose="any")
        else:
            raise TypeError('each element of X must be either '
                            'a graph object or a list with at '
                            'least a graph like object and '
                            'node labels dict \n')

        # Hash based on the labels of fit; labels that were not seen
        # during fit are mapped to None.
        new_labels = {v: self._labels_hash_dict.get(l, None)
                      for v, l in iteritems(Labels)}

        g = ((vertices, new_labels) +
             ({n: x.neighbors(n, purpose="any") for n in vertices},))

        # Apply the neighborhood hash repeatedly, keeping every level.
        gr = {0: self.NH_(g)}
        for r in range(1, self.R):
            gr[r] = self.NH_(gr[r - 1])

        # save the output for all levels
        out.append(gr)
        i += 1

    if i == 0:
        raise ValueError('parsed input is empty')

    # Transform - calculate kernel matrix
    # Output is always normalized
    km = self._calculate_kernel_matrix(out)
    self._is_transformed = True
    return km
def parse_input(self, X):
    """Parse and create features for pyramid_match kernel.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable
        whose first entry is a valid graph structure (adjacency matrix
        or edge dictionary). When `self.with_labels` is set, the second
        entry must hold the node labels. Further entries are ignored.
        Empty elements are skipped with a warning.

    Returns
    -------
    H : list
        The result of `self._histogram_calculation` on the vertex
        embeddings (and on the labels when `self.with_labels` is set).

    Raises
    ------
    TypeError
        If `X` is not iterable or contains an unsupported element.
    ValueError
        If no graph could be parsed out of `X`.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    i = 0
    Us = []
    if self.with_labels:
        Ls = []
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and (len(x) == 0 or
                        (len(x) >= 1 and not self.with_labels) or
                        (len(x) >= 2 and self.with_labels)):
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            elif not self.with_labels:
                x = Graph(x[0], {}, {}, self._graph_format)
            else:
                x = Graph(x[0], x[1], {}, self._graph_format)
        elif not type(x) is Graph:
            raise TypeError(
                'each element of X must be either a graph object or a '
                'list with at least a graph like object and node labels '
                'dict \n')

        A = x.get_adjacency_matrix()
        if self.with_labels:
            L = x.get_labels(purpose="adjacency")
        i += 1
        if A.shape[0] == 0:
            # NOTE(review): the non-empty branch appends a (size, U)
            # tuple while this one appends a bare array - confirm that
            # `_histogram_calculation` copes with both.
            Us.append(np.zeros((1, self.d)))
        else:
            # Perform eigenvalue decomposition.
            # Rows of matrix U correspond to vertex representations
            # Embed vertices into the d-dimensional space
            if A.shape[0] > self.d + 1:
                # Large enough graph: ask only for d eigenpairs.
                # `np.float` was removed in NumPy 1.24; the builtin
                # `float` is the same dtype.
                Lambda, U = eigs(csr_matrix(A, dtype=float),
                                 k=self.d, ncv=10 * self.d)
                order = Lambda.argsort()[::-1]
                U = U[:, order]
            else:
                # Small graph: full decomposition, keep at most d columns
                Lambda, U = np.linalg.eig(A)
                order = Lambda.argsort()[::-1]
                U = U[:, order]
                U = U[:, :self.d]
            # Replace all components by their absolute values
            U = np.absolute(U)
            Us.append((A.shape[0], U))
        if self.with_labels:
            Ls.append(L)

    if i == 0:
        raise ValueError('parsed input is empty')

    if self.with_labels:
        # Map labels to values between 0 and |L|-1
        # where |L| is the number of distinct labels
        if self._method_calling in [1, 2]:
            self._num_labels = 0
            self._labels = set()
            for L in Ls:
                self._labels |= set(itervalues(L))
            self._num_labels = len(self._labels)
            self._labels = {l: i for (i, l) in enumerate(self._labels)}
            return self._histogram_calculation(Us, Ls, self._labels)

        elif self._method_calling == 3:
            labels = set()
            for L in Ls:
                labels |= set(itervalues(L))
            # Labels unseen at fit time get fresh indexes placed after
            # the fitted ones.
            rest_labels = labels - set(self._labels.keys())
            nouveau_labels = dict(
                chain(iteritems(self._labels),
                      ((j, i) for (i, j) in enumerate(
                          rest_labels, len(self._labels)))))
            return self._histogram_calculation(Us, Ls, nouveau_labels)
    else:
        return self._histogram_calculation(Us)
def parse_input(self, X):
    """Parse and create features for the propagation kernel.

    Parameters
    ----------
    X : iterable
        Each element must be one of:

        * a ``Graph`` object,
        * a pair ``(Graph, T)`` where `T` is a transition matrix,
        * an iterable of 2, 3 or 4 entries: a valid graph structure
          (adjacency matrix or edge dictionary), the node labels,
          an entry that is not used here and, as the fourth entry, a
          transition matrix.

        When no transition matrix is given the adjacency matrix is used.
        Empty elements are skipped with a warning.

    Returns
    -------
    phi : list
        One dictionary per parsed graph, mapping every propagation step
        ``t`` in ``range(self.t_max)`` to a ``Counter`` of the hash bins
        in which the vertices of that graph fell.

    Raises
    ------
    ValueError
        If `X` is not iterable, contains an unsupported element or no
        graph could be parsed out of it.
    TypeError
        If a transition matrix is not square or does not match the
        number of vertices, or the labels are not hashable.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise ValueError('input must be an iterable\n')

    i = -1
    transition_matrix = dict()
    labels = set()
    L = list()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and len(x) in [0, 2, 3, 4]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on ' +
                              'index: ' + str(idx))
                continue
            if len(x) == 2 and type(x[0]) is Graph:
                g, T = x
            else:
                g = Graph(x[0], x[1], {}, self._graph_format)
                T = x[3] if len(x) == 4 else None
        elif type(x) is Graph:
            g, T = x, None
        else:
            raise ValueError('Each element of X must be either a ' +
                             'Graph or an iterable with at least 2 ' +
                             'and at most 4 elements\n')

        if T is not None:
            if T.shape[0] != T.shape[1]:
                raise TypeError('Transition matrix on index ' +
                                str(idx) + ' must be ' +
                                'a square matrix.')
            if T.shape[0] != g.nv():
                raise TypeError('Propagation matrix must ' +
                                'have the same dimension ' +
                                'as the number of vertices.')
        else:
            T = g.get_adjacency_matrix()

        i += 1
        # Row-normalise T
        transition_matrix[i] = (T.T / np.sum(T, axis=1)).T
        label = g.get_labels(purpose='adjacency')
        try:
            labels |= set(itervalues(label))
        except TypeError:
            raise TypeError(
                'For a non attributed kernel, labels should be hashable.')
        L.append((g.nv(), label))

    if i == -1:
        raise ValueError('Parsed input is empty')

    # The number of parsed graphs
    n = i + 1

    # enumerate labels
    if self._method_calling == 1:
        enum_labels = {l: i for (i, l) in enumerate(list(labels))}
        self._enum_labels = enum_labels
        self._parent_labels = labels
    elif self._method_calling == 3:
        new_elements = labels - self._parent_labels
        if len(new_elements) > 0:
            # Labels unseen at fit time are enumerated after the fitted
            # ones, so the fitted columns keep their positions.
            new_enum_labels = iter((l, i) for (i, l) in enumerate(
                list(new_elements), len(self._enum_labels)))
            enum_labels = dict(
                chain(iteritems(self._enum_labels), new_enum_labels))
        else:
            enum_labels = self._enum_labels

    # make a matrix for all graphs that contains label vectors;
    # indexes[k]:indexes[k + 1] are the rows of graph k inside P
    data, indexes = list(), [0]
    for (nv, label) in L:
        data += [(indexes[-1] + j, enum_labels[label[j]])
                 for j in range(nv)]
        indexes.append(indexes[-1] + nv)

    # Initialise the one-hot vectors
    rows, cols = zip(*data)
    P = np.zeros(shape=(indexes[-1], len(enum_labels)))
    P[rows, cols] = 1
    dim_orig = len(self._enum_labels)

    # feature vectors
    if self._method_calling == 1:
        # simple normal
        self._u, self._b, self._hd = list(), list(), list()
        for t in range(self.t_max):
            u = self.random_state_.randn(len(enum_labels))

            if self.take_cauchy_:
                # cauchy
                u = np.divide(
                    u, self.random_state_.randn(len(enum_labels)))

            self._u.append(u)
            # random offset
            self._b.append(self.w * self.random_state_.rand())

        phi = {k: dict() for k in range(n)}
        for t in range(self.t_max):
            # hash all graphs inside P and produce the feature vectors
            hashes = self.calculate_LSH(P, self._u[t], self._b[t])
            hd = dict(
                (j, i) for i, j in enumerate(set(np.unique(hashes))))
            self._hd.append(hd)
            features = np.vectorize(lambda i: hd[i])(hashes)

            # Accumulate the results.
            for k in range(n):
                phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]])

            # calculate the Propagation matrix if needed
            if t < self.t_max - 1:
                for k in range(n):
                    start, end = indexes[k:k + 2]
                    P[start:end, :] = np.dot(transition_matrix[k],
                                             P[start:end, :])

        return [phi[k] for k in range(n)]

    elif (self._method_calling == 3 and dim_orig >= len(enum_labels)):
        # Transform without any label unseen at fit time
        phi = {k: dict() for k in range(n)}
        for t in range(self.t_max):
            # hash all graphs inside P and produce the feature vectors
            hashes = self.calculate_LSH(P, self._u[t], self._b[t])
            known = self._hd[t]
            hd = dict(chain(
                iteritems(known),
                iter((j, i) for i, j in enumerate(
                    filterfalse(lambda x: x in known, np.unique(hashes)),
                    len(known)))))

            features = np.vectorize(lambda i: hd[i])(hashes)

            # Accumulate the results.
            for k in range(n):
                phi[k][t] = Counter(features[indexes[k]:indexes[k + 1]])

            # calculate the Propagation matrix if needed
            if t < self.t_max - 1:
                for k in range(n):
                    start, end = indexes[k:k + 2]
                    P[start:end, :] = np.dot(transition_matrix[k],
                                             P[start:end, :])

        return [phi[k] for k in range(n)]

    else:
        # Some labels were not seen at fit time: rows holding only
        # fitted labels are hashed with the fitted projections, the
        # remaining rows with projections extended by random entries.
        cols = np.array(cols)
        vertices = np.where(cols < dim_orig)[0]
        vertices_p = np.where(cols >= dim_orig)[0]
        nnv = len(enum_labels) - dim_orig
        phi = {k: dict() for k in range(n)}
        for t in range(self.t_max):
            # hash all graphs inside P and produce the feature vectors
            hashes = self.calculate_LSH(P[vertices, :dim_orig],
                                        self._u[t], self._b[t])
            known = self._hd[t]
            hd = dict(chain(
                iteritems(known),
                iter((j, i) for i, j in enumerate(
                    filterfalse(lambda x: x in known, np.unique(hashes)),
                    len(known)))))

            features = np.vectorize(lambda i: hd[i], otypes=[int])(hashes)

            # for the rows with new labels, hash P with an extended
            # projection vector and produce the feature vectors
            u = self.random_state_.randn(nnv)
            if self.take_cauchy_:
                # cauchy
                u = np.divide(u, self.random_state_.randn(nnv))

            u = np.hstack((self._u[t], u))

            # calculate hashes for the remaining
            hashes = self.calculate_LSH(P[vertices_p, :], u, self._b[t])
            # NOTE(review): this enumerates every hash (not only the
            # unseen, unique ones) and may therefore override indexes
            # that are already in `hd` - confirm this is intended.
            hd = dict(chain(
                iteritems(hd),
                iter((j, i) for i, j in enumerate(hashes, len(hd)))))

            features_p = np.vectorize(
                lambda i: hd[i], otypes=[int])(hashes)

            # Accumulate the results. Graph k owns the half-open row
            # range [indexes[k], indexes[k + 1]); the upper bound must be
            # strict, otherwise the first vertex of graph k + 1 is
            # counted for graph k as well.
            for k in range(n):
                A = Counter(features[np.logical_and(
                    indexes[k] <= vertices, vertices < indexes[k + 1])])
                B = Counter(features_p[np.logical_and(
                    indexes[k] <= vertices_p,
                    vertices_p < indexes[k + 1])])
                phi[k][t] = A + B

            # calculate the Propagation matrix if needed
            if t < self.t_max - 1:
                for k in range(n):
                    start, end = indexes[k:k + 2]
                    P[start:end, :] = np.dot(transition_matrix[k],
                                             P[start:end, :])

                # Re-split the rows after propagation
                Q = np.all(P[:, dim_orig:] > 0, axis=1)
                vertices = np.where(~Q)[0]
                vertices_p = np.where(Q)[0]

        return [phi[k] for k in range(n)]
def parse_input(self, X):
    """Parse input for weisfeiler lehman.

    The node labels of all graphs are compressed to integers and then
    refined for `self._n_iter - 1` rounds; one base kernel is fitted on
    the relabelled graphs of every round.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable
        with at least two entries: a valid graph structure (adjacency
        matrix or edge dictionary) and the node labels, optionally
        followed by the edge labels. Entries after the third are
        forwarded untouched to the base kernel. Empty elements are
        skipped with a warning.

    Returns
    -------
    base_kernel : dict
        The fitted base kernels indexed by iteration. Returned alone
        when `self._method_calling == 1`.

    K, base_kernel : tuple
        The summed kernel matrix and the base kernels, when
        `self._method_calling == 2`.

    Raises
    ------
    ValueError
        If not called from fit or fit-transform, or if no graph could
        be parsed out of `X`.
    TypeError
        If `X` is not iterable or contains an unsupported element.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    if self._method_calling not in [1, 2]:
        raise ValueError('method call must be called either from fit ' +
                         'or fit-transform')
    elif hasattr(self, '_X_diag'):
        # Clean _X_diag value
        delattr(self, '_X_diag')

    # Input validation and parsing
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    nx = 0
    Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and (len(x) == 0 or len(x) >= 2):
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            elif len(x) > 2:
                extra = tuple(x[3:]) if len(x) > 3 else tuple()
                x = Graph(x[0], x[1], x[2],
                          graph_format=self._graph_format)
                # Edge labels come first among the extra arguments
                extra = (x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True), ) + extra
            else:
                x = Graph(x[0], x[1], {},
                          graph_format=self._graph_format)
                extra = tuple()
        elif type(x) is Graph:
            x.desired_format(self._graph_format)
            el = x.get_labels(purpose=self._graph_format,
                              label_type="edge", return_none=True)
            extra = tuple() if el is None else (el, )
        else:
            raise TypeError('each element of X must be either a ' +
                            'graph object or a list with at least ' +
                            'a graph like object and node labels ' +
                            'dict \n')
        Gs_ed[nx] = x.get_edge_dictionary()
        L[nx] = x.get_labels(purpose="dictionary")
        extras[nx] = extra
        distinct_values |= set(itervalues(L[nx]))
        nx += 1

    if nx == 0:
        raise ValueError('parsed input is empty')

    # Save the number of "fitted" graphs.
    self._nx = nx

    # get all the distinct values of current labels
    WL_labels_inverse = dict()

    # assign a number to each label
    label_count = 0
    for dv in sorted(list(distinct_values)):
        WL_labels_inverse[dv] = label_count
        label_count += 1

    # Initialize an inverse dictionary of labels for all iterations
    self._inv_labels = dict()
    self._inv_labels[0] = WL_labels_inverse

    def generate_graphs(label_count, WL_labels_inverse):
        """Yield the relabelled graphs of every iteration, in order."""
        new_graphs = list()
        for j in range(nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels
            # add new labels
            new_graphs.append((Gs_ed[j], new_labels) + extras[j])
        yield new_graphs

        for i in range(1, self._n_iter):
            label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # For every node build a signature out of its own label
                # and the sorted labels of its neighbours
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    label_set.add(credential)

            # Sorting makes the numbering independent of set ordering
            label_list = sorted(list(label_set))
            for dv in label_list:
                WL_labels_inverse[dv] = label_count
                label_count += 1

            # Recalculate labels
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
                # relabel
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            self._inv_labels[i] = WL_labels_inverse
            yield new_graphs

    base_kernel = {
        i: self._base_kernel(**self._params)
        for i in range(self._n_iter)
    }

    if self._parallel is None:
        if self._method_calling == 1:
            for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse)):
                base_kernel[i].fit(g)
        elif self._method_calling == 2:
            # `np.sum(<generator>)` is deprecated; the builtin `sum`
            # adds the matrices one by one without materialising them.
            K = sum(
                base_kernel[i].fit_transform(g) for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse)))
    else:
        if self._method_calling == 1:
            self._parallel(
                joblib.delayed(efit)(base_kernel[i], g)
                for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse)))
        elif self._method_calling == 2:
            K = np.sum(self._parallel(
                joblib.delayed(efit_transform)(base_kernel[i], g)
                for (i, g) in enumerate(
                    generate_graphs(label_count, WL_labels_inverse))),
                axis=0)

    if self._method_calling == 1:
        return base_kernel
    elif self._method_calling == 2:
        return K, base_kernel
def parse_input(self, X):
    """Parse input for weisfeiler lehman optimal assignment.

    The node labels of all graphs are compressed to integers and then
    refined for `self._n_iter - 1` rounds. Every label produced on the
    way is inserted into `self._hierarchy` below the label it refines.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable
        with at least two entries: a valid graph structure (adjacency
        matrix or edge dictionary) and the node labels, optionally
        followed by the edge labels. Empty elements are skipped with a
        warning.

    Returns
    -------
    Hs : numpy array or lil_matrix, shape = [n_input_graphs, hierarchy_size]
        An array where the rows contain the histograms of the graphs.
        A sparse matrix is produced when `self.sparse` is set.

    Raises
    ------
    ValueError
        If not called from fit or fit-transform, or if no graph could
        be parsed out of `X`.
    TypeError
        If `X` is not iterable or contains an unsupported element.

    """
    # `collections.Iterable` was removed in Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2 fallback
        from collections import Iterable

    if self._method_calling not in [1, 2]:
        raise ValueError('method call must be called either from fit ' +
                         'or fit-transform')
    elif hasattr(self, '_X_diag'):
        # Clean _X_diag value
        delattr(self, '_X_diag')

    # Input validation and parsing
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    nx = 0
    Gs_ed, L, distinct_values = dict(), dict(), set()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and (len(x) == 0 or len(x) >= 2):
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            elif len(x) > 2:
                x = Graph(x[0], x[1], x[2],
                          graph_format=self._graph_format)
            else:
                x = Graph(x[0], x[1], {},
                          graph_format=self._graph_format)
        elif type(x) is Graph:
            x.desired_format(self._graph_format)
        else:
            raise TypeError('each element of X must be either a ' +
                            'graph object or a list with at least ' +
                            'a graph like object and node labels ' +
                            'dict \n')
        Gs_ed[nx] = x.get_edge_dictionary()
        L[nx] = x.get_labels(purpose="dictionary")
        distinct_values |= set(itervalues(L[nx]))
        nx += 1

    if nx == 0:
        raise ValueError('parsed input is empty')

    # Save the number of "fitted" graphs.
    self._nx = nx

    # Initialize hierarchy
    self._hierarchy = dict()
    self._hierarchy['root'] = dict()
    self._hierarchy['root']['parent'] = None
    self._hierarchy['root']['children'] = list()
    self._hierarchy['root']['w'] = 0
    self._hierarchy['root']['omega'] = 0

    # get all the distinct values of current labels
    WL_labels_inverse = dict()

    # assign a number to each label
    label_count = 0
    for dv in sorted(list(distinct_values)):
        WL_labels_inverse[dv] = label_count
        self._insert_into_hierarchy(label_count, 'root')
        label_count += 1

    # Initialize an inverse dictionary of labels for all iterations
    self._inv_labels = dict()
    self._inv_labels[0] = WL_labels_inverse

    for j in range(nx):
        new_labels = dict()
        for k in L[j].keys():
            new_labels[k] = WL_labels_inverse[L[j][k]]
        L[j] = new_labels

    for i in range(1, self._n_iter):
        new_previous_label_set, WL_labels_inverse, L_temp = \
            set(), dict(), dict()
        for j in range(nx):
            # For every node build a signature out of its own label and
            # the sorted labels of its neighbours; remember the label
            # it refines so it can be attached below it.
            L_temp[j] = dict()
            for v in Gs_ed[j].keys():
                credential = str(L[j][v]) + "," + \
                    str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                L_temp[j][v] = credential
                new_previous_label_set.add((credential, L[j][v]))

        label_list = sorted(list(new_previous_label_set),
                            key=lambda tup: tup[0])
        for dv, previous_label in label_list:
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, previous_label)
            label_count += 1

        # Recalculate labels
        for j in range(nx):
            new_labels = dict()
            for k in L_temp[j].keys():
                new_labels[k] = WL_labels_inverse[L_temp[j][k]]
            L[j] = new_labels
        self._inv_labels[i] = WL_labels_inverse

    # Compute the vector representation of each graph
    if self.sparse:
        Hs = lil_matrix((nx, len(self._hierarchy)))
    else:
        Hs = np.zeros((nx, len(self._hierarchy)))
    for j in range(nx):
        for k in L[j].keys():
            # Walk from the final label of the node up to (excluding)
            # the root, adding the `omega` of every label on the path.
            current_label = L[j][k]
            while self._hierarchy[current_label]['parent'] is not None:
                Hs[j, current_label] += \
                    self._hierarchy[current_label]['omega']
                current_label = self._hierarchy[current_label]['parent']

    return Hs
def parse_input(self, X):
    """Parse and create features for the NSPD kernel.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        exactly three items: a valid graph structure (adjacency matrix or
        edge_dictionary), the node labels and the edge labels.  Empty
        iterables are skipped with a warning.

    Returns
    -------
    M : dict
        Maps every ``(r, d)`` pair (``r`` in ``0..self.r``, ``d`` in
        ``0..self.d``) for which at least one feature was counted to a
        ``csr_matrix`` (dtype ``int64``) of occurrence counts.  Rows
        correspond to the non-empty input graphs and columns to the
        enumerated pairs of neighborhood hash values.  When
        ``_method_calling == 3`` the columns are the keys stored by the
        previous fit followed by the keys first seen in this call.

    Raises
    ------
    TypeError
        If `X` is not iterable or one of its elements has an unsupported
        form.
    ValueError
        If no graph could be parsed from `X`.

    Notes
    -----
    With ``_method_calling == 1`` the key enumeration and the number of
    graphs are stored on ``self._fit_keys`` and ``self._ngx``; with
    ``_method_calling == 3`` the number of graphs is stored on
    ``self._ngy``.

    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    # Number of graphs parsed so far (the row index of the current graph).
    ng = 0
    # (r, d) -> {(row, column): count}
    data = collections.defaultdict(dict)
    # (r, d) -> {hash pair: column index} for keys first seen in this call
    all_keys = collections.defaultdict(dict)

    for (idx, x) in enumerate(iter(X)):
        is_iter = False
        if isinstance(x, Iterable):
            is_iter, x = True, list(x)
        if is_iter and len(x) in [0, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element' +
                              ' on index: ' + str(idx))
                continue
            g = Graph(x[0], x[1], x[2])
            g.change_format("adjacency")
        elif type(x) is Graph:
            # Work on a copy so the caller's graph is left untouched.
            g = Graph(
                x.get_adjacency_matrix(),
                x.get_labels(purpose="adjacency", label_type="vertex"),
                x.get_labels(purpose="adjacency", label_type="edge"))
        else:
            raise TypeError('each element of X must have either ' +
                            'a graph with labels for node and edge ' +
                            'or 3 elements consisting of a graph ' +
                            'type object, labels for vertices and ' +
                            'labels for edges.')

        # Bring to the desired format
        g.change_format(self._graph_format)

        vertices = set(g.get_vertices(purpose=self._graph_format))

        # Flatten the edge dictionary into a set of (source, target) tuples
        ed = g.get_edge_dictionary()
        edges = {(j, k) for j in ed.keys() for k in ed[j].keys()}

        # Labels for nodes and for edges
        Lv = g.get_labels(purpose=self._graph_format)
        Le = g.get_labels(purpose=self._graph_format, label_type="edge")

        # Produce all the neighborhoods and the distance pairs
        # up to the desired radius and maximum distance
        N, D, D_pair = g.produce_neighborhoods(
            self.r, purpose="dictionary", with_distances=True, d=self.d)

        # Hash all the neighborhoods
        H = self._hash_neighborhoods(vertices, edges, Lv, Le, N, D_pair)

        if self._method_calling == 1:
            for d in range(self.d + 1):
                if d not in D:
                    continue
                for (A, B) in D[d]:
                    for r in range(self.r + 1):
                        key = (H[r, A], H[r, B])
                        keys = all_keys[r, d]
                        col = keys.get(key, None)
                        if col is None:
                            col = len(keys)
                            keys[key] = col
                        data[r, d][ng, col] = \
                            data[r, d].get((ng, col), 0) + 1

        elif self._method_calling == 3:
            for d in range(self.d + 1):
                if d not in D:
                    continue
                for (A, B) in D[d]:
                    # Based on the edges of the bidirected graph
                    for r in range(self.r + 1):
                        keys = all_keys[r, d]
                        fit_keys = self._fit_keys[r, d]
                        key = (H[r, A], H[r, B])
                        # Fitted keys keep their column; unseen keys are
                        # appended after the fitted ones.
                        col = fit_keys.get(key, None)
                        if col is None:
                            col = keys.get(key, None)
                            if col is None:
                                col = len(keys) + len(fit_keys)
                                keys[key] = col
                        data[r, d][ng, col] = \
                            data[r, d].get((ng, col), 0) + 1
        ng += 1

    if ng == 0:
        raise ValueError('parsed input is empty')

    def build_matrices(n_columns):
        """Build one sparse count matrix per non-empty ``(r, d)`` entry."""
        matrices = dict()
        for (key, counts) in iteritems(data):
            if len(counts) == 0:
                continue
            indexes, values = zip(*iteritems(counts))
            rows, cols = zip(*indexes)
            matrices[key] = csr_matrix((values, (rows, cols)),
                                       shape=(ng, n_columns(key)),
                                       dtype=np.int64)
        return matrices

    if self._method_calling == 1:
        # A feature matrix for all levels
        M = build_matrices(lambda key: len(all_keys[key]))
        self._fit_keys = all_keys
        self._ngx = ng
    elif self._method_calling == 3:
        # A feature matrix for all levels
        M = build_matrices(
            lambda key: len(all_keys[key]) + len(self._fit_keys[key]))
        self._ngy = ng

    return M
def parse_input(self, X):
    """Parse input for weisfeiler lehman.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        at least two items: a valid graph structure (adjacency matrix or
        edge_dictionary) and the node labels, optionally followed by the
        edge labels and any further items.  The edge labels and the
        further items are appended to every relabelled graph handed to
        the base kernel.  Empty iterables are skipped with a warning.

    Returns
    -------
    base_graph_kernel : dict
        One base kernel per refinement iteration, keyed by the iteration
        index.  Returned alone when ``_method_calling == 1`` (each kernel
        has been fitted on the graphs of its iteration).
    K, base_graph_kernel : tuple
        Returned when ``_method_calling == 2``: the sum over all
        iterations of the base kernels' ``fit_transform`` results,
        together with the dictionary described above.

    Raises
    ------
    ValueError
        If ``_method_calling`` is neither 1 nor 2, or if no graph could be
        parsed from `X`.
    TypeError
        If `X` is not iterable or one of its elements has an unsupported
        form.

    Notes
    -----
    Sets ``self._nx`` and ``self._inv_labels``, appends the number of
    initial labels to ``self.feature_dims`` and removes a stale
    ``self._X_diag`` if present.

    """
    if self._method_calling not in [1, 2]:
        raise ValueError('method call must be called either from fit ' +
                         'or fit-transform')
    elif hasattr(self, '_X_diag'):
        # Clean _X_diag value
        delattr(self, '_X_diag')

    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    # Input validation and parsing
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    nx = 0
    Gs_ed, L, distinct_values, extras = dict(), dict(), set(), dict()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and (len(x) == 0 or len(x) >= 2):
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: ' +
                              str(idx))
                continue
            if len(x) > 2:
                # Anything after the edge labels is carried along as-is.
                extra = tuple(x[3:])
                x = Graph(x[0], x[1], x[2],
                          graph_format=self._graph_format)
                extra = (x.get_labels(purpose=self._graph_format,
                                      label_type="edge",
                                      return_none=True), ) + extra
            else:
                x = Graph(x[0], x[1], {},
                          graph_format=self._graph_format)
                extra = tuple()
        elif type(x) is Graph:
            x.desired_format(self._graph_format)
            el = x.get_labels(purpose=self._graph_format,
                              label_type="edge",
                              return_none=True)
            extra = tuple() if el is None else (el, )
        else:
            raise TypeError('each element of X must be either a ' +
                            'graph object or a list with at least ' +
                            'a graph like object and node labels ' +
                            'dict \n')
        Gs_ed[nx] = x.get_edge_dictionary()
        L[nx] = x.get_labels(purpose="dictionary")
        extras[nx] = extra
        distinct_values |= set(itervalues(L[nx]))
        nx += 1

    if nx == 0:
        raise ValueError('parsed input is empty')

    # Save the number of "fitted" graphs.
    self._nx = nx

    # Assign a consecutive number to each original label (sorted order).
    WL_labels_inverse = OrderedDict()
    label_count = 0
    for dv in sorted(distinct_values):
        WL_labels_inverse[dv] = label_count
        label_count += 1

    # Inverse dictionary of labels for all iterations, each expressed in
    # terms of the labels of the previous iteration.
    self._inv_labels = OrderedDict()
    self._inv_labels[0] = deepcopy(WL_labels_inverse)
    # Record the feature dimension of the zeroth iteration.
    self.feature_dims.append(len(WL_labels_inverse))

    def generate_graphs(label_count, WL_labels_inverse):
        # Iteration 0: the graphs with their enumerated original labels.
        new_graphs = list()
        for j in range(self._nx):
            new_labels = dict()
            for k in L[j].keys():
                new_labels[k] = WL_labels_inverse[L[j][k]]
            L[j] = new_labels
            new_graphs.append((Gs_ed[j], new_labels) + extras[j])
        yield new_graphs

        for i in range(1, self._h):
            label_set, WL_labels_inverse, L_temp = set(), dict(), dict()
            for j in range(nx):
                # The temporary label of a node is its own label followed
                # by the sorted labels of its neighbors.
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    label_set.add(credential)

            # Enumerate the new labels, continuing the global counter.
            for dv in sorted(label_set):
                WL_labels_inverse[dv] = label_count
                label_count += 1

            # Recalculate labels
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for k in L_temp[j].keys():
                    new_labels[k] = WL_labels_inverse[L_temp[j][k]]
                L[j] = new_labels
                new_graphs.append((Gs_ed[j], new_labels) + extras[j])
            self._inv_labels[i] = WL_labels_inverse
            yield new_graphs

    # Initialise one base graph kernel per iteration and feed it the
    # graphs of that iteration as soon as they are generated.
    base_graph_kernel = {}
    K = []
    for (i, g) in enumerate(generate_graphs(label_count, WL_labels_inverse)):
        param = self._params
        base_graph_kernel.update({i: self._base_graph_kernel(**param)})
        if self._method_calling == 1:
            base_graph_kernel[i].fit(g)
        else:
            K.append(base_graph_kernel[i].fit_transform(g))

    if self._method_calling == 1:
        return base_graph_kernel
    elif self._method_calling == 2:
        return np.sum(K, axis=0), base_graph_kernel
def parse_input(self, X):
    """Parse input and compute features for the fast multiscale Laplacian kernel.

    See supplementary material :cite:`kondor2016multiscale`, algorithm 1.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        two or three items: a valid graph structure (adjacency matrix or
        edge_dictionary) followed by the node labels; a third item, if
        present, is not used.  Every node label must itself be iterable
        (a feature vector).  Empty iterables are skipped with a warning.

    Returns
    -------
    out : list
        One tuple per parsed graph: the inverse of the graph's final
        ``S`` matrix and the sum of the logarithms of the real parts of
        the eigenvalues of ``S``.

    Raises
    ------
    TypeError
        If `X` is not iterable, one of its elements has an unsupported
        form, or the node features cannot be cast to a numpy array.
    ValueError
        If no graph could be parsed from `X`.

    Notes
    -----
    With ``_method_calling == 1`` the projection of every level is
    computed from vertices sampled with ``self.random_state_`` and stored
    in ``self._data_level``; with ``_method_calling == 3`` the stored
    projections are re-used.

    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    ng = 0
    out = list()
    # graph index -> {0: adjacency, 1: current features, 2: inv. Laplacian}
    data = dict()
    # graph index -> Graph, lazily replaced by its produced neighborhoods
    neighborhoods = dict()
    for (idx, x) in enumerate(iter(X)):
        is_iter = False
        if isinstance(x, Iterable):
            is_iter, x = True, list(x)
        if is_iter and len(x) in [0, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element ' +
                              'on index: ' + str(idx))
                continue
            x = Graph(x[0], x[1], {}, self._graph_format)
        elif type(x) is Graph:
            x.desired_format(self._graph_format)
        else:
            # One-element iterables are rejected, hence "at least 2".
            raise TypeError('each element of X must be either a '
                            'graph or an iterable with at least 2 '
                            'and at most 3 elements\n')

        phi_d = x.get_labels()
        A = x.get_adjacency_matrix()
        try:
            phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
        except TypeError:
            raise TypeError('Features must be iterable and castable '
                            'in total to a numpy array.')

        Lap = laplacian(A).astype(float)
        _increment_diagonal_(Lap, self.heta)
        data[ng] = {0: A, 1: phi, 2: inv(Lap)}
        neighborhoods[ng] = x
        ng += 1

    if ng == 0:
        raise ValueError('parsed input is empty')

    # Define a function for calculating the S's of subgraphs of each iteration
    def calculate_C(k, j, l):
        # Neighborhoods are produced on first use and then cached.
        if type(neighborhoods[k]) is Graph:
            neighborhoods[k] = neighborhoods[k].produce_neighborhoods(
                r=self.L, sort_neighbors=False)

        indexes = neighborhoods[k][l][j]
        L = laplacian(data[k][0][indexes, :][:, indexes]).astype(float)
        _increment_diagonal_(L, self.heta)
        U = data[k][1][indexes, :]
        S = multi_dot((U.T, inv(L), U))
        _increment_diagonal_(S, self.gamma)

        return (inv(S), np.sum(np.log(np.real(eigvals(S)))))

    if self._method_calling == 1:
        # Every (graph, vertex) pair of the input.
        V = [(k, j) for k in range(ng)
             for j in range(data[k][0].shape[0])]

        ns = min(len(V), self.n_samples)

        self.random_state_.shuffle(V)
        vs = V[:ns]
        phi_k = np.array([data[k][1][j, :] for (k, j) in vs])

        # Gram matrix of the sampled feature vectors
        K = phi_k.dot(phi_k.T)

        # v the eigenvalues, w the eigenvectors (as rows after transpose)
        v, w = eig(K)
        v, w = np.real(v), np.real(w.T)

        # keep only the positive ones among the P largest
        vpos = np.argpartition(v, -self.P)[-self.P:]
        vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

        # ksi.shape = (k, Ns) * (Ns, P)
        ksi = w[vpos].dot(phi_k).T / np.sqrt(v[vpos])
        for j in range(ng):
            # (n_samples, k) * (k, P)
            data[j][1] = data[j][1].dot(ksi)
        self._data_level = {0: ksi}

        for l in range(1, self.L + 1):
            # Take random samples from all the vertices of all graphs
            self.random_state_.shuffle(V)
            vs = V[:ns]

            # Compute the reference subsampled Gram matrix
            K_proj = {
                k: np.zeros(shape=(data[k][0].shape[0], ns))
                for k in range(ng)
            }
            K, C = np.zeros(shape=(len(vs), len(vs))), dict()
            for (m, (k, j)) in enumerate(vs):
                C[m] = calculate_C(k, j, l)
                K_proj[k][j, m] = K[m, m] = self.pairwise_operation(
                    C[m], C[m])
                for (s, (k2, j2)) in enumerate(vs):
                    if s < m:
                        K[s, m] = K[m, s] \
                                = K_proj[k2][j2, m] \
                                = K_proj[k][j, s] \
                                = self.pairwise_operation(C[s], C[m])
                    else:
                        break

            # Compute the kernels of the relations of the reference to
            # everything else.  The subgraph statistics of (k, j) do not
            # depend on the reference sample, so compute them once.
            for (k, j) in V[ns:]:
                C_kj = calculate_C(k, j, l)
                for m in range(len(vs)):
                    K_proj[k][j, m] = self.pairwise_operation(C[m], C_kj)

            # v the eigenvalues, w the eigenvectors (as rows after transpose)
            v, w = eig(K)
            v, w = np.real(v), np.real(w.T)

            # keep only the positive ones among the P largest
            vpos = np.argpartition(v, -self.P)[-self.P:]
            vpos = vpos[np.where(v[vpos] > positive_eigenvalue_limit)]

            # Q shape=(k, P)
            Q = w[vpos].T / np.sqrt(v[vpos])
            for j in range(ng):
                # (n, ns) * (ns, P)
                data[j][1] = K_proj[j].dot(Q)
            self._data_level[l] = (C, Q)

    elif self._method_calling == 3:
        ksi = self._data_level[0]
        for j in range(ng):
            # (n, k) * (k, P)
            data[j][1] = data[j][1].dot(ksi)

        for l in range(1, self.L + 1):
            C, Q = self._data_level[l]
            for j in range(ng):
                K_proj = np.zeros(shape=(data[j][0].shape[0], len(C)))
                for n in range(data[j][0].shape[0]):
                    # Independent of the reference sample: compute once.
                    C_jn = calculate_C(j, n, l)
                    for m in range(len(C)):
                        K_proj[n, m] = self.pairwise_operation(C[m], C_jn)
                data[j][1] = K_proj.dot(Q)

    # Apply the final calculation of S.
    for k in range(ng):
        S = multi_dot((data[k][1].T, data[k][2], data[k][1]))
        _increment_diagonal_(S, self.gamma)
        out.append((inv(S), np.sum(np.log(np.real(eigvals(S))))))

    return out
def parse_input(self, X):
    """Parse and create features for multiscale_laplacian kernel.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        two or three items: a valid graph structure (adjacency matrix or
        edge_dictionary) followed by the node labels; a third item, if
        present, is not used.  Every node label must itself be iterable
        (a feature vector).  Empty iterables are skipped with a warning.

    Returns
    -------
    out : list
        One tuple ``(A, phi, phi_outer, Q, L)`` per parsed graph: the
        adjacency matrix, the feature matrix, its outer product
        ``phi . phi^T``, a dictionary ``Q[level][key]`` holding the
        neighborhood indexes (``"n"``) and the inverse Laplacian of the
        induced subgraph (``"l"``) for levels ``1..self.L``, and the
        inverse Laplacian of ``A``.

    Raises
    ------
    TypeError
        If `X` is not iterable, one of its elements has an unsupported
        form, or the node features cannot be cast to a numpy array.
    ValueError
        If no graph could be parsed from `X`.

    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    ng = 0
    out = list()
    start = time.time()
    for (idx, x) in enumerate(iter(X)):
        is_iter = False
        if isinstance(x, Iterable):
            is_iter, x = True, list(x)
        if is_iter and len(x) in [0, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element ' +
                              'on index: ' + str(idx))
                continue
            x = Graph(x[0], x[1], {}, self._graph_format)
        elif type(x) is Graph:
            # Graph objects are accepted as they are; only their format is
            # adjusted.  (The test used to be inverted, which rejected
            # every Graph and let arbitrary objects through.)
            x.desired_format(self._graph_format)
        else:
            # One-element iterables are rejected, hence "at least 2".
            raise TypeError('each element of X must be either a ' +
                            'graph or an iterable with at least 2 ' +
                            'and at most 3 elements\n')
        ng += 1
        phi_d = x.get_labels()
        A = x.get_adjacency_matrix()
        N = x.produce_neighborhoods(r=self.L, sort_neighbors=False)
        try:
            phi = np.array([list(phi_d[i]) for i in range(A.shape[0])])
        except TypeError:
            raise TypeError('Features must be iterable and castable ' +
                            'in total to a numpy array.')
        phi_outer = np.dot(phi, phi.T)

        Lap = laplacian(A).astype(float)
        _increment_diagonal_(Lap, self.heta)
        L = inv(Lap)

        Q = dict()
        for level in range(1, self.L + 1):
            Q[level] = dict()
            for (key, item) in iteritems(N[level]):
                Q[level][key] = dict()
                Q[level][key]["n"] = np.array(item)
                if len(item) < A.shape[0]:
                    laplac = laplacian(A[item, :][:, item]).astype(float)
                    _increment_diagonal_(laplac, self.heta)
                    laplac = inv(laplac)
                else:
                    # The neighborhood spans as many vertices as the
                    # graph has: re-use the inverse Laplacian of A.
                    laplac = L
                Q[level][key]["l"] = laplac

        out.append((A, phi, phi_outer, Q, L))

    if self.verbose:
        print("Preprocessing took:", time.time() - start, "s.")
    if ng == 0:
        raise ValueError('parsed input is empty')

    return out
def parse_input(self, X):
    """Parse input and create features, while initializing and/or calculating sub-kernels.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        at least two items: a valid graph structure (adjacency matrix or
        edge_dictionary) and the node labels, optionally followed by the
        edge labels and any further items.  The edge labels and the
        further items are appended to every relabelled graph handed to
        the base kernel.  Empty iterables are skipped with a warning.

    Returns
    -------
    base_graph_kernel : dict
        One base kernel per iteration, keyed by the iteration index.
        Returned when ``_method_calling`` is 1 (alone) or 2 (as the
        second item of a tuple).

    K : np.array
        The sum over all iterations of the base kernels' matrices.
        Returned when ``_method_calling`` is 2 (as the first item of a
        tuple) or 3 (alone).

    Raises
    ------
    ValueError
        If no base graph kernel was provided or no graph could be parsed
        from `X`.
    TypeError
        If `X` is not iterable or one of its elements has an unsupported
        form.

    Notes
    -----
    With ``_method_calling`` 1 or 2 the label enumeration is stored on
    ``self._labels_enum``.

    """
    if self.base_graph_kernel_ is None:
        raise ValueError('User must provide a base_graph_kernel')

    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    # Input validation and parsing
    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    nx, labels = 0, list()
    if self._method_calling in [1, 2]:
        nl, labels_enum = 0, dict()
        # One fresh base kernel per iteration (built once only).
        base_graph_kernel = {
            i: self.base_graph_kernel_[0](**self.base_graph_kernel_[1])
            for i in range(self.n_iter)
        }
    elif self._method_calling == 3:
        # Continue the enumeration of the fitted labels on a copy.
        nl, labels_enum, base_graph_kernel = (
            len(self._labels_enum), dict(self._labels_enum), self.X)

    inp = list()
    neighbors = list()
    for (idx, x) in enumerate(iter(X)):
        is_iter = False
        if isinstance(x, Iterable):
            x, is_iter = list(x), True
        if is_iter and (len(x) == 0 or len(x) >= 2):
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: ' +
                              str(idx))
                continue
            if len(x) > 2:
                # Anything after the edge labels is carried along as-is.
                extra = tuple(x[3:])
                x = Graph(x[0], x[1], x[2],
                          graph_format=self._graph_format)
                extra = (x.get_labels(purpose='any',
                                      label_type="edge",
                                      return_none=True), ) + extra
            else:
                x = Graph(x[0], x[1], {},
                          graph_format=self._graph_format)
                extra = tuple()
        elif type(x) is Graph:
            el = x.get_labels(purpose=self._graph_format,
                              label_type="edge",
                              return_none=True)
            extra = tuple() if el is None else (el, )
        else:
            raise TypeError('each element of X must be either a ' +
                            'graph object or a list with at least ' +
                            'a graph like object and node labels ' +
                            'dict \n')

        label = x.get_labels(purpose='any')
        inp.append((x.get_graph_object(), extra))
        neighbors.append(x.get_edge_dictionary())
        labels.append(label)
        for v in set(itervalues(label)):
            if v not in labels_enum:
                labels_enum[v] = nl
                nl += 1
        nx += 1

    if nx == 0:
        raise ValueError('parsed input is empty')

    # Calculate the hadamard matrix (order: next power of two >= nl)
    H = hadamard(int(2**(ceil(log2(nl)))))

    def generate_graphs(labels):
        # Initial labeling of vertices based on their corresponding
        # Hadamard code (i-th row of the Hadamard matrix) where i is the
        # i-th label on enumeration
        new_graphs, new_labels = list(), list()
        for ((obj, extra), label) in zip(inp, labels):
            new_label = dict()
            for (k, v) in iteritems(label):
                new_label[k] = H[labels_enum[v], :]
            new_graphs.append(
                (obj, {v: tuple(code)
                       for (v, code) in iteritems(new_label)}) + extra)
            new_labels.append(new_label)
        yield new_graphs

        # Main
        for i in range(1, self.n_iter):
            new_graphs, labels, new_labels = list(), new_labels, list()
            for ((obj, extra), neighbor, old_label) in zip(inp, neighbors,
                                                           labels):
                # The new code of a vertex is its old code plus the old
                # codes of all its neighbors.
                new_label = dict()
                for (k, ns) in iteritems(neighbor):
                    new_label[k] = old_label[k]
                    for q in ns:
                        new_label[k] = np.add(new_label[k], old_label[q])
                new_labels.append(new_label)
                new_graphs.append(
                    (obj, {v: tuple(code)
                           for (v, code) in iteritems(new_label)}) + extra)
            yield new_graphs

    # NOTE: the per-iteration matrices are collected in a list before
    # summing; ``np.sum`` on a generator is deprecated and ignores ``axis``.
    if self._parallel is None:
        # Add the zero iteration element
        if self._method_calling == 1:
            for (i, g) in enumerate(generate_graphs(labels)):
                base_graph_kernel[i].fit(g)
        elif self._method_calling == 2:
            K = np.sum([base_graph_kernel[i].fit_transform(g)
                        for (i, g) in enumerate(generate_graphs(labels))],
                       axis=0)
        elif self._method_calling == 3:
            # Calculate the kernel matrix without parallelization
            K = np.sum([self.X[i].transform(g)
                        for (i, g) in enumerate(generate_graphs(labels))],
                       axis=0)
    else:
        if self._method_calling == 1:
            self._parallel(
                joblib.delayed(efit)(base_graph_kernel[i], g)
                for (i, g) in enumerate(generate_graphs(labels)))
        elif self._method_calling == 2:
            # Calculate the kernel matrix with parallelization
            K = np.sum(self._parallel(
                joblib.delayed(efit_transform)(base_graph_kernel[i], g)
                for (i, g) in enumerate(generate_graphs(labels))), axis=0)
        elif self._method_calling == 3:
            # Calculate the kernel matrix with parallelization
            K = np.sum(self._parallel(
                joblib.delayed(etransform)(self.X[i], g)
                for (i, g) in enumerate(generate_graphs(labels))), axis=0)

    if self._method_calling == 1:
        self._labels_enum = labels_enum
        return base_graph_kernel
    elif self._method_calling == 2:
        self._labels_enum = labels_enum
        return K, base_graph_kernel
    elif self._method_calling == 3:
        return K
def parse_input(self, X):
    """Parse the input into label-pair restricted adjacency matrices.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        two or three items: a valid graph structure (adjacency matrix or
        edge_dictionary) followed by the node labels; a third item, if
        present, is not used.  Empty iterables are skipped with a warning.

    Returns
    -------
    out : list
        One tuple ``(amss, s)`` per parsed graph.  ``amss`` maps every
        ordered pair of node labels ``(a, b)`` occurring in the graph to
        the adjacency matrix with all entries zeroed except those whose
        row vertex is labelled ``a`` and whose column vertex is labelled
        ``b``; ``s`` is the number of rows of the adjacency matrix.

    Raises
    ------
    TypeError
        If `X` is not iterable or one of its elements has an unsupported
        form.
    ValueError
        If no graph could be parsed from `X`.

    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    i = 0
    proc = list()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        # Length 0 must be admitted here so that empty elements reach the
        # warn-and-skip branch; length 1 is rejected below because the
        # node labels are mandatory.
        if is_iter and len(x) in [0, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element' +
                              ' on index: ' + str(idx))
                continue
            x = Graph(x[0], x[1], {}, self._graph_format)
        elif type(x) is not Graph:
            raise TypeError('each element of X must be either a ' +
                            'graph or an iterable with at least 2 ' +
                            'and at most 3 elements\n')
        i += 1

        x.desired_format("adjacency")
        Ax = x.get_adjacency_matrix()
        Lx = x.get_labels(purpose="adjacency")
        # Labels as a list aligned with the rows of the adjacency matrix.
        Lx = [Lx[v] for v in range(Ax.shape[0])]
        proc.append((Ax, Lx, Ax.shape[0]))

    if i == 0:
        raise ValueError('parsed input is empty')

    out = list()
    for Ax, Lx, s in proc:
        amss = dict()
        labels = set(Lx)
        Lx = np.array(Lx)
        for t in product(labels, labels):
            # Boolean mask: True where the row vertex carries t[0] and
            # the column vertex carries t[1].
            selector = np.matmul(np.expand_dims(Lx == t[0], axis=1),
                                 np.expand_dims(Lx == t[1], axis=0))
            amss[t] = Ax * selector
        out.append((amss, s))

    return out
def transform(self, X):
    """Calculate the kernel matrix, between given and fitted dataset.

    Parameters
    ----------
    X : iterable
        The test samples.  Each element must be either a ``Graph`` object
        or an iterable with two or three items: a valid graph structure
        (adjacency matrix or edge_dictionary) followed by the node
        labels; a third item, if present, is not used.  Empty iterables
        are skipped with a warning.  ``None`` is not accepted.

    Returns
    -------
    K : numpy array, shape = [n_targets, n_input_graphs]
        corresponding to the kernel matrix, a calculation between
        all pairs of graphs between target an features

    Raises
    ------
    ValueError
        If `X` is ``None``, is not iterable, contains an element of an
        unsupported form, or yields no graph at all.

    """
    self._method_calling = 3
    # Check is fit had been called
    check_is_fitted(self, ['X', '_nx', '_inv_labels'])

    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    # Input validation and parsing
    if X is None:
        raise ValueError('transform input cannot be None')
    if not isinstance(X, Iterable):
        raise ValueError('input must be an iterable\n')

    nx = 0
    distinct_values = set()
    Gs_ed, L = dict(), dict()
    for (i, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and len(x) in [0, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: ' +
                              str(i))
                continue
            x = Graph(x[0], x[1], {}, self._graph_format)
        elif type(x) is Graph:
            x.desired_format("dictionary")
        else:
            # One-element iterables are rejected, hence "at least 2".
            raise ValueError('each element of X must have at ' +
                             'least 2 and at most 3 elements\n')
        Gs_ed[nx] = x.get_edge_dictionary()
        L[nx] = x.get_labels(purpose="dictionary")

        # Hold all the distinct values that were not seen while fitting
        distinct_values |= set(
            v for v in itervalues(L[nx])
            if v not in self._inv_labels[0])
        nx += 1

    if nx == 0:
        raise ValueError('parsed input is empty')

    # Unseen original labels are enumerated after the fitted ones.
    nl = len(self._inv_labels[0])
    WL_labels_inverse = {
        dv: idx
        for (idx, dv) in enumerate(sorted(distinct_values), nl)
    }

    def generate_graphs(WL_labels_inverse, nl):
        # calculate the kernel matrix for the 0 iteration
        new_graphs = list()
        for j in range(nx):
            new_labels = dict()
            for (k, v) in iteritems(L[j]):
                if v in self._inv_labels[0]:
                    new_labels[k] = self._inv_labels[0][v]
                else:
                    new_labels[k] = WL_labels_inverse[v]
            L[j] = new_labels
            # produce the new graphs
            new_graphs.append([Gs_ed[j], new_labels])
        yield new_graphs

        for i in range(1, self._n_iter):
            L_temp, label_set = dict(), set()
            nl += len(self._inv_labels[i])
            for j in range(nx):
                # The temporary label of a node is its own label followed
                # by the sorted labels of its neighbors.
                L_temp[j] = dict()
                for v in Gs_ed[j].keys():
                    credential = str(L[j][v]) + "," + \
                        str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                    L_temp[j][v] = credential
                    if credential not in self._inv_labels[i]:
                        label_set.add(credential)

            # Enumerate the labels that were not seen while fitting
            WL_labels_inverse = dict()
            for dv in sorted(label_set):
                WL_labels_inverse[dv] = len(WL_labels_inverse) + nl

            # Recalculate labels
            new_graphs = list()
            for j in range(nx):
                new_labels = dict()
                for (k, v) in iteritems(L_temp[j]):
                    if v in self._inv_labels[i]:
                        new_labels[k] = self._inv_labels[i][v]
                    else:
                        new_labels[k] = WL_labels_inverse[v]
                L[j] = new_labels
                # Create the new graphs with the new labels.
                new_graphs.append([Gs_ed[j], new_labels])
            yield new_graphs

    if self._parallel is None:
        # Calculate the kernel matrix without parallelization.  The
        # per-iteration matrices are collected in a list first:
        # ``np.sum`` on a generator is deprecated and ignores ``axis``.
        K = np.sum(
            [self.X[i].transform(g) for (i, g)
             in enumerate(generate_graphs(WL_labels_inverse, nl))],
            axis=0)
    else:
        # Calculate the kernel matrix with parallelization
        K = np.sum(self._parallel(
            joblib.delayed(etransform)(self.X[i], g) for (i, g)
            in enumerate(generate_graphs(WL_labels_inverse, nl))), axis=0)

    self._is_transformed = True
    if self.normalize:
        X_diag, Y_diag = self.diagonal()
        # Division warnings are silenced only for this statement and the
        # previous settings are restored even if it raises.
        with np.errstate(divide='ignore'):
            K = np.nan_to_num(
                np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))

    return K
def parse_input(self, X):
    """Parse and create features for the `subgraph_matching` kernel.

    Parameters
    ----------
    X : iterable
        Each element must be either a ``Graph`` object or an iterable with
        exactly three items: a valid graph structure (adjacency matrix or
        edge_dictionary), the node labels and the edge labels.  Empty
        iterables are skipped with a warning.

    Returns
    -------
    out : list
        One tuple ``(n, Er, L, Le)`` per parsed graph: the value of
        ``Graph.nv()``, the set of edges ``(a, b)`` with ``a != b``, the
        node labels and the edge labels.  The node (edge) labels are
        requested with ``return_none=True`` when ``self.kv``
        (``self.ke``) is ``None``.

    Raises
    ------
    TypeError
        If `X` is not iterable or one of its elements has an unsupported
        form.
    ValueError
        If no graph could be parsed from `X`.

    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc`` on every Python 3 and in ``collections`` on 2.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    if not isinstance(X, Iterable):
        raise TypeError('input must be an iterable\n')

    i = 0
    out = list()
    for (idx, x) in enumerate(iter(X)):
        is_iter = False
        if isinstance(x, Iterable):
            is_iter = True
            x = list(x)

        # An iterable element has just been turned into a list, so the two
        # accepted forms below are mutually exclusive.
        if is_iter and len(x) in [0, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element' +
                              ' on index: ' + str(idx))
                continue
            g = Graph(x[0], x[1], x[2], "adjacency")
            g.change_format(self._graph_format)
        elif type(x) is Graph:
            # Work on a copy so the caller's graph is left untouched.
            g = Graph(
                x.get_adjacency_matrix(),
                x.get_labels(purpose="adjacency"),
                x.get_labels(purpose="adjacency", label_type="edge"),
                self._graph_format)
        else:
            raise TypeError('each element of X must be either a ' +
                            'graph object or a list with at least ' +
                            'a graph like object and node, ' +
                            'edge labels dict \n')

        n = g.nv()
        E = g.get_edge_dictionary()
        L = g.get_labels(purpose="dictionary",
                         return_none=(self.kv is None))
        Le = g.get_labels(purpose="dictionary",
                          label_type="edge",
                          return_none=(self.ke is None))
        # Edges as (source, target) tuples, self-loops excluded.
        Er = set((a, b) for a in E.keys() for b in E[a].keys() if a != b)

        i += 1
        out.append((n, Er, L, Le))

    if i == 0:
        raise ValueError('parsed input is empty')
    return out
def transform(self, X):
    """Calculate the kernel matrix between the given and the fitted dataset.

    Parameters
    ----------
    X : iterable
        The test samples. Each element must be either a ``Graph`` object
        or an iterable of two or three items: a valid graph structure
        (adjacency matrix or edge_dictionary) followed by the node labels.
        A third item, if present, is not used here. Empty iterables are
        skipped with a warning.

    Returns
    -------
    K : numpy array, shape = [n_targets, n_input_graphs]
        The kernel matrix: one row per graph parsed from `X`, one column
        per fitted graph.

    Raises
    ------
    ValueError
        If `X` is None, is not iterable, contains an element of an
        unsupported kind, or yields no graphs.

    Notes
    -----
    Labels that were not seen during fit are inserted into
    ``self._hierarchy`` through ``self._insert_into_hierarchy``. The vector
    representation of the parsed graphs is stored in ``self.Y`` and
    ``self._is_transformed`` is set to True.

    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc` since 3.3. Keep a fallback for Python 2.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    self._method_calling = 3
    # Check if fit had been called
    check_is_fitted(self, ['X', '_nx', '_hierarchy', '_inv_labels'])

    # Input validation and parsing
    if X is None:
        raise ValueError('transform input cannot be None')
    if not isinstance(X, Iterable):
        raise ValueError('input must be an iterable\n')

    nx = 0
    distinct_values = set()
    Gs_ed, L = dict(), dict()
    for (idx, x) in enumerate(iter(X)):
        is_iter = isinstance(x, Iterable)
        if is_iter:
            x = list(x)
        if is_iter and len(x) in [0, 2, 3]:
            if len(x) == 0:
                warnings.warn('Ignoring empty element on index: '
                              + str(idx))
                continue
            # Only the structure and the node labels are passed on.
            x = Graph(x[0], x[1], {}, self._graph_format)
        elif type(x) is Graph:
            x.desired_format("dictionary")
        else:
            raise ValueError('each element of X must have at '
                             'least one and at most 3 elements\n')
        Gs_ed[nx] = x.get_edge_dictionary()
        L[nx] = x.get_labels(purpose="dictionary")

        # Hold all the original labels that were not seen during fit
        distinct_values.update(
            v for v in itervalues(L[nx])
            if v not in self._inv_labels[0])
        nx += 1
    if nx == 0:
        raise ValueError('parsed input is empty')

    # New labels are numbered after every label stored at fit time.
    WL_labels_inverse = dict()
    label_count = sum(len(self._inv_labels[i])
                      for i in range(len(self._inv_labels)))
    for dv in sorted(distinct_values):
        WL_labels_inverse[dv] = label_count
        self._insert_into_hierarchy(label_count, 'root')
        label_count += 1

    # Iteration 0: map the original labels to their numeric ids.
    known = self._inv_labels[0]
    for j in range(nx):
        L[j] = {k: (known[v] if v in known else WL_labels_inverse[v])
                for (k, v) in iteritems(L[j])}

    for i in range(1, self._n_iter):
        known = self._inv_labels[i]
        L_temp, new_previous_label_set = dict(), set()
        for j in range(nx):
            # Build, for every vertex, the string made of its own label
            # and the sorted labels of its neighbors.
            L_temp[j] = dict()
            for v in Gs_ed[j].keys():
                credential = str(L[j][v]) + "," + \
                    str(sorted([L[j][n] for n in Gs_ed[j][v].keys()]))
                L_temp[j][v] = credential
                if credential not in known:
                    # Remember the previous label: it becomes the parent
                    # of the new label in the hierarchy.
                    new_previous_label_set.add((credential, L[j][v]))

        # Number the unseen credentials and attach them to their parent
        WL_labels_inverse = dict()
        for dv, previous_label in sorted(new_previous_label_set,
                                         key=lambda tup: tup[0]):
            WL_labels_inverse[dv] = label_count
            self._insert_into_hierarchy(label_count, previous_label)
            label_count += 1

        # Recalculate labels
        for j in range(nx):
            L[j] = {k: (known[v] if v in known else WL_labels_inverse[v])
                    for (k, v) in iteritems(L_temp[j])}

    # Compute the vector representation of each graph
    if self.sparse:
        Hs = lil_matrix((nx, len(self._hierarchy)))
    else:
        Hs = np.zeros((nx, len(self._hierarchy)))
    for j in range(nx):
        for k in L[j].keys():
            # Walk up the parent links, adding each label's `omega`,
            # until a node without a parent is reached.
            current_label = L[j][k]
            while self._hierarchy[current_label]['parent'] is not None:
                Hs[j, current_label] += \
                    self._hierarchy[current_label]['omega']
                current_label = self._hierarchy[current_label]['parent']

    self.Y = Hs

    # Compute the histogram intersection kernel. Rows follow the parsed
    # input (nx), columns the fitted data (self._nx); only the columns of
    # Hs that already existed at fit time are compared. Both branches must
    # fill the full (nx, self._nx) rectangle: the matrix is not symmetric.
    K = np.zeros((nx, self._nx))
    if self.sparse:
        for i in range(nx):
            for j in range(self._nx):
                K[i, j] = np.sum(
                    Hs[i, :self.X.shape[1]].minimum(self.X[j, :]))
    else:
        for i in range(nx):
            for j in range(self._nx):
                K[i, j] = np.sum(
                    np.min([Hs[i, :self.X.shape[1]], self.X[j, :]],
                           axis=0))

    self._is_transformed = True
    if self.normalize:
        X_diag, Y_diag = self.diagonal()
        # errstate restores the previous settings even if an error occurs
        with np.errstate(divide='ignore'):
            K = np.nan_to_num(
                np.divide(K, np.sqrt(np.outer(Y_diag, X_diag))))

    return K