def lattice_spanning(data, min_common=10):
    """Cluster the items of ``data`` with a union-find over suffix-array runs.

    Every range yielded by ``suffix_array.connectivity(min_common)`` is
    walked pairwise; the items owning two adjacent suffix-array entries
    (looked up through ``suffix_array.G``) are merged into one set.

    Args:
        data: Either a ``SuffixArray`` (reused as-is, its ``.data`` becomes
            the item sequence) or a sequence of items from which a
            ``SuffixArray`` is built.
        min_common: Forwarded to ``SuffixArray.connectivity``; presumably
            the minimum shared length for two suffixes to count as
            connected -- TODO confirm against ``SuffixArray``.

    Returns:
        A list of non-empty lists. Each inner list holds the items of one
        connected set in their original order; sets are ordered by the index
        of their union-find root.
    """
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    # parent[i] is the union-find parent of item i; a root is its own parent.
    # list(...) keeps the table assignable on Python 3, where range() is
    # an immutable sequence.
    parent = list(range(len(data)))

    def find(x):
        # Iterative on purpose: a long parent chain must not be able to
        # exhaust the interpreter's recursion limit.
        root = x
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root.
        while parent[x] != root:
            next_x = parent[x]
            parent[x] = root
            x = next_x
        return root

    for conn in suffix_array.connectivity(min_common):
        # Merge the owners of each adjacent pair of entries inside the run.
        for i in range(conn.start + 1, conn.stop):
            a = suffix_array.G[suffix_array[i - 1]]
            b = suffix_array.G[suffix_array[i]]
            root_b = find(b)
            parent[find(a)] = root_b

    # Bucket every item under its root, then drop the buckets of non-roots.
    groups = [[] for _ in parent]
    for index in range(len(parent)):
        groups[find(index)].append(data[index])
    return [group for group in groups if group]
def lattice_spanning(data, min_common=10):
    """Cluster the items of ``data`` with a union-find over suffix-array runs.

    Every range yielded by ``suffix_array.connectivity(min_common)`` is
    walked pairwise; the items owning two adjacent suffix-array entries
    (looked up through ``suffix_array.G``) are merged into one set.

    Args:
        data: Either a ``SuffixArray`` (reused as-is, its ``.data`` becomes
            the item sequence) or a sequence of items from which a
            ``SuffixArray`` is built.
        min_common: Forwarded to ``SuffixArray.connectivity``; presumably
            the minimum shared length for two suffixes to count as
            connected -- TODO confirm against ``SuffixArray``.

    Returns:
        A list of non-empty lists. Each inner list holds the items of one
        connected set in their original order; sets are ordered by the index
        of their union-find root.
    """
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    # parent[i] is the union-find parent of item i; a root is its own parent.
    # list(...) keeps the table assignable on Python 3, where range() is
    # an immutable sequence.
    parent = list(range(len(data)))

    def find(x):
        # Iterative on purpose: a long parent chain must not be able to
        # exhaust the interpreter's recursion limit.
        root = x
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root.
        while parent[x] != root:
            next_x = parent[x]
            parent[x] = root
            x = next_x
        return root

    for conn in suffix_array.connectivity(min_common):
        # Merge the owners of each adjacent pair of entries inside the run.
        for i in range(conn.start + 1, conn.stop):
            a = suffix_array.G[suffix_array[i - 1]]
            b = suffix_array.G[suffix_array[i]]
            root_b = find(b)
            parent[find(a)] = root_b

    # Bucket every item under its root, then drop the buckets of non-roots.
    groups = [[] for _ in parent]
    for index in range(len(parent)):
        groups[find(index)].append(data[index])
    return [group for group in groups if group]
def dense_spanning(data, min_common=8):
    """Cluster ``data`` from the thresholded similarity graph of its suffix array.

    Edges of ``suffix_array.similarity_graph()`` whose second field is at
    least ``2 ** min_common`` are handed to ``spanning_tree`` together with
    the number of items; the index rows it returns are mapped back to items.

    Args:
        data: Either a ``SuffixArray`` (reused as-is, its ``.data`` becomes
            the item sequence) or a sequence of items from which a
            ``SuffixArray`` is built.
        min_common: Exponent of the edge threshold ``2 ** min_common``.

    Returns:
        A list of lists of items, one inner list per row returned by
        ``spanning_tree``.
    """
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    threshold = 2 ** min_common
    # edge[1] is presumably the similarity weight -- TODO confirm against
    # SuffixArray.similarity_graph. A comprehension (rather than filter)
    # hands spanning_tree a real list on Python 2 and Python 3 alike.
    edges = [
        edge for edge in suffix_array.similarity_graph()
        if edge[1] >= threshold
    ]
    index_rows = spanning_tree(edges, len(data))
    # Nested comprehension so each cluster is a concrete list, not a lazy
    # map object, regardless of interpreter version.
    return [[data[i] for i in row] for row in index_rows]
def dense_spanning(data, min_common=8):
    """Cluster ``data`` from the thresholded similarity graph of its suffix array.

    Edges of ``suffix_array.similarity_graph()`` whose second field is at
    least ``2 ** min_common`` are handed to ``spanning_tree`` together with
    the number of items; the index rows it returns are mapped back to items.

    Args:
        data: Either a ``SuffixArray`` (reused as-is, its ``.data`` becomes
            the item sequence) or a sequence of items from which a
            ``SuffixArray`` is built.
        min_common: Exponent of the edge threshold ``2 ** min_common``.

    Returns:
        A list of lists of items, one inner list per row returned by
        ``spanning_tree``.
    """
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    threshold = 2 ** min_common
    # edge[1] is presumably the similarity weight -- TODO confirm against
    # SuffixArray.similarity_graph. A comprehension (rather than filter)
    # hands spanning_tree a real list on Python 2 and Python 3 alike.
    edges = [
        edge for edge in suffix_array.similarity_graph()
        if edge[1] >= threshold
    ]
    index_rows = spanning_tree(edges, len(data))
    # Nested comprehension so each cluster is a concrete list, not a lazy
    # map object, regardless of interpreter version.
    return [[data[i] for i in row] for row in index_rows]
def double_cluster(data, min_common=3, step=1, eps=0.4, leaf_size=60,
                   algorithm='lattice', heirarchy=True):
    """Recursively cluster ``data``, tightening ``min_common`` at each level.

    A draft clustering is produced by ``lattice_spanning`` or
    ``dense_spanning``. Draft clusters that are small enough
    (``len <= leaf_size``) and whose ``edit_radius(cluster, eps)`` does not
    exceed ``eps`` are kept; every other cluster is clustered again with
    ``min_common + step``.

    Args:
        data: A ``SuffixArray`` (reused to avoid recomputation) or a
            sequence of items.
        min_common: Passed to the drafting function for this level.
        step: Increment applied to ``min_common`` on each recursive call.
            NOTE(review): with ``step == 0`` a one-cluster draft repeats the
            identical call and never terminates.
        eps: Passed to ``edit_radius`` and used as the bound on its result.
        leaf_size: Largest cluster size accepted without further splitting.
        algorithm: ``'lattice'`` or ``'dense'``; selects the drafting
            function.
        heirarchy: If true, the result of splitting a cluster is appended as
            one nested element; otherwise its elements are spliced into the
            flat result. (Spelling kept for interface compatibility.)

    Returns:
        ``data`` itself when it holds exactly one item; otherwise a list of
        clusters sorted by length, largest first.

    Raises:
        ValueError: If ``algorithm`` is neither ``'lattice'`` nor ``'dense'``.
    """
    # Accept a ready-made suffix array so recursive calls can reuse it.
    suffix_array = data if isinstance(data, SuffixArray) else None
    if suffix_array is not None:
        data = suffix_array.data

    # Base case: a single item cannot be split, whichever form it arrived
    # in. lattice_spanning yields exactly one group for one item, so letting
    # a one-item SuffixArray through would recurse without end.
    if len(data) == 1:
        return data

    if suffix_array is None:
        suffix_array = SuffixArray(data)

    # Draft the initial clusters.
    if algorithm == 'lattice':
        draft = lattice_spanning(suffix_array, min_common)
    elif algorithm == 'dense':
        draft = dense_spanning(suffix_array, min_common)
    else:
        raise ValueError(
            "unknown algorithm %r; expected 'lattice' or 'dense'"
            % (algorithm,))

    if len(draft) == 1:
        # Nothing was separated at this level: retry with a stricter
        # min_common, reusing the suffix array instead of rebuilding it.
        return double_cluster(suffix_array, min_common + step, step, eps,
                              leaf_size, algorithm, heirarchy)

    final_clustering = []
    for subcluster in draft:
        if len(subcluster) <= leaf_size and edit_radius(subcluster, eps) <= eps:
            # Small and tight enough: keep as a leaf.
            final_clustering.append(subcluster)
            continue
        # Otherwise split the cluster further at the next level.
        expanded = double_cluster(subcluster, min_common + step, step, eps,
                                  leaf_size, algorithm, heirarchy)
        if heirarchy:
            final_clustering.append(expanded)
        else:
            final_clustering.extend(expanded)

    # Keep big clusters in front.
    return sorted(final_clustering, key=len, reverse=True)
def spanning_forest(data, n_clusters=2):
    """Split ``data`` into clusters using ``spanning_tree`` on its similarity graph.

    Args:
        data: Either a ``SuffixArray`` (reused as-is, its ``.data`` becomes
            the item sequence) or a sequence of items from which a
            ``SuffixArray`` is built.
        n_clusters: Forwarded as the third argument of ``spanning_tree``;
            presumably the requested number of clusters -- TODO confirm.

    Returns:
        A list of lists of items, one inner list per row returned by
        ``spanning_tree``.
    """
    # Unwrap a SuffixArray the same way lattice_spanning and dense_spanning
    # do, so that len(data) and data[i] below refer to the underlying items
    # rather than to the suffix array object itself.
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    graph = suffix_array.similarity_graph()
    index_rows = spanning_tree(graph, len(data), n_clusters)
    # Nested comprehension so each cluster is a concrete list on both
    # Python 2 and Python 3.
    return [[data[i] for i in row] for row in index_rows]
def __init__(self):
    """Store the suffix routine used by this instance.

    A ``SuffixArray`` is created with no arguments and its bound ``main_2``
    method is kept on ``self.sfxFunc``.
    """
    suffix_array = SuffixArray()
    self.sfxFunc = suffix_array.main_2