Esempio n. 1
0
def lattice_spanning(data, min_common=10):
    """Partition ``data`` into groups linked through the suffix array.

    Every range yielded by ``suffix_array.connectivity(min_common)`` is
    walked pairwise: for each two adjacent positions the data indices that
    ``suffix_array.G`` reports are merged in a union-find structure.  Items
    whose indices end up under the same root form one group.

    Args:
        data: Either a ``SuffixArray`` (its ``.data`` attribute is then used
            as the item sequence) or a sequence of items from which a
            ``SuffixArray`` is built.
        min_common: Threshold forwarded unchanged to
            ``SuffixArray.connectivity``.
            NOTE(review): presumably the minimum shared length for two
            suffixes to count as connected -- confirm against SuffixArray.

    Returns:
        A list of non-empty lists of items from ``data``.  Groups are
        ordered by the index of their union-find root, and items inside a
        group keep their original relative order.
    """
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    # parent[i] is the union-find parent of data index i.  It has to be a
    # real list because it is mutated in place (``range`` objects are
    # immutable on Python 3).
    parent = list(range(len(data)))

    def find(x):
        # Iterative on purpose: a long chain of unions would otherwise
        # overflow the interpreter's recursion limit.
        root = x
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root.
        while x != root:
            following = parent[x]
            parent[x] = root
            x = following
        return root

    for conn in suffix_array.connectivity(min_common):
        # Merge each pair of neighbouring positions inside the range.
        for i in range(conn.start + 1, conn.stop):
            a = suffix_array.G[suffix_array[i - 1]]
            b = suffix_array.G[suffix_array[i]]
            root_b = find(b)
            parent[find(a)] = root_b

    # Bucket every item under the root of its index, then drop the buckets
    # that belong to non-root indices (they stay empty).
    buckets = [[] for _ in parent]
    for i in range(len(parent)):
        buckets[find(i)].append(data[i])
    return [bucket for bucket in buckets if bucket]
Esempio n. 2
0
def lattice_spanning(data, min_common = 10):
	"""Group the items of ``data`` that the suffix array connects.

	For each range produced by ``suffix_array.connectivity(min_common)``,
	the data indices found through ``suffix_array.G`` for every two
	adjacent positions are joined in a disjoint-set forest.  The items are
	then collected per disjoint set.

	Args:
		data: A ``SuffixArray`` (in which case ``data.data`` supplies the
			items) or a sequence of items used to build one.
		min_common: Passed straight through to
			``SuffixArray.connectivity``.
			NOTE(review): looks like a minimum common-prefix length --
			verify against SuffixArray.

	Returns:
		A list of non-empty lists of items.  Groups appear in order of
		their root index; items within a group keep their input order.
	"""
	if isinstance(data, SuffixArray):
		suffix_array = data
		data = suffix_array.data
	else:
		suffix_array = SuffixArray(data)

	# Disjoint-set parent table.  Built as a list because entries are
	# reassigned below, which a Python 3 ``range`` object does not allow.
	parent = list(range(len(data)))

	def find(x):
		# Loop instead of recursing so deep parent chains cannot hit the
		# recursion limit.
		root = x
		while parent[root] != root:
			root = parent[root]
		# Compress the path that was just walked.
		while x != root:
			following = parent[x]
			parent[x] = root
			x = following
		return root

	for conn in suffix_array.connectivity(min_common):
		for i in range(conn.start + 1, conn.stop):
			a = suffix_array.G[suffix_array[i - 1]]
			b = suffix_array.G[suffix_array[i]]
			root_b = find(b)
			parent[find(a)] = root_b

	# One bucket per index; only buckets at root indices receive items.
	buckets = [[] for _ in parent]
	for i in range(len(parent)):
		buckets[find(i)].append(data[i])
	return [bucket for bucket in buckets if bucket]
Esempio n. 3
0
def dense_spanning(data, min_common = 8):
	"""Cluster ``data`` with ``spanning_tree`` over thresholded graph entries.

	The entries of ``suffix_array.similarity_graph()`` whose element at
	index 1 is at least ``2 ** min_common`` are handed, together with
	``len(data)``, to ``spanning_tree``.  Each row of indices it returns is
	translated back into the corresponding items of ``data``.

	Args:
		data: A ``SuffixArray`` (``data.data`` then supplies the items) or a
			sequence of items used to build one.
		min_common: Exponent of the cut-off; entries below
			``2 ** min_common`` are discarded.
			NOTE(review): element 1 of a graph entry is presumably the
			edge weight -- confirm against SuffixArray.similarity_graph.

	Returns:
		A list of lists of items, one inner list per row returned by
		``spanning_tree``.
	"""
	if isinstance(data, SuffixArray):
		suffix_array = data
		data = suffix_array.data
	else:
		suffix_array = SuffixArray(data)

	threshold = 2 ** min_common
	# Real lists rather than filter()/map(): on Python 3 those are lazy
	# iterators, and callers take len() of every returned cluster.
	kept = [entry for entry in suffix_array.similarity_graph()
			if entry[1] >= threshold]
	rows = spanning_tree(kept, len(data))
	return [[data[i] for i in row] for row in rows]
Esempio n. 4
0
def dense_spanning(data, min_common=8):
    """Cluster ``data`` via ``spanning_tree`` on a thresholded graph.

    Takes ``suffix_array.similarity_graph()``, keeps the entries whose
    element at index 1 is at least ``2 ** min_common``, and passes them with
    ``len(data)`` to ``spanning_tree``.  Every row of indices that comes
    back is mapped to the matching items of ``data``.

    Args:
        data: Either a ``SuffixArray`` (its ``.data`` attribute supplies the
            items) or a sequence of items from which one is built.
        min_common: Exponent of the cut-off applied to graph entries;
            entries below ``2 ** min_common`` are dropped.
            NOTE(review): element 1 of an entry looks like a similarity
            weight -- verify against SuffixArray.similarity_graph.

    Returns:
        A list of lists of items, one inner list per row returned by
        ``spanning_tree``.
    """
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    threshold = 2 ** min_common
    # Comprehensions instead of filter()/map(): those return lazy iterators
    # on Python 3, while callers need len() on each returned cluster.
    kept = [
        entry for entry in suffix_array.similarity_graph()
        if entry[1] >= threshold
    ]
    rows = spanning_tree(kept, len(data))
    return [[data[i] for i in row] for row in rows]
Esempio n. 5
0
def double_cluster(data,
                   min_common=3,
                   step=1,
                   eps=0.4,
                   leaf_size=60,
                   algorithm='lattice',
                   heirarchy=True):
    """Recursively cluster ``data``, re-splitting clusters that are too loose.

    A draft clustering is produced by ``lattice_spanning`` or
    ``dense_spanning``.  Each draft cluster is accepted as is when it has at
    most ``leaf_size`` items and ``edit_radius(cluster, eps) <= eps``;
    otherwise it is clustered again with ``min_common + step``.

    Args:
        data: A ``SuffixArray`` (its ``.data`` supplies the items) or a
            sequence of items.  A plain sequence of length 1 is returned
            unchanged.
        min_common: Threshold forwarded to the drafting function.
        step: Amount added to ``min_common`` on every retry or recursive
            split.
        eps: Passed to ``edit_radius`` and used as the upper bound on its
            result for a cluster to be accepted.
        leaf_size: Largest cluster size accepted without further splitting.
        algorithm: ``'lattice'`` or ``'dense'``; selects the drafting
            function.
        heirarchy: When true, the result of re-splitting a cluster is
            appended as one nested list; when false, its clusters are
            spliced into the result.  (The parameter name keeps its
            original spelling so existing keyword callers still work.)

    Returns:
        The clusters sorted by length, longest first.

    Raises:
        ValueError: If ``algorithm`` is not ``'lattice'`` or ``'dense'``.
    """
    # Fail fast on a bad algorithm name; otherwise ``draft`` would never be
    # assigned and the function would die with an UnboundLocalError.
    if algorithm not in ('lattice', 'dense'):
        raise ValueError(
            "unknown algorithm %r; expected 'lattice' or 'dense'" %
            (algorithm,))

    # immediately instantiate into suffix array for later
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        # base case: check first if singleton
        if len(data) == 1:
            return data
        suffix_array = SuffixArray(data)

    # draft initial clusters; materialise so len() works even when the
    # drafting function hands back an iterator
    if algorithm == 'lattice':
        draft = list(lattice_spanning(suffix_array, min_common))
    else:
        draft = list(dense_spanning(suffix_array, min_common))

    if len(draft) == 1:
        # special case
        # no valid clustering found using current parameters
        # reuse suffix array to avoid recomputation
        # NOTE(review): with step == 0 this retries with identical
        # arguments, so it can only terminate via the recursion limit.
        return double_cluster(suffix_array, min_common + step, step, eps,
                              leaf_size, algorithm, heirarchy)

    final_clustering = []

    for subcluster in draft:
        # len() below needs a sized container, not a lazy iterator
        subcluster = list(subcluster)

        # check first if subcluster is expandable or not
        if len(subcluster) <= leaf_size and edit_radius(subcluster,
                                                        eps) <= eps:
            # subcluster is ok, no need for expansion
            final_clustering.append(subcluster)
        else:
            # subcluster can still be expanded
            expanded = double_cluster(subcluster, min_common + step, step, eps,
                                      leaf_size, algorithm, heirarchy)
            if heirarchy:
                final_clustering.append(expanded)
            else:
                final_clustering.extend(expanded)

    # keep big clusters in front
    return sorted(final_clustering, key=len, reverse=True)
Esempio n. 6
0
def spanning_forest(data, n_clusters=2):
    """Cluster ``data`` by running ``spanning_tree`` on the similarity graph.

    ``suffix_array.similarity_graph()`` is passed to ``spanning_tree``
    together with ``len(data)`` and ``n_clusters``; every row of indices it
    returns is mapped back to the matching items of ``data``.

    Args:
        data: Either a ``SuffixArray`` (its ``.data`` attribute supplies the
            items) or a sequence of items from which one is built.
        n_clusters: Forwarded as the third argument of ``spanning_tree``.
            NOTE(review): presumably the number of clusters to produce --
            confirm against spanning_tree.

    Returns:
        A list of lists of items, one inner list per row returned by
        ``spanning_tree``.
    """
    # Unwrap a SuffixArray the same way the sibling clustering functions
    # do, so that len(data) and data[i] refer to the underlying items and
    # not to the suffix array object itself.
    if isinstance(data, SuffixArray):
        suffix_array = data
        data = suffix_array.data
    else:
        suffix_array = SuffixArray(data)

    graph = suffix_array.similarity_graph()
    rows = spanning_tree(graph, len(data), n_clusters)
    # Nested comprehension instead of map(): yields real lists on Python 3.
    return [[data[i] for i in row] for row in rows]
Esempio n. 7
0
 def __init__(self):
     """Store the ``main_2`` attribute of a new ``SuffixArray`` as ``sfxFunc``."""
     suffix_array = SuffixArray()
     self.sfxFunc = suffix_array.main_2