Example #1
    def _train(self, trainset):
        self._dataset = trainset
        self.ulabels = trainset.uniquelabels
        # Do cross-validation for normal classifier
        self.cvterr = CrossValidatedTransferError(TransferError(self._clf),
                                                  self._splitter,
                                                  enable_states=["confusion"])
        self.cvterr(self._dataset)
        # From the confusion matrix, calculate linkage and tree-structure
        # First prepare distance matrix from confusion matrix
        dist = self.cvterr.confusion.matrix
        # Kind of inversion: high values in confusion -> similar -> small distance
        dist = dist.max() - dist
        # Distance must be symmetric (property of a norm)
        dist = (dist + dist.T) / 2
        # Distance to self must be zero -> make diagonal elements zero
        dist -= np.diag(np.diag(dist))
        # Calculate linkage matrix
        self.linkage = hcluster.linkage(hcluster.squareform(dist))
        # Build tree and corresponding TreeClassifier
        self.tree = hcluster.to_tree(self.linkage)
        self._tree_clf = self.build_tree_classifier_from_linkage_tree(self.tree)[0]
        self._tree_clf.train(trainset)
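
The pattern above, turning a classifier's confusion matrix into a pseudo-distance matrix and clustering it, can be tried standalone. A minimal sketch, assuming `hcluster` refers to SciPy's `scipy.cluster.hierarchy` (the standalone `hcluster` package exposes the same API) and using a made-up 4-class confusion matrix:

import numpy as np
import scipy.cluster.hierarchy as hcluster
from scipy.spatial.distance import squareform

# Toy confusion matrix for 4 classes (rows: true labels, columns: predictions).
conf = np.array([[50.,  8.,  1.,  1.],
                 [ 9., 48.,  2.,  1.],
                 [ 1.,  2., 55.,  2.],
                 [ 0.,  1.,  3., 56.]])

dist = conf.max() - conf              # high confusion -> similar -> small distance
dist = (dist + dist.T) / 2.0          # enforce symmetry
dist -= np.diag(np.diag(dist))        # zero the diagonal

linkage_matrix = hcluster.linkage(squareform(dist))   # squareform -> condensed vector
root = hcluster.to_tree(linkage_matrix)
print(root.get_count())               # number of leaves, i.e. classes (4)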
Example #2
def get_clustering_as_tree(vectors, ward = True, clustering_distance='euclidean', clustering_method = 'complete', progress = progress):
    if ward:
        progress.update('Clustering data with Ward linkage and euclidean distances')
        clustering_result = hcluster.ward(vectors)
    else:
        progress.update('Computing distance matrix using "%s" distance' % clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' % clustering_method)
        clustering_result = hcluster.linkage(distance_matrix, method = clustering_method)

    progress.update('Returning results')
    return hcluster.to_tree(clustering_result)
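
A hedged usage sketch for the function above; the `SimpleProgress` stub is an assumption standing in for whatever progress object the caller normally supplies, and `hcluster` is assumed to behave like `scipy.cluster.hierarchy`:

import numpy as np

class SimpleProgress(object):
    """Stand-in (hypothetical) progress object with the update() method used above."""
    def update(self, message):
        print(message)

vectors = np.random.rand(20, 5)   # 20 observations with 5 features each

# Ward branch: hcluster.ward(vectors) is equivalent to linkage(vectors, method='ward')
root = get_clustering_as_tree(vectors, ward=True, progress=SimpleProgress())

# Generic branch: pdist with a chosen metric, then linkage with a chosen method
root = get_clustering_as_tree(vectors, ward=False,
                              clustering_distance='cosine',
                              clustering_method='average',
                              progress=SimpleProgress())
print(root.get_count())           # the ClusterNode root knows its number of leaves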
Example #4
    def __init__(self, similarity_dict):
        self.sim = similarity_dict
        self.keys = []
        self.get_keys()

        self.calculate_distance_matrix()

        self.cluster_method = self.get_cluster_method()

        self.Z = self.cluster_method(self.distance_matrix)
        self.node = hcluster.to_tree(self.Z)
        self.print_node(self.node, self.keys)

        self.image_file = None
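
The helpers called here (`get_keys`, `calculate_distance_matrix`, `get_cluster_method`, `print_node`) are not part of the snippet. A hypothetical sketch of a recursive `print_node`, assuming `keys` maps leaf ids back to the original labels and the node is a SciPy-style `ClusterNode`:

    def print_node(self, node, keys, depth=0):
        """Hypothetical sketch: recursively print a ClusterNode tree with indentation."""
        indent = '  ' * depth
        if node.is_leaf():
            # Leaf ids index the original observations, so map them back to their keys.
            print(indent + str(keys[node.id]))
        else:
            print('%s<merge at distance %.3f>' % (indent, node.dist))
            self.print_node(node.left, keys, depth + 1)
            self.print_node(node.right, keys, depth + 1)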
Example #5
def cluster(items, cache_clustering_file = None, dist_fn = euc_dist, \
    prefix_output = None):

    if not cache_clustering_file:
        print "Generating distance matrix..."
        sys.stdout.flush()
        Y = dist_matrix(items, dist_fn)

        print "Linkage clustering..."
        sys.stdout.flush()
        Z = linkage(Y, "single")  # "single" merges on the minimum inter-cluster distance, "complete" on the maximum, "average" on the mean

        print "Dumping clustering information into cache file"
        sys.stdout.flush()
        cPickle.dump([Y, Z], open(prefix_output + "clustering_dump.pkl", "w"))

    else:
        print "Loading clustering cache from '%s'" % cache_clustering_file.name
        Y, Z = cPickle.load(cache_clustering_file)

    print "Converting into ETE tree..."
    sys.stdout.flush()
    T = to_tree(Z)

    root = Tree()
    root.dist = 0
    root.name = "root"
    item2node = {T: root}

    to_visit = [T]
    while to_visit:
        node = to_visit.pop()
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                #try:
                #  ch.add_features(content = str(items[ch_node.id]))
                #except IndexError:
                #  pass
                ch.dist = cl_dist
                ch.name = str(ch_node.id)
                item2node[node].add_child(ch)
                item2node[ch_node] = ch
                to_visit.append(ch_node)

    return root
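
`euc_dist` and `dist_matrix` are referenced above but not shown. Hypothetical sketches of both, producing the condensed (upper-triangle, row-major) distance vector that `linkage()` expects:

import math

def euc_dist(a, b):
    """Hypothetical: Euclidean distance between two equal-length feature vectors."""
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

def dist_matrix(items, dist_fn):
    """Hypothetical: condensed distance vector d(0,1), d(0,2), ..., d(n-2,n-1)."""
    Y = []
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            Y.append(dist_fn(items[i], items[j]))
    return Y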
Example #6
    def _train(self, dataset):
        self._dataset = dataset
        self.ulabels = self._dataset.uniquelabels
        # Do cross-validation for normal classifier
        self.cvterr = CrossValidatedTransferError(TransferError(self._clf),
                                                  self._splitter,
                                                  enable_states=["confusion"])
        self.cvterr(self._dataset)
        # From the confusion matrix, calculate linkage and tree-structure
        # First prepare distance matrix from confusion matrix
        dist = self.cvterr.confusion.matrix
        # Distance must be symmetric (property of a norm)
        dist = (dist + dist.T) / 2
        # Kind of inversion: high values in confusion -> similar -> small distance
        dist = dist.max() - dist
        # Distance to self must be zero -> make diagonal elements zero
        dist -= np.diag(np.diag(dist))
        # Calculate linkage matrix
        self.linkage = hcluster.linkage(hcluster.squareform(dist))
        # Build tree and corresponding TreeClassifier
        self.tree = hcluster.to_tree(self.linkage)
        self._tree_clf = self.build_tree_classifier_from_linkage_tree(self.tree)[0]
        self._tree_clf.train(self._dataset)
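
The three preprocessing lines exist because `squareform` (with its default checks) only accepts a square matrix that is symmetric with a zero diagonal. A small demonstration, assuming SciPy's implementation:

import numpy as np
from scipy.spatial.distance import squareform

m = np.array([[0., 1., 4.],
              [1., 0., 2.],
              [4., 2., 0.]])

condensed = squareform(m)        # array([1., 4., 2.]): the upper triangle, row by row
assert np.allclose(squareform(condensed), m)   # round-trips back to the square form

# An asymmetric matrix or a non-zero diagonal is rejected when checks=True (the
# default), which is why the confusion matrix is symmetrized and its diagonal
# zeroed before calling hcluster.squareform above.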
Example #7
def get_clustering_as_tree(vectors,
                           ward=True,
                           clustering_distance='euclidean',
                           clustering_method='complete',
                           progress=progress):
    if ward:
        progress.update(
            'Clustering data with Ward linkage and euclidean distances')
        clustering_result = hcluster.ward(vectors)
    else:
        progress.update('Computing distance matrix using "%s" distance' %
                        clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' %
                        clustering_method)
        clustering_result = hcluster.linkage(distance_matrix,
                                             method=clustering_method)

    progress.update('Returning results')
    return hcluster.to_tree(clustering_result)
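
Once `to_tree` returns, the resulting `ClusterNode` can be inspected directly. A short sketch of the traversal helpers, assuming `hcluster` mirrors `scipy.cluster.hierarchy`:

import numpy as np
import scipy.cluster.hierarchy as hcluster

vectors = np.random.rand(10, 3)
root = hcluster.to_tree(hcluster.ward(vectors))

print(root.get_count())    # number of leaves (10 here)
print(root.pre_order())    # leaf ids in pre-order, handy for reordering rows

# to_tree can also hand back every node, not just the root:
root, nodes = hcluster.to_tree(hcluster.ward(vectors), rd=True)
print(len(nodes))          # 2 * n_leaves - 1 nodes in a binary merge tree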
Example #8
def cluster_motifs(total_ranks_pickle, output_pickle, motif_dir='tmp/motif_dir/'):
	"""
	Input should be a total_ranks pickle outputted from cluster4
	where it's a sorted list of (rank_index,fam,count,motif_filename,data,count_o,motif_size)s
	"""
	from hcluster import linkage, to_tree
	motifs = []
	distance = []
	total_ranks = load(open(total_ranks_pickle))
	for rank_index,fam,count,motif_filename,data,count_o,motif_size in total_ranks:
		if rank_index < 300:
			continue # TODO:delete or change later?
		motifs.append({'file': motif_filename, 'info': "{0}_{1}".format(fam,count)})
	print >> sys.stderr, "comparing {0} motifs.....".format(len(motifs))
	for i in xrange(len(motifs)):
		print >> sys.stderr, "{0}/{1}".format(i,len(motifs))
		for j in xrange(i+1, len(motifs)):  # i < j keeps the condensed (row-major upper-triangle) order that linkage() expects
			f1 = motifs[i]['file']
			f2 = motifs[j]['file']
			f1 = motif_dir + f1[:f1.find('.fna')] + '/' + f1
			f2 = motif_dir + f2[:f2.find('.fna')] + '/' + f2
			score = compare_motif(f1, f2)['score']
			# The bigger the score (more similar), the smaller the distance,
			# so invert the score as 10000/score.
			# A sentinel score of -9999999 gets a large fixed distance of 999999 instead.
			if score > 0:
				distance.append( 10000./score )
			else:
				distance.append( 999999. )
	Z = linkage(distance, method='average')
	# delete below later
	f = open(output_pickle,'w')
	dump({'motifs':motifs,'Z':Z}, f)
	f.close()
	# delete above later
	# now Z is the linkage matrix, convert it to newick with proper naming
	root = to_tree(Z)
	print(hcluster_cnode_to_newick(root, motifs))
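
`hcluster_cnode_to_newick` is not included in this snippet. A hypothetical recursive version that labels each leaf with the `info` field collected above (a sketch, not the original helper):

def hcluster_cnode_to_newick(node, motifs):
    """Hypothetical: convert the ClusterNode from to_tree() into a Newick string,
    naming each leaf by the 'info' field of the corresponding motif."""
    if node.is_leaf():
        return motifs[node.id]['info']
    left = hcluster_cnode_to_newick(node.left, motifs)
    right = hcluster_cnode_to_newick(node.right, motifs)
    # Use half the merge height as a simple branch length for both children;
    # append ';' at the top level if a full Newick file is needed.
    half = node.dist / 2.0
    return "(%s:%.2f,%s:%.2f)" % (left, half, right, half)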
Example #9
rowHeaders[N] = clase
npTmp = np.array(dataTmp)
dataMatrix.append(np.average(npTmp, axis=0))

print "Se cargaron los datos correctamente ..."
print "Calculando la matriz de distancia ..."
dataMatrix = np.array(dataMatrix)
distanceMatrix = pdist(dataMatrix, "mahalanobis")

print "Iniciando linkage. "
linkageMatrix = fastcluster.linkage(distanceMatrix, "complete")

# plot_with_labels(linkageMatrix, 4)

print "Convirtiendo arbol generado en estructura ETE ..."
tree = to_tree(linkageMatrix)
# Ete section
eimera = Tree()
eimera.dist = 0
eimera.name = "eimera"
item2node = {tree: eimera}

to_visit = [tree]

while to_visit:
    node = to_visit.pop()
    cl_dist = node.dist / 2.0
    for ch_node in [node.left, node.right]:
        if ch_node:
            ch = Tree()
            ch.dist = cl_dist