def _train(self, trainset):
    """Train on *trainset* and derive a tree classifier from the confusions.

    Cross-validates ``self._clf`` on the dataset using ``self._splitter``,
    converts the resulting confusion matrix into a distance matrix, clusters
    it with ``hcluster.linkage`` and trains the tree classifier built from
    that clustering.

    Sets ``self._dataset``, ``self.ulabels``, ``self.cvterr``,
    ``self.linkage``, ``self.tree`` and ``self._tree_clf``.

    :param trainset: dataset to train on; must expose ``uniquelabels``.
    """
    self._dataset = trainset
    self.ulabels = trainset.uniquelabels

    # Do cross-validation for normal classifier
    self.cvterr = CrossValidatedTransferError(
        TransferError(self._clf),
        self._splitter,
        enable_states=["confusion"])
    self.cvterr(self._dataset)

    # From the confusion matrix, calculate linkage and tree-structure.
    # First prepare distance matrix from confusion matrix.
    dist = self.cvterr.confusion.matrix
    # Kind of inversion. High values in confusion -> similar -> small distance
    dist = dist.max() - dist
    # Distance must be symmetric (property of a norm).
    # Divide by 2.0, not 2: the confusion matrix presumably holds integer
    # counts (confirm), and under Python 2 integer division would silently
    # truncate every half-integer distance.
    dist = (dist + dist.T) / 2.0
    # Distance to self must be zero -> make diagonal elements zero
    dist -= np.diag(np.diag(dist))

    # Calculate linkage matrix (squareform turns the square matrix into the
    # condensed form that linkage takes)
    self.linkage = hcluster.linkage(hcluster.squareform(dist))

    # Build tree and according TreeClassifier; only the first element of the
    # builder's result is used.
    self.tree = hcluster.to_tree(self.linkage)
    self._tree_clf = self.build_tree_classifier_from_linkage_tree(
        self.tree)[0]
    self._tree_clf.train(trainset)
def get_clustering_as_tree(vectors, ward = True, clustering_distance='euclidean', clustering_method = 'complete', progress = progress):
    """Cluster *vectors* hierarchically and return the result as a tree.

    With ``ward`` true the vectors go straight to ``hcluster.ward``.
    Otherwise a pairwise distance matrix is computed with
    ``clustering_distance`` and clustered by ``hcluster.linkage`` using
    ``clustering_method``. Each stage is announced via ``progress.update``.

    Returns whatever ``hcluster.to_tree`` produces for the linkage result.
    """
    if not ward:
        progress.update('Computing distance matrix using "%s" distance' % clustering_distance)
        pairwise = hcluster.pdist(vectors, clustering_distance)

        progress.update('Clustering data with "%s" linkage' % clustering_method)
        merges = hcluster.linkage(pairwise, method = clustering_method)
    else:
        progress.update('Clustering data with Ward linkage and euclidean distances')
        merges = hcluster.ward(vectors)

    progress.update('Returning results')
    return hcluster.to_tree(merges)
def __init__(self, similarity_dict):
    """Cluster the entries of *similarity_dict* and print the resulting tree.

    Stores the dictionary as ``self.sim``, then runs the instance's own
    helpers in order: ``get_keys``, ``calculate_distance_matrix`` and
    ``get_cluster_method``. The chosen method is applied to
    ``self.distance_matrix``; the result is kept as ``self.Z`` and converted
    with ``hcluster.to_tree`` into ``self.node``, which is passed to
    ``print_node`` together with ``self.keys``.

    NOTE(review): presumably ``get_keys`` fills ``self.keys`` and
    ``calculate_distance_matrix`` sets ``self.distance_matrix``; neither
    helper is visible here, so confirm against their definitions.
    """
    self.sim = similarity_dict

    # Reset the key list before the helper runs.
    self.keys = []
    self.get_keys()
    self.calculate_distance_matrix()

    # Pick the clustering callable, then apply it to the distance matrix.
    method = self.get_cluster_method()
    self.cluster_method = method
    merges = method(self.distance_matrix)
    self.Z = merges

    # Convert the clustering result into a tree and print it.
    root = hcluster.to_tree(merges)
    self.node = root
    self.print_node(root, self.keys)

    # Assigned last, after printing, exactly as before.
    self.image_file = None
def cluster(items, cache_clustering_file = None, dist_fn = euc_dist,
            prefix_output = None):
    """Single-linkage cluster *items* and return the hierarchy as an ETE tree.

    :param items: objects to cluster; passed to ``dist_matrix`` with
        ``dist_fn``. Unused when a cache file is supplied.
    :param cache_clustering_file: open file object holding a pickled
        ``[Y, Z]`` pair from an earlier run. When given, the distance matrix
        and linkage are loaded from it instead of being recomputed.
    :param dist_fn: distance function handed to ``dist_matrix``.
    :param prefix_output: string prepended to ``"clustering_dump.pkl"`` to
        form the path the fresh ``[Y, Z]`` pair is pickled to. ``None`` is
        treated as an empty prefix (it used to raise ``TypeError``).
    :returns: a ``Tree`` named ``"root"``. Every other node is named after
        the id of its cluster node, and its branch length is half the
        ``dist`` of its parent cluster node.
    """
    if not cache_clustering_file:
        print("Generating distance matrix...")
        sys.stdout.flush()
        Y = dist_matrix(items, dist_fn)

        print("Linkage clustering...")
        sys.stdout.flush()
        Z = linkage(Y, "single")  # average, complete = max, single = min ?

        print("Dumping clustering information into cache file")
        sys.stdout.flush()
        dump_path = (prefix_output or "") + "clustering_dump.pkl"
        # The context manager closes (and so flushes) the dump file even if
        # pickling fails; the handle used to be leaked.
        with open(dump_path, "w") as dump_file:
            cPickle.dump([Y, Z], dump_file)
    else:
        print("Loading clustering cache from '%s'" % cache_clustering_file.name)
        # NOTE(review): unpickling can execute arbitrary code; only pass
        # cache files that come from a trusted source.
        Y, Z = cPickle.load(cache_clustering_file)

    print("Converting into ETE tree...")
    sys.stdout.flush()
    T = to_tree(Z)

    root = Tree()
    root.dist = 0
    root.name = "root"

    # Maps each cluster node to the ETE node that mirrors it.
    item2node = {T: root}
    to_visit = [T]
    while to_visit:
        node = to_visit.pop()
        # Both children hang at half the merge distance of their parent.
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist
                ch.name = str(ch_node.id)
                item2node[node].add_child(ch)
                item2node[ch_node] = ch
                to_visit.append(ch_node)
    return root
def _train(self, dataset):
    """Train on *dataset* and derive a tree classifier from the confusions.

    Cross-validates ``self._clf`` on the dataset using ``self._splitter``,
    converts the resulting confusion matrix into a distance matrix, clusters
    it with ``hcluster.linkage`` and trains the tree classifier built from
    that clustering.

    Sets ``self._dataset``, ``self.ulabels``, ``self.cvterr``,
    ``self.linkage``, ``self.tree`` and ``self._tree_clf``.

    :param dataset: dataset to train on; must expose ``uniquelabels``.
    """
    self._dataset = dataset
    self.ulabels = self._dataset.uniquelabels

    # Do cross-validation for normal classifier
    self.cvterr = CrossValidatedTransferError(
        TransferError(self._clf),
        self._splitter,
        enable_states=["confusion"])
    self.cvterr(self._dataset)

    # From the confusion matrix, calculate linkage and tree-structure.
    # First prepare distance matrix from confusion matrix.
    dist = self.cvterr.confusion.matrix
    # Distance must be symmetric (property of a norm).
    # Divide by 2.0, not 2: the confusion matrix presumably holds integer
    # counts (confirm), and under Python 2 integer division would silently
    # truncate every half-integer entry.
    dist = (dist + dist.T) / 2.0
    # Kind of inversion. High values in confusion -> similar -> small distance
    dist = dist.max() - dist
    # Distance to self must be zero -> make diagonal elements zero
    dist -= np.diag(np.diag(dist))

    # Calculate linkage matrix (squareform turns the square matrix into the
    # condensed form that linkage takes)
    self.linkage = hcluster.linkage(hcluster.squareform(dist))

    # Build tree and according TreeClassifier; only the first element of the
    # builder's result is used.
    self.tree = hcluster.to_tree(self.linkage)
    self._tree_clf = self.build_tree_classifier_from_linkage_tree(self.tree)[0]
    self._tree_clf.train(self._dataset)
def get_clustering_as_tree(vectors, ward=True, clustering_distance='euclidean',
                           clustering_method='complete', progress=progress):
    """Build a hierarchical clustering of *vectors* and return it as a tree.

    ``ward`` selects ``hcluster.ward`` applied directly to the vectors. When
    it is false, ``hcluster.pdist`` computes distances with
    ``clustering_distance`` and ``hcluster.linkage`` clusters them with
    ``clustering_method``. ``progress.update`` receives a status message
    before each stage.

    The return value is the output of ``hcluster.to_tree``.
    """
    if ward:
        status = 'Clustering data with Ward linkage and euclidean distances'
        progress.update(status)
        linkage_matrix = hcluster.ward(vectors)
    else:
        status = 'Computing distance matrix using "%s" distance' % clustering_distance
        progress.update(status)
        condensed = hcluster.pdist(vectors, clustering_distance)

        status = 'Clustering data with "%s" linkage' % clustering_method
        progress.update(status)
        linkage_matrix = hcluster.linkage(condensed, method=clustering_method)

    progress.update('Returning results')
    tree_root = hcluster.to_tree(linkage_matrix)
    return tree_root
def cluster_motifs(total_ranks_pickle, output_pickle, motif_dir='tmp/motif_dir/'):
    """
    Cluster motifs by pairwise similarity and print the tree as Newick.

    Input should be a total_ranks pickle outputted from cluster4 where it's a
    sorted list of (rank_index,fam,count,motif_filename,data,count_o,motif_size)s

    Entries with ``rank_index < 300`` are skipped. Every remaining pair of
    motifs is scored with ``compare_motif``; the scores are turned into
    distances and clustered with average linkage. The motif list and linkage
    matrix are pickled to ``output_pickle``, and the Newick string from
    ``hcluster_cnode_to_newick`` is printed.

    Each motif file is looked up at
    ``motif_dir + <filename up to '.fna'> + '/' + <filename>``.
    """
    from hcluster import linkage, to_tree

    # Context manager so the input handle is closed (it used to be leaked).
    with open(total_ranks_pickle) as ranks_file:
        total_ranks = load(ranks_file)

    motifs = []
    for rank_index, fam, count, motif_filename, data, count_o, motif_size in total_ranks:
        if rank_index < 300:
            continue  # TODO:delete or change later?
        motifs.append({'file': motif_filename, 'info': "{0}_{1}".format(fam, count)})

    # Build each motif's path once instead of on every inner-loop iteration.
    paths = []
    for motif in motifs:
        name = motif['file']
        paths.append(motif_dir + name[:name.find('.fna')] + '/' + name)

    n_motifs = len(motifs)
    sys.stderr.write("comparing {0} motifs.....\n".format(n_motifs))

    # linkage() takes a condensed distance vector ordered
    # (0,1), (0,2), ..., (0,n-1), (1,2), ..., (n-2,n-1).
    # The previous loop appended (1,0), (2,0), (2,1), (3,0), ..., which for
    # four or more motifs attaches distances to the wrong pairs.
    distance = []
    for i in range(n_motifs):
        sys.stderr.write("{0}/{1}\n".format(i, n_motifs))
        for j in range(i + 1, n_motifs):
            # Higher-index motif goes first, as before, so each pair's score
            # is unchanged even if compare_motif is not symmetric.
            score = compare_motif(paths[j], paths[i])['score']
            # the bigger the score (more similar), the smaller their distance
            # so we invert the score to 10000/score achieve that
            # however be careful that score of -9999999 means we should give it
            # a distance of...say...999999?
            if score > 0:
                distance.append(10000. / score)
            else:
                distance.append(999999.)

    Z = linkage(distance, method='average')

    # delete below later
    # The context manager closes the output file even if dump() raises.
    with open(output_pickle, 'w') as f:
        dump({'motifs': motifs, 'Z': Z}, f)
    # delete above later

    # now Z is the linkage matrix, convert it to newick with proper naming
    root = to_tree(Z)
    print(hcluster_cnode_to_newick(root, motifs))
rowHeaders[N] = clase npTmp = np.array(dataTmp) dataMatrix.append(np.average(npTmp, axis=0)) print "Se cargaron los datos correctamente ..." print "Calculando la matriz de distancia ..." dataMatrix = np.array(dataMatrix) distanceMatrix = pdist(dataMatrix, "mahalanobis") print "Iniciando linkage. " linkageMatrix = fastcluster.linkage(distanceMatrix, "complete") # plot_with_labels(linkageMatrix, 4) print "Convirtiendo arbol generado en estructura ETE ..." tree = to_tree(linkageMatrix) # Ete section eimera = Tree() eimera.dist = 0 eimera.name = "eimera" item2node = {tree: eimera} to_visit = [tree] while to_visit: node = to_visit.pop() cl_dist = node.dist / 2.0 for ch_node in [node.left, node.right]: if ch_node: ch = Tree() ch.dist = cl_dist