def testMurtaghUPGMA(self):
    """Check that UPGMA clustering is consistent across input forms.

    A tree is built from random points, then rebuilt (a) from the condensed
    list of squared Euclidean distances between those points and (b) from the
    same points with ``isDistData=0`` passed explicitly.  Each rebuilt tree
    must have the same length as the reference tree, and
    ``Compare(..., ignoreExtras=0)`` against it must be falsy.

    The test returns silently when ``Murtagh`` is ``None``.
    """
    if Murtagh is None:
        return

    n_points = 5
    n_dims = 5
    points = numpy.random.random((n_points, n_dims))
    reference = Murtagh.ClusterData(points, n_points, Murtagh.UPGMA)[0]

    # Lower-triangle ordering: every pair (i, j) with j < i, row by row.
    pair_diffs = (points[i] - points[j]
                  for i in range(n_points)
                  for j in range(i))
    sq_dists = numpy.array([sum(diff * diff) for diff in pair_diffs])

    # First variant feeds precomputed distances, second feeds the raw points.
    for payload, dist_flag in ((sq_dists, 1), (points, 0)):
        candidate = Murtagh.ClusterData(payload, n_points, Murtagh.UPGMA,
                                        isDistData=dist_flag)[0]
        assert len(reference) == len(candidate), 'length mismatch2'
        assert not reference.Compare(candidate, ignoreExtras=0), 'equality failed3'
def ClusterPoints(data, metric, algorithmId, haveLabels=False, haveActs=True,
                  returnDistances=False):
    """Cluster ``data`` and annotate the resulting tree's points.

    A distance matrix is computed with ``GetDistanceMatrix(data, metric)`` and
    handed to ``Murtagh.ClusterData`` (as distance data); the first element of
    its result is used as the cluster tree.

    Arguments:
      - data: indexable sequence of rows; ``row[0]`` supplies the label and,
        when rows have more than two entries, ``row[2]`` supplies the activity
      - metric: forwarded to ``GetDistanceMatrix``
      - algorithmId: forwarded to ``Murtagh.ClusterData``
      - haveLabels: if false, labels are formatted as ``'Mol: <row[0]>'``;
        otherwise ``row[0]`` is used as-is
      - haveActs: if true (and rows are long enough), activities are read
      - returnDistances: if true, the distance matrix is returned as well

    Returns the cluster tree, or ``(tree, distance matrix)`` when
    ``returnDistances`` is true.
    """
    message('Generating distance matrix.\n')
    distances = GetDistanceMatrix(data, metric)
    message('Clustering\n')
    tree = Murtagh.ClusterData(distances, len(data), algorithmId, isDistData=1)[0]

    # Activities are only available when each row carries a third entry.
    activities = [int(row[2]) for row in data] if haveActs and len(data[0]) > 2 else []

    if haveLabels:
        labels = [row[0] for row in data]
    else:
        labels = ['Mol: %s' % str(row[0]) for row in data]

    tree._ptLabels = labels
    if activities:
        tree._ptValues = activities

    for point in tree.GetPoints():
        # Point indices are shifted down by one to index the row-ordered lists.
        position = point.GetIndex() - 1
        point.SetName(labels[position])
        if not activities:
            continue
        try:
            point.SetData(int(activities[position]))
        except Exception:
            # Best effort: a point that cannot take its activity is left as is.
            pass

    if returnDistances:
        return tree, distances
    return tree
def WardsClustering(dists, nfps):
    """Run Ward's-method clustering and report the elapsed time on stdout.

    Args:
        dists: distance data, passed to ``Murtagh.ClusterData`` with
            ``isDistData=True``.
        nfps: number of points, passed through to ``Murtagh.ClusterData``.

    Returns:
        The value returned by ``Murtagh.ClusterData``, unmodified (it is not
        indexed or unwrapped here).
    """
    # Each print takes exactly one pre-formatted string so the call parses and
    # prints identically under the Python 2 print statement and the Python 3
    # print function.
    print("-------------------------------------------------")
    print("starting Wards clustering")
    start_time = time.time()
    c_tree = Murtagh.ClusterData(dists, nfps, Murtagh.WARDS, isDistData=True)
    # Two spaces after the colon reproduce the original comma-separated
    # `print "time taken: ", value` output.
    print("time taken:  %s" % (time.time() - start_time,))
    return c_tree
def gen_coarseclusters(dists, nfps):
    """
    Generate coarse-grained clusters with Murtagh's Ward's-method clustering.

    :param dists: distance data (the original documentation describes it as a
        Tanimoto distance matrix); passed to ``Murtagh.ClusterData`` with
        ``isDistData=1``
    :param nfps: number of fingerprints
    :return: the value returned by ``Murtagh.ClusterData``, unmodified (it is
        not indexed or unwrapped here)
    """
    # Imported locally, as in the original, so rdkit is only required when the
    # function is actually called.
    from rdkit.ML.Cluster import Murtagh

    return Murtagh.ClusterData(dists, nfps, Murtagh.WARDS, isDistData=1)
def ClusterFps_Murtagh(self, dists, nfps, method, ncluster):
    """Cluster the fingerprints hierarchically and record cluster membership.

    The tree is built with ``Murtagh.ClusterData`` (distance-data mode) over
    ``len(self.fplist)`` points, split with
    ``ClusterUtils.SplitIntoNClusters``, and the result is stored on ``self``:

    * ``self.clustdict[cluster_id]`` -> list of the ``GetData()`` values of
      the points in that cluster (cluster ids start at 1).
      ``self.clustdict`` must already exist; it is written to, not created.
    * ``self.cdict[data]`` -> ``[cluster_id, flag]`` where ``flag`` is
      ``"true"`` for entries equal to the first point of the cluster and
      ``"flase"`` otherwise.  ``self.cdict`` is reset on every call.

    Args:
        dists: distance data handed to ``Murtagh.ClusterData``.
        nfps: accepted for interface compatibility but not used; the point
            count is taken from ``len(self.fplist)``.
        method: one of ``'Wards'``, ``'SLINK'``, ``'CLINK'``, ``'UPGMA'``.
        ncluster: number of clusters passed to ``SplitIntoNClusters``.

    Raises:
        ValueError: if ``method`` is not one of the supported names
            (previously this surfaced as a ``TypeError`` from indexing
            ``None``).
    """
    # Public method name -> name of the selector attribute on Murtagh.
    algorithm_attrs = {
        'Wards': 'WARDS',
        'SLINK': 'SLINK',
        'CLINK': 'CLINK',
        'UPGMA': 'UPGMA',
    }

    self.cdict = {}
    if method not in algorithm_attrs:
        raise ValueError(
            "unknown clustering method %r; expected one of: %s"
            % (method, ', '.join(sorted(algorithm_attrs))))

    algorithm = getattr(Murtagh, algorithm_attrs[method])
    cs = Murtagh.ClusterData(dists, len(self.fplist), algorithm, isDistData=1)
    splitClusts = ClusterUtils.SplitIntoNClusters(cs[0], ncluster)

    for cluster_id, cluster in enumerate(splitClusts, 1):
        pts = [child.GetData() for child in cluster.GetPoints()]
        self.clustdict[cluster_id] = pts
        for pt in pts:
            # NOTE(review): "flase" looks like a typo for "false"; it is kept
            # verbatim because other code may compare against this exact text.
            flag = "true" if pt == pts[0] else "flase"
            self.cdict[pt] = [cluster_id, flag]
[[10.0, 5.0], [20.0, 20.0], [30.0, 10.0], [30.0, 15.0], [5.0, 10.0]], numpy.float) print '2' #clusters = Murtagh.ClusterData(d,len(d),Murtagh.WARDS) #for i in range(len(clusters)): # clusters[i].Print() #print '3' dists = [] for i in range(len(d)): for j in range(i): dist = sum((d[i] - d[j])**2) dists.append(dist) dists = numpy.array(dists) print 'Wards:' clusters = Murtagh.ClusterData(dists, len(d), Murtagh.WARDS, isDistData=1) clusters[0].Print() print 'SLINK:' clusters = Murtagh.ClusterData(dists, len(d), Murtagh.SLINK, isDistData=1) clusters[0].Print() print 'CLINK:' clusters = Murtagh.ClusterData(dists, len(d), Murtagh.CLINK, isDistData=1) clusters[0].Print() print 'UPGMA:' clusters = Murtagh.ClusterData(dists, len(d), Murtagh.UPGMA, isDistData=1) clusters[0].Print()