def testClusterLen0(self):
    """Clustering an empty input list must yield an empty cluster list."""
    clusterer = HierarchicalClustering([], lambda a, b: abs(a - b))
    result = clusterer.getlevel(40)
    self.assertEqual([], result)
def testMultiprocessing(self):
    """Clustering with 4 worker processes must neither lose nor duplicate items."""
    clusterer = HierarchicalClustering(self.__data,
                                       lambda a, b: abs(a - b),
                                       num_processes=4)
    flattened = []
    for cluster in clusterer.getlevel(40):
        flattened.extend(cluster)
    # Order within/between clusters is unspecified; compare sorted contents.
    self.assertEqual(sorted(flattened), sorted(self.__data))
def testUnmodifiedData(self):
    """getlevel() must return exactly the input items, merely regrouped."""
    clusterer = HierarchicalClustering(self.__data, self.sim)
    seen = []
    for group in clusterer.getlevel(0.5):
        seen += group
    self.assertEqual(sorted(seen), sorted(self.__data))
def testDataTypes(self):
    """Every cluster returned by getlevel() must be a plain list."""
    clusterer = HierarchicalClustering(self.__data, self.sim)
    clusters = clusterer.getlevel(0.5)
    for cluster in clusters:
        self.assertEqual(type(cluster), type([]),
                         "Every item should be a list!")
def testUnmodifiedData(self):
    """Flattening the level-40 clusters must reproduce the input data set."""
    clusterer = HierarchicalClustering(self.__data, lambda a, b: abs(a - b))
    collected = []
    for group in clusterer.getlevel(40):
        collected += group
    self.assertEqual(sorted(collected), sorted(self.__data))
def find_symelems(sele_or_xforms="all", verbose=False): xforms = sele_or_xforms if isinstance(sele_or_xforms, basestring): xforms, maxrms = get_xforms_by_chain(sele_or_xforms, verbose=True) elif not isinstance(sele_or_xforms, dict): raise ValueError symelems = list() maxangerr = 0.0 for c, x in xforms.items(): assert len(c) == 2 assert isinstance(x, Xform) if c[0] == c[1]: continue dis = x.t.length() if dis > 5.0: continue axis, ang = x.rotation_axis() nfold = round(math.pi * 2.0 / ang) angerr = abs(ang - math.pi * 2.0 / nfold) * 180.0 / math.pi if verbose: print "candidate symelem:", nfold, c, angerr, axis if angerr > 360.0 / nfold / 8.0: continue # require unambiguous symelems maxangerr = max(maxangerr, angerr * nfold) symelems.append((nfold, axis, c, angerr)) def symelemdis(x, y): return line_line_angle_degrees( x[1], y[1]) if x[0] == y[0] else 9e9 if verbose: for se1, se2 in filter(lambda t: t[0] < t[1], product(symelems, symelems)): if se1[0] == se2[0]: print se1 print se2 print symelemdis(se1, se2), "degrees" print hier = HierarchicalClustering(symelems, symelemdis) thresh = 6.0 clusters = hier.getlevel(thresh) print "number of symmetry element clusters at threshold", thresh, "degrees is", len(clusters) centers0 = list() maxaxiserr = 0.0 for clust in clusters: print "symelem cluster:", clust center = list(clust[0]) center[2] = list((center[2],)) for i in range(1, len(clust)): ax = clust[i][1] center[1] = center[1] + (ax if ax.dot(center[1]) > 0 else -ax) center[2].append(clust[i][2]) center[3] = max(center[3], clust[i][3]) center[1].normalize() centers0.append(center) axiserr = 0.0 for c in clust: axiserr = max(axiserr, 1.0 - abs(center[1].dot(c[1]))) maxaxiserr = max(maxaxiserr, axiserr) # sort on nfold, then on number of chain pairs in cluster centers0 = sorted(centers0, cmp=lambda x, y: cmp( y[0], x[0]) if x[0] != y[0] else cmp(len(y[2]), len(x[2]))) centers = list() for center in centers0: if verbose: print "DEBUG prune center:", center seenit = False for 
censeen in centers: remainder = abs((censeen[0] / center[0]) % 1.0) if verbose: print " ", remainder, censeen if remainder > 0.01: continue # not a symmetry multiple if 1.0 - abs(center[1].dot(censeen[1])) < 0.01: seenit = True # axis are same if not seenit: centers.append(center) print "centers:" cen_of_geom = com("(" + sele_or_xforms + ") and (name CA and not HET)") for center in centers: print center # if center[0]>2.1: continue # showvecfrompoint(50*center[1],cen_of_geom) return centers, maxrms, maxangerr, maxaxiserr
def _align_features_cluster(self, m, rt_diff_cutoff, fdr_cutoff, aligned_fdr_cutoff, method): """ Align features by clustering all peakgroups This algorithm will find the best peakgroup cluster over all runs and then select all peakgroups belonging to the cluster. It does not treat heavy/light specially (they are treated like two independent runs). """ verb = self.verbose if verb: print "00000000000000000000000000000000000 new peptide (cluster)", m.getAllPeptides( )[0].get_id() # i) get all RTs above the cutoff for p in m.getAllPeptides(): # loop over all peptides pg = p.get_best_peakgroup() if verb: print "best rt", pg.get_normalized_retentiontime( ), pg.peptide.run.get_id(), pg.get_fdr_score() groups = [ pg for p in m.getAllPeptides() # loop over all peptides for pg in p.get_all_peakgroups() # loop over all peakgroups if pg.get_fdr_score() < aligned_fdr_cutoff ] # Check for empty groups if len(groups) == 0: return # do the clustering from cluster import HierarchicalClustering cl = HierarchicalClustering( groups, lambda x, y: abs(x.get_normalized_retentiontime() - y. get_normalized_retentiontime())) clusters_rt = cl.getlevel( rt_diff_cutoff) # for large clusters, this is the the bottleneck! clusters_rt_obj = [Cluster(c) for c in clusters_rt] # if there was only one group, we need to prepare a special object of size one if len(groups) == 1: clusters_rt_obj = [Cluster(groups)] if verb: print "==== Clusters " # make sure only one is selected from each run... for i, c in enumerate(clusters_rt_obj): c.select_one_per_run(self.verbose) if verb: print " - Cluster with score", c.getTotalScore(), "at", \ c.getMedianRT(), "+/-", c.getRTstd() , "(norm_score %s)" %\ (float(c.getTotalScore())/((aligned_fdr_cutoff/2)**len(c.peakgroups))) for pg in c.peakgroups: print " = Have member", pg.print_out() # Get best cluster by length-normalized best score. 
# Length normalization divides the score by the expected probability # values if all peakgroups were chosen randomly (assuming equal # probability between 0 and aligned_fdr_cutoff, the expected value # for a random peakgroup is "aligned_fdr_cutoff/2") and thus the # expected random value of n peakgroups would be (aligned_fdr_cutoff/2)^n bestcluster = min(clusters_rt_obj, key=(lambda x: x.getTotalScore() / (( (aligned_fdr_cutoff / 2)**len(c.peakgroups))))) clusters_rt_obj.sort(lambda x, y: cmp( x.getTotalScore() / ((aligned_fdr_cutoff / 2)**len(x.peakgroups)), y.getTotalScore() / ((aligned_fdr_cutoff / 2)**len(y.peakgroups)))) for i, c in enumerate(clusters_rt_obj): for pg in c.peakgroups: pg.setClusterID(i + 1)
def cluster_contacts_by_title(csv_file):
    """Group contacts from a CSV export by similarity of their job titles.

    Reads rows from csv_file, normalizes job-title abbreviations, splits
    compound titles, clusters the distinct titles hierarchically, and
    returns a dict mapping each multi-title cluster (as a tuple) to the
    names of contacts holding one of those titles.

    Relies on module-level DISTANCE and DISTANCE_THRESHOLD. Python 2 code:
    the splitting loop below intentionally(?) leans on py2 semantics — see
    notes inline.
    """
    # Known abbreviation -> expansion pairs. Order matters: 'Sr.' is tried
    # before 'Sr' so the dotted form is not partially rewritten.
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]
    separators = ['/', 'and', '&']

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]

    # Normalize and/or replace known abbreviations
    # and build up list of common titles
    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Job Title'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Job Title']]
        # NOTE(review): this loop extends `titles` while iterating it, so the
        # appended split pieces are themselves re-visited (and re-split).
        # The original compound title is deliberately left in the list —
        # removing it mid-iteration (commented out below) would skip items.
        # NOTE(review): the inner comprehension rebinds `title` (py2 listcomp
        # variable leakage), so subsequent separator checks see the last
        # split piece, not the original title — confirm this is intended.
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    #titles.remove(title)
                    titles.extend([
                        title.strip() for title in title.split(separator)
                        if title.strip() != ''
                    ])
        # Apply every abbreviation expansion to every title variant.
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles
        all_titles.extend(titles)

    # De-duplicate titles across all contacts.
    all_titles = list(set(all_titles))

    # Define a scoring function: distance between the word sets of two titles.
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append(
                        '%s %s' % (contact['First Name'], contact['Last Name']))
    return clustered_contacts
# NOTE(review): this fragment begins mid-logic — `titles` and `contacts` are
# defined by code outside this view, and `all_titles.extend(titles)` most
# likely belonged inside a per-contact loop. Verify against the full script.
all_titles.extend(titles)

# De-duplicate the collected titles.
all_titles = list(set(all_titles))

######## Begin: HAC ########

# Define a scoring function: distance between the word sets of two titles.
def score(title1, title2):
    return DISTANCE(set(title1.split()), set(title2.split()))

# Feed the class your data and the scoring function
hc = HierarchicalClustering(all_titles, score)

# Cluster the data according to a distance threshold
clusters = hc.getlevel(DISTANCE_THRESHOLD)

# Remove singleton clusters (disabled here: singletons are kept)
# clusters = [c for c in clusters if len(c) > 1]

######## End: HAC ########

# Round up contacts who are in these clusters and group them together
clustered_contacts = {}
for cluster in clusters:
    clustered_contacts[tuple(cluster)] = []
def run(data):
    """Basic hierarchical clustering smoke test: cluster `data` with the
    module-level `sim` metric and print the clusters at level 0.5."""
    clusterer = HierarchicalClustering(data, sim)
    level = clusterer.getlevel(0.5)
    print(level)
# NOTE(review): this fragment begins mid-list — the `urls = [` opener is
# outside this view; the items below close that list.
'http://view.inews.qq.com/a/20170613A03NY600',
'http://view.inews.qq.com/a/20170612A065L900',
'http://view.inews.qq.com/a/20170611A03T5V00',
'http://news_and_blog.qq.com/zt2017/thirteen_in/allpc.htm',
'http://media.news_and_blog.qq.com/mediaplus/home.htm',
'http://news_and_blog.qq.com/zt2014/2014qtnews/ccybspxd.htm',
'http://weather.news_and_blog.qq.com/',
'http://news_and_blog.qq.com/newssh/shwx/shehuiwanxiang.htm',
'http://news_and_blog.qq.com/dc_article2016/tagsList.htm?tags=%E5%81%87%E5%A9%9A%E5%A7%BB',
'http://news_and_blog.qq.com/',
'http://news_and_blog.qq.com/',
'http://news_and_blog.qq.com/zt2014/2014qtnews/ccybspxd.htm',
'http://news_and_blog.qq.com/dc_article2016/tagsList.htm?tags=%E9%82%BB%E5%B1%85',
'http://news_and_blog.qq.com/',
'http://news_and_blog.qq.com/zt2014/2014qtnews/ccybspxd.htm',
'http://news_and_blog.qq.com/original/tuhua/spacedelivery.html',
'http://news_and_blog.qq.com/photon/photostory/eyhz.htm',
]


# Distance between two URLs:
# 1 - similarity ratio as computed by difflib's SequenceMatcher.
def distance(url1, url2):
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio


# Perform the hierarchical clustering at distance threshold 0.2.
hc = HierarchicalClustering(urls, distance)
clusters = hc.getlevel(0.2)
pprint.pprint(clusters)
# NOTE(review): this fragment begins mid-expression — the tokens below close
# a (dict, label) training example and the enclosing `inputs` list whose
# opener lies outside this view.
'tweets': 'no',
'phd': 'yes'
}, False)]

# Build an ID3 decision tree from the labelled examples and classify one
# unseen input.
dt = DecisionTree()
tree = dt.build_tree_id3(inputs)
print(tree)
new_input = {'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}
label = dt.classify(tree, new_input)
print(label)

# 2-D points for bottom-up (agglomerative) clustering.
inputs = [[19, 28], [21, 27], [20, 23], [28, 13], [11, 15], [13, 13],
          [-49, 0], [-46, 5], [-41, 8], [-49, 15], [-34, -1], [-22, -16],
          [-19, -11], [-25, -9], [-11, -6], [-12, -8], [-14, -5], [-18, -3],
          [-13, -19], [-9, -16]]

# NOTE(review): this HierarchicalClustering takes no constructor arguments —
# a different class from the `cluster`-package one used elsewhere; confirm
# which import is in scope here.
hcl = HierarchicalClustering()
base_cluster = hcl.bottom_up_cluster(inputs)
print(base_cluster)
clusters = hcl.generate_clusters(base_cluster, 3)
print(clusters)

# Populate a small in-memory table of users.
users = Table(["user_id", "name", "num_friends"])
users.insert([0, "Hero", 0])
users.insert([1, "Dunn", 2])
users.insert([2, "Sue", 3])
users.insert([3, "Chi", 3])
users.insert([4, "Thor", 3])
users.insert([5, "Clive", 2])
users.insert([6, "Hicks", 3])
users.insert([7, "Devin", 2])
users.insert([8, "Kate", 2])