def testCompleteLinkage(self):
    "Basic Hierarchical Clustering test with integers"
    cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y),
                                linkage='complete')
    result = cl.getlevel(40)

    # sort the values to make the tests less prone to algorithm changes
    result = sorted([sorted(_) for _ in result])

    expected = [
        [24],
        [84],
        [124, 131, 134],
        [336, 365, 365],
        [391, 398],
        [518],
        [542, 564],
        [594],
        [676],
        [791],
        [835],
        [940, 956, 971],
    ]
    self.assertEqual(result, expected)
import cv2
import numpy as np
from cluster import HierarchicalClustering


def getSubbatch(images, image_labels, similar_thred):
    """Group images of similar size into sub-batches, resizing every member
    of a cluster to that cluster's median size."""
    sizes = [(image.shape[0], image.shape[1], idx)
             for idx, image in enumerate(images)]
    cl = HierarchicalClustering(
        sizes, lambda x, y: abs(x[0] - y[0]) + abs(x[1] - y[1]))
    clusters = cl.getlevel(similar_thred)
    subbatches = []
    # Order clusters by size, smallest first
    clusters = sorted(clusters, key=lambda cluster: len(cluster))
    for cluster in clusters:
        if len(cluster) > 1:
            ideal_size = np.median(cluster, axis=0)
            ideal_size = [int(i) for i in ideal_size]
            subbatch_im = []
            subbatch_label = []
            for img in cluster:
                if img[0] != ideal_size[0] or img[1] != ideal_size[1]:
                    subbatch_im.append(
                        cv2.resize(images[img[2]],
                                   (ideal_size[1], ideal_size[0])))
                    subbatch_label.append(
                        cv2.resize(image_labels[img[2]],
                                   (ideal_size[1], ideal_size[0])))
                else:
                    subbatch_im.append(images[img[2]])
                    subbatch_label.append(image_labels[img[2]])
            subbatches.append({
                'images': np.array(subbatch_im),
                'labels': np.array(subbatch_label)
            })
        else:
            subbatches.append({
                'images': np.array([images[cluster[0][2]]]),
                'labels': np.array([image_labels[cluster[0][2]]])
            })
    return subbatches
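# A minimal usage sketch for getSubbatch() above, assuming single-channel
# uint8 images and masks of slightly varying sizes. The threshold of 20
# pixels and the shapes below are illustrative choices, not values taken
# from the original project.
import numpy as np

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    shapes = [(100, 120), (102, 118), (240, 320), (238, 322)]
    demo_images = [rng.integers(0, 256, s, dtype=np.uint8) for s in shapes]
    demo_labels = [rng.integers(0, 2, s, dtype=np.uint8) for s in shapes]
    for batch in getSubbatch(demo_images, demo_labels, similar_thred=20):
        # Each sub-batch holds images resized to a common (median) shape.
        print(batch['images'].shape, batch['labels'].shape)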
def testDataTypes(self):
    "Test for bug #?"
    cl = HierarchicalClustering(self.__data, self.sim)
    for item in cl.getlevel(0.5):
        self.assertEqual(type(item), type([]),
                         "Every item should be a list!")
def testClusterLen1(self):
    """
    Testing if hierarchical clustering a set of length 1
    returns a set of length 1
    """
    cl = HierarchicalClustering([876], lambda x, y: abs(x - y))
    self.assertEqual([876], cl.getlevel(40))
def testClusterLen1(self):
    """
    Testing if hierarchical clustering a set of length 1
    returns a set of length 1
    """
    cl = HierarchicalClustering([876], lambda x, y: abs(x - y))
    self.assertCItemsEqual([876], cl.getlevel(40))
def testMultiprocessing(self):
    cl = HierarchicalClustering(self.__data,
                                lambda x, y: abs(x - y),
                                num_processes=4)
    new_data = []
    [new_data.extend(_) for _ in cl.getlevel(40)]
    self.assertEqual(sorted(new_data), sorted(self.__data))
def buildHcluster(data, threshold):
    """
    Description: Build Hierarchical Cluster
    Input:
        data: e.g. data = [[12, 12], [34, 34],
                           [23, 23], [32, 32],
                           [46, 46], [96, 96],
                           [13, 13], [1, 1],
                           [4, 4], [9, 9]]
              # The first variable is a key, not counted for clustering
        threshold: threshold distance at which to break clusters
    Output:
        cluster record file /searchc/save/H.cluster
    """
    print "Clustering..."
    a = datetime.datetime.now()
    cl = HierarchicalClustering(data, distance_function, 'complete')
    clusterH = cl.getlevel(threshold)  # get hierarchical clusters
    b = datetime.datetime.now()

    print "Naming..."
    featureAll = readFeature('all')
    c = nameCluster(clusterH, featureAll)
    name = c[0]
    centroid = c[1]
    writeCluster('H', clusterH, name, centroid, threshold)

    print "Writing..."
    with open(path + '/log/H_' + str(threshold) + '.log', 'w') as outfile:
        outfile.write("Hierarchical Clustering Log\n"
                      "Date:\t" + str(a.date()) + "\n"
                      "Start:\t" + str(a.time()) + "\n"
                      "End:\t" + str(b.time()) + "\n"
                      "Duration:\t" + str(b - a) + "\n"
                      "H:\t" + str(threshold) + "\n"
                      "Method:\tComplete\n"
                      "No. cluster:\t" + str(len(clusterH)) + "\n\n")
        for cluster in clusterH:
            outfile.write(str(len(cluster) - 2) + "\n")
    return
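# A minimal sketch of a distance_function compatible with buildHcluster()
# above. The docstring notes that the first element of each record is a key
# and is not counted for clustering, so this assumes records shaped like
# [key, feature1, feature2, ...] and compares only the features; it is an
# illustrative assumption, not the project's original distance_function.
from math import sqrt

def distance_function(a, b):
    # Euclidean distance over everything after the leading key.
    return sqrt(sum((x - y) ** 2 for x, y in zip(a[1:], b[1:])))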
def clustertitle(request):
    """cluster based on title and ngram sim"""
    from cluster import HierarchicalClustering

    def sim(a, b):
        return 1 - NGram.compare(a.title, b.title, warp=WARP, iconv=enrich)

    articles = Article.objects.filter(
        status="live",
        date_published__gte=datetime.datetime.now() - datetime.timedelta(1)
    ).order_by("date_published")[:1000]

    cl = HierarchicalClustering(articles, sim)
    # 0.7 chosen pretty much through trial and error :)
    res = cl.getlevel(0.7)

    # import pprint
    # pprint.pprint(cl.topo())

    clusters = []
    for cluster in res:
        if len(cluster) > 1:
            node = {
                'type': 'cluster',
                # 'topic': longest_common_substring(cluster[0].title, cluster[1].title),
                'topic': common_terms([a.title for a in cluster]),
                'articles': cluster,
            }
        else:
            node = {
                'type': 'article',
                'article': cluster[0],
            }
        clusters.append(node)

    return render(request, "clusters.html",
                  dictionary={"clusters": clusters})
def test(data, expected):
    cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
    result = cl.getlevel(5)
    print(sorted(data))
    print(result)
    print(expected)
    assert result == expected
    print('ok')
def testCluster(self):
    "Basic Hierarchical Clustering test with integers"
    cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))
    cl.cluster()
    self.assertEqual(
        [[24], [84, 124, 131, 134], [336, 365, 365, 365, 398, 391],
         [940, 956, 971], [791], [835], [676], [518, 564, 542]],
        cl.getlevel(40))
def cluster(unigrams):
    DISTANCE_THRESHOLD = 0.2

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(unigrams, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Keep only clusters with more than 20 members
    clusters = [c for c in clusters if len(c) > 20]

    return clusters
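# A self-contained sketch of the same getlevel() pattern used in cluster()
# above, but with a plain numeric distance so it runs without a unigram
# corpus or a custom score() function; the threshold and minimum cluster
# size here are illustrative, not the original values.
from cluster import HierarchicalClustering

def cluster_numbers(values, threshold=5, min_size=2):
    hc = HierarchicalClustering(values, lambda x, y: abs(x - y))
    clusters = hc.getlevel(threshold)
    # Keep only clusters with at least min_size members.
    return [c for c in clusters if len(c) >= min_size]

if __name__ == '__main__':
    print(cluster_numbers([1, 2, 3, 40, 41, 100]))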
def testCluster(self):
    "Basic Hierarchical clustering test with strings"
    cl = HierarchicalClustering(self.__data, self.sim)
    self.assertEqual(
        [['Nullam.'], ['Sed'], ['mi.'], ['ultricies'], ['Phasellus'],
         ['amet,', 'at'], ['sit', 'elit.', 'elit.', 'elit.'],
         ['leo', 'Lorem', 'dolor'],
         ['neque.', 'congue', 'consectetuer', 'consequat'],
         ['ipsum'], ['adipiscing']],
        cl.getlevel(0.5))
def testSingleLinkage(self):
    "Basic Hierarchical Clustering test with 2D points"

    def euclidian_distance(a, b):
        return sqrt(sum([pow(z[0] - z[1], 2) for z in zip(a, b)]))

    self.__data = [(1, 1), (1, 2), (1, 3)]
    cl = HierarchicalClustering(self.__data, euclidian_distance)
    result = cl.getlevel(40)
    self.assertIsNotNone(result)
def testIssue28(self):
    "Issue28 (Hierarchical Clustering)"
    points1D = {
        'p4': 5,
        'p2': 6,
        'p7': 10,
        'p9': 120,
        'p10': 121,
        'p11': 119,
    }
    distance_func = lambda a, b: abs(points1D[a] - points1D[b])
    cl = HierarchicalClustering(list(points1D.keys()), distance_func)
    result = cl.getlevel(20)
    self.assertIsNotNone(result)
def ml():
    global cluster_number
    rows = db.session.query(
        ormQueue.number_of_people,
        ormQueue.queue_name).group_by(ormQueue.queue_name).all()

    queues, n_people, cluster_array = [], [], []
    for elem in rows:
        queues.append(elem.queue_name)
        n_people.append(int(elem.number_of_people))

    # Cluster queues by the number of people waiting in them
    cl = HierarchicalClustering(n_people, lambda x, y: abs(x - y))
    res = cl.getlevel(5)

    info = {'Queues_name': queues, 'Number_of_people': n_people}
    df = pd.DataFrame(info, columns=['Queues_name', 'Number_of_people'])
    print(df)

    for number in range(0, len(res)):
        cluster_number = "Cluster" + str(number + 1)
        print(cluster_number)
        for elem in res[number]:
            print(elem)
            df.loc[df['Number_of_people'] == elem, 'Cluster'] = cluster_number
    print(df)

    df['randNumCol'] = np.random.randint(1, 6, df.shape[0])
    print(df)

    pearsoncorr = df.corr(method='pearson')
    print(pearsoncorr)

    X = df['Number_of_people']
    Y = df['randNumCol']
    seed = 7
    test_size = 0.25
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    # Reshape the single feature column so scikit-learn sees one feature per sample
    X_train = np.array(X_train).reshape(-1, 1)
    X_test = np.array(X_test).reshape(-1, 1)

    # Fit the model on the training data
    rf = RandomForestRegressor(n_estimators=1000, random_state=42)
    rf.fit(X_train, y_train)

    # Use the forest's predict method on the test data
    predictions = rf.predict(X_test)

    # Calculate the absolute errors
    errors = abs(np.array(predictions) - np.array(y_test))

    # Print out the mean absolute error (MAE)
    print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

    return render_template('ML.html', name="Clusterization",
                           name2="Correlation", name3="Regression Model",
                           tables=[df.to_html()], error=errors,
                           table=[pearsoncorr.to_html()], action="/ML")
def hierarchical_clustering_by_title(csv_file):
    csvReader = csv.DictReader(codecs.open(csv_file, "rb", "utf-16"),
                               delimiter='\t', quotechar='"')
    csvReader.next()
    contacts = [row for row in csvReader]

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Current Position'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Current Position']]
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    titles.remove(title)
                    titles.extend([title.strip()
                                   for title in title.split(separator)
                                   if title.strip() != ''])
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append(
                        '%s %s' % (contact['First Name'],
                                   contact['Last Name']))

    return clustered_contacts
def urls_clustering(urls):
    """Cluster a list of URLs by string similarity."""

    # Distance between two URLs, computed with difflib's SequenceMatcher
    def distance(url1, url2):
        ratio = SequenceMatcher(None, url1, url2).ratio()
        return 1.0 - ratio

    # Perform hierarchical clustering
    hc = HierarchicalClustering(urls, distance)
    clusters = hc.getlevel(0.2)
    # pprint.pprint(clusters)
    return clusters
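# A small, self-contained example of urls_clustering() above. The URLs are
# made up for illustration; the 0.2 threshold comes from the function itself.
if __name__ == '__main__':
    sample_urls = [
        'http://example.com/article/2021/01/foo',
        'http://example.com/article/2021/02/bar',
        'http://example.com/user/profile',
        'http://example.com/user/settings',
    ]
    for group in urls_clustering(sample_urls):
        print(group)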
def breakToPeriods(arg, maximaOrder=20, clusteringGranularity=0.5, file=False):
    inputAsList = []
    if file:
        with open(arg, 'r') as infile:
            for line in infile:
                inputAsList.append(float(line))
    else:
        inputAsList = arg
    inputAsList = (inputAsList if type(inputAsList) is list
                   else inputAsList.tolist())

    a = np.array(inputAsList)
    localMax = argrelextrema(a, np.greater, 0, maximaOrder)[0].tolist()
    try:
        amplitude = np.max(a) - np.min(a)
    except ValueError:
        return []

    # Cluster the local maxima by value
    cl = HierarchicalClustering(a.take(localMax).tolist(),
                                lambda x, y: abs(x - y))
    clusters = cl.getlevel(int(amplitude * clusteringGranularity))
    if len(clusters) == 0:
        return []
    # print clusters

    longestSeq = None
    if len(clusters) == len(localMax):
        # It clustered every maximum differently
        longestSeq = clusters
    else:
        # Pick the largest cluster
        maxLen = 0
        for cluster in clusters:
            l = len(cluster)
            if l > maxLen:
                longestSeq = cluster
                maxLen = l
    # print longestSeq
    if len(longestSeq) < 2:
        return []

    averageLength = len(inputAsList) / len(longestSeq)
    periods = []
    indices = [inputAsList.index(x) for x in longestSeq]
    indices.sort()

    start = indices[0]
    for i in indices[1:]:
        # plt.figure()
        end = i
        strideLen = end - start
        if strideLen > 0.5 * averageLength and strideLen < 1.8 * averageLength:
            period = inputAsList[start:end]
            periods.append(period)
        start = end
    return periods
def testCluster(self):
    "Basic Hierarchical clustering test with strings"
    cl = HierarchicalClustering(self.__data, self.sim)
    self.assertEqual([
        ['ultricies'],
        ['Sed'],
        ['Phasellus'],
        ['mi'],
        ['Nullam'],
        ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
        ['leo', 'Lorem', 'dolor'],
        ['congue', 'neque', 'consectetuer', 'consequat'],
        ['adipiscing'],
        ['ipsum'],
    ], cl.getlevel(0.5))
def getCorners(intersections):
    cl = HierarchicalClustering(intersections,
                                lambda p1, p2: length([p1, p2]))
    clusters = cl.getlevel(25)

    # probably want to make sure we actually have the corners at this point.
    # For now, I'm taking the 4 biggest clusters.
    cornerClusters = sorted(clusters, key=len, reverse=True)[:4]
    corners = map(averageCoords, cornerClusters)

    corners = sorted(corners, key=lambda p: p[0])
    left = sorted(corners[:2], key=lambda p: p[1])
    right = sorted(corners[2:], key=lambda p: p[1])

    # {'top-left': left[0], 'bottom-left': left[1],
    #  'top-right': right[0], 'bottom-right': right[1]}
    return left[0], left[1], right[0], right[1]
def main():
    pC = PhamCluster()
    pC.initialize_matrix()
    # pC.calculate_distances()
    # print 'scoreMatrix:', pC.scoreMatrix
    # print 'distMatrix:', pC.distMatrix

    cl = HierarchicalClustering(pC.scoreMatrix,
                                lambda x, y: pC.get_distance(x, y))
    # cutoff = raw_input('specify cutoff level:')
    cutoff = 1
    print 'using cutoff of 1'
    clusters = cl.getlevel(float(cutoff))
    print 'there are', len(clusters), 'clusters'
    print clusters
    print 'there are', len(clusters), 'clusters'
def testCluster(self):
    "Basic Hierarchical clustering test with strings"
    self.skipTest('These values lead to non-deterministic results. '
                  'This makes it untestable!')
    cl = HierarchicalClustering(self.__data, self.sim)
    self.assertEqual([
        ['ultricies'],
        ['Sed'],
        ['Phasellus'],
        ['mi'],
        ['Nullam'],
        ['sit', 'elit', 'elit', 'Ut', 'amet', 'at'],
        ['leo', 'Lorem', 'dolor'],
        ['congue', 'neque', 'consectetuer', 'consequat'],
        ['adipiscing'],
        ['ipsum'],
    ], cl.getlevel(0.5))
def testSingleLinkage(self):
    "Basic Hierarchical Clustering test with integers"
    cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))
    result = cl.getlevel(40)

    # sort the values to make the tests less prone to algorithm changes
    result = [sorted(_) for _ in result]

    self.assertCItemsEqual([
        [24],
        [336, 365, 365, 391, 398],
        [518, 542, 564, 594],
        [676],
        [791],
        [835],
        [84, 124, 131, 134],
        [940, 956, 971],
    ], result)
def get_music_bars(filename):
    musicpage = Image.open(filename)
    pixels = musicpage.load()
    width, height = musicpage.size

    # Count the black pixels in each row of the page
    imgmat = [sum([1 for x in range(width) if pixels[x, y] == 0])
              for y in range(height)]
    toplines = sorted(imgmat, reverse=True)
    tophundred = toplines[0:400]
    lineguesses = [i for i, j in enumerate(imgmat) if j in tophundred]

    # Group nearby line guesses into staves
    cl = HierarchicalClustering(lineguesses, lambda x, y: abs(x - y))
    staves = [x for x in cl.getlevel(15) if len(x) > 2]
    bands = [[min(x), max(x)] for x in staves]
    bars = [b for b in bands if b[1] - b[0] > 20]
    return bars
def testUCLUS(self):
    "Basic Hierarchical Clustering test with integers"
    cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y),
                                linkage='uclus')
    expected = [
        [24],
        [84],
        [124, 131, 134],
        [336, 365, 365, 391, 398],
        [518, 542, 564],
        [594],
        [676],
        [791],
        [835],
        [940, 956, 971],
    ]
    result = sorted([sorted(_) for _ in cl.getlevel(40)])
    self.assertEqual(result, expected)
def testAverageLinkage(self):
    cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y),
                                linkage='average')
    # TODO: The current test-data does not really trigger a difference
    #       between UCLUS and "average" linkage.
    expected = [
        [24],
        [84],
        [124, 131, 134],
        [336, 365, 365, 391, 398],
        [518, 542, 564],
        [594],
        [676],
        [791],
        [835],
        [940, 956, 971],
    ]
    result = sorted([sorted(_) for _ in cl.getlevel(40)])
    self.assertEqual(result, expected)
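# A small sketch comparing the linkage strategies exercised by the tests
# above on one data set (the integer list used elsewhere in this collection).
# It only prints the groupings instead of asserting them, since, as the TODO
# above notes, some data sets do not separate 'uclus' from 'average'. Passing
# linkage='single' by name is assumed to match the default single-linkage
# behaviour.
from cluster import HierarchicalClustering

def compare_linkages(data, threshold=40):
    for linkage in ('single', 'complete', 'uclus', 'average'):
        cl = HierarchicalClustering(data, lambda x, y: abs(x - y),
                                    linkage=linkage)
        result = sorted(sorted(c) for c in cl.getlevel(threshold))
        print(linkage, result)

if __name__ == '__main__':
    compare_linkages([24, 84, 124, 131, 134, 336, 365, 365, 391, 398,
                      518, 542, 564, 594, 676, 791, 835, 940, 956, 971])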
def hac(topic):
    """
    Use cluster.HierarchicalClustering
    https://pypi.python.org/pypi/cluster/1.1.0b1
    """
    phrases = [phrase for phrase in topic if phrase.get('es_phrase')]

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(phrases, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)
    # print "[hac]", len(clusters), json.dumps(clusters, indent=2)

    # Sometimes the clustering API returns a list of dicts instead
    # of a list of lists. This causes an error in topic_extraction
    # as we are looping over the phrases.
    if len(clusters) == 1 and isinstance(clusters[0], dict):
        clusters = [clusters]
    return clusters
def set_new_level(self, level):
    # Create the clusters
    cl = HierarchicalClustering(self._data, self._relative_levenshtein)
    clusteredData = cl.getlevel(level)
    self._parsed_clusteredData = self._parse(clusteredData)
    self._column_names = ['Group %d' % i
                          for i in xrange(len(clusteredData))]

    # Start with the treeview and liststore creation
    dynamicListStoreTypes = [str for i in xrange(len(self._column_names))]
    self.liststore = apply(gtk.ListStore, dynamicListStoreTypes)
    gtk.TreeView.__init__(self, self.liststore)

    # Show horizontal and vertical lines
    self.set_grid_lines(gtk.TREE_VIEW_GRID_LINES_BOTH)

    # First clear the treeview
    for col in self.get_columns():
        self.remove_column(col)

    # Internal variables
    self.current_path = None
    self.current_column = None

    self._colDict = {}
    for i, cname in enumerate(self._column_names):
        colObject = gtk.TreeViewColumn(cname)
        self.append_column(colObject)
        textRenderer = gtk.CellRendererText()
        colObject.pack_start(textRenderer, True)
        colObject.set_attributes(textRenderer, text=i)
        # Save this for later. See FIXME below.
        self._colDict[colObject] = i

    for i in self._parsed_clusteredData:
        self.liststore.append(i)
def testClusterLen0(self):
    """
    Testing if hierarchical clustering an empty list
    returns an empty list
    """
    cl = HierarchicalClustering([], lambda x, y: abs(x - y))
    self.assertEqual([], cl.getlevel(40))
def testUnmodifiedData(self):
    cl = HierarchicalClustering(self.__data, self.sim)
    new_data = []
    [new_data.extend(_) for _ in cl.getlevel(0.5)]
    self.assertEqual(sorted(new_data), sorted(self.__data))
def test2():
    cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
    new_data = []
    for row in cl.getlevel(40):
        print(row)
    print(data)
def _align_features_cluster(self, m, rt_diff_cutoff, fdr_cutoff,
                            aligned_fdr_cutoff, method):
    """
    Align features by clustering all peakgroups

    This algorithm will find the best peakgroup cluster over all runs and
    then select all peakgroups belonging to the cluster.

    It does not treat heavy/light specially (they are treated like two
    independent runs).
    """
    verb = self.verbose
    if verb:
        print "00000000000000000000000000000000000 new peptide (cluster)", \
            m.getAllPeptides()[0].get_id()

    # i) get all RTs above the cutoff
    for p in m.getAllPeptides():  # loop over all peptides
        pg = p.get_best_peakgroup()
        if verb:
            print "best rt", pg.get_normalized_retentiontime(), \
                pg.peptide.run.get_id(), pg.get_fdr_score()

    groups = [
        pg for p in m.getAllPeptides()  # loop over all peptides
        for pg in p.get_all_peakgroups()  # loop over all peakgroups
        if pg.get_fdr_score() < aligned_fdr_cutoff
    ]

    # Check for empty groups
    if len(groups) == 0:
        return

    # do the clustering
    from cluster import HierarchicalClustering
    cl = HierarchicalClustering(
        groups,
        lambda x, y: abs(x.get_normalized_retentiontime() -
                         y.get_normalized_retentiontime()))
    # for large clusters, this is the bottleneck!
    clusters_rt = cl.getlevel(rt_diff_cutoff)
    clusters_rt_obj = [Cluster(c) for c in clusters_rt]

    # if there was only one group, we need to prepare a special object of size one
    if len(groups) == 1:
        clusters_rt_obj = [Cluster(groups)]

    if verb:
        print "==== Clusters "

    # make sure only one is selected from each run...
    for i, c in enumerate(clusters_rt_obj):
        c.select_one_per_run(self.verbose)
        if verb:
            print " - Cluster with score", c.getTotalScore(), "at", \
                c.getMedianRT(), "+/-", c.getRTstd(), "(norm_score %s)" % \
                (float(c.getTotalScore()) /
                 ((aligned_fdr_cutoff / 2) ** len(c.peakgroups)))
            for pg in c.peakgroups:
                print "    = Have member", pg.print_out()

    # Get best cluster by length-normalized best score.
    # Length normalization divides the score by the expected probability
    # values if all peakgroups were chosen randomly (assuming equal
    # probability between 0 and aligned_fdr_cutoff, the expected value
    # for a random peakgroup is "aligned_fdr_cutoff/2") and thus the
    # expected random value of n peakgroups would be (aligned_fdr_cutoff/2)^n
    bestcluster = min(
        clusters_rt_obj,
        key=lambda x: x.getTotalScore() /
        ((aligned_fdr_cutoff / 2) ** len(x.peakgroups)))

    clusters_rt_obj.sort(lambda x, y: cmp(
        x.getTotalScore() / ((aligned_fdr_cutoff / 2) ** len(x.peakgroups)),
        y.getTotalScore() / ((aligned_fdr_cutoff / 2) ** len(y.peakgroups))))

    for i, c in enumerate(clusters_rt_obj):
        for pg in c.peakgroups:
            pg.setClusterID(i + 1)
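# A standalone sketch of the length normalization described in the comment
# above: a cluster's total score is divided by (aligned_fdr_cutoff / 2) ** n,
# which that comment gives as the expected value for n randomly chosen
# peakgroups. The numbers in the example are illustrative, not real
# peakgroup scores.
def normalized_cluster_score(total_score, n_peakgroups, aligned_fdr_cutoff):
    expected_random = (aligned_fdr_cutoff / 2.0) ** n_peakgroups
    return total_score / expected_random

if __name__ == '__main__':
    # Mirroring the min() selection above, a lower normalized score is
    # better: the same raw total score is less impressive for the larger
    # cluster, because chance alone already predicts a very small combined
    # score for many peakgroups.
    print(normalized_cluster_score(1e-6, 3, 0.01))  # 8.0   (worse than chance)
    print(normalized_cluster_score(1e-6, 2, 0.01))  # 0.04  (better than chance)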
def testUnmodifiedData(self):
    cl = HierarchicalClustering(self.__data, lambda x, y: abs(x - y))
    new_data = []
    [new_data.extend(_) for _ in cl.getlevel(40)]
    self.assertEqual(sorted(new_data), sorted(self.__data))
# pt.printt()
# sort_list = fdist.keys()
# print sort_list

print "Clustering Musics"

# Define a scoring function
def score(music1, music2):
    return DISTANCE(set(music1), set(music2))

# Feed the class your data and the scoring function
hc = HierarchicalClustering(musics, score)

# Cluster the data according to a distance threshold
clusters = hc.getlevel(DISTANCE_THRESHOLD)

# Remove singleton clusters
clusters = [c for c in clusters if len(c) > 1]

######## End: HAC ########

# Round up musics who are in these clusters and group them together
clustered_musics = {}
for cluster in clusters:
    clustered_musics[tuple(cluster)] = []
    for idx, music in enumerate(musics):
        for tag in music:
######## Begin: HAC ########

# Define a scoring function
def score(title1, title2):
    return DISTANCE(set(title1.split()), set(title2.split()))

# Feed the class your data and the scoring function
hc = HierarchicalClustering(all_titles, score)

# Cluster the data according to a distance threshold
clusters = hc.getlevel(DISTANCE_THRESHOLD)

# Remove singleton clusters
# clusters = [c for c in clusters if len(c) > 1]

######## End: HAC ########

# Round up contacts who are in these clusters and group them together
clustered_contacts = {}
for cluster in clusters:
    clustered_contacts[tuple(cluster)] = []
    for contact in contacts:
        for title in contact['Job Titles']:
            if title in cluster:
                clustered_contacts[tuple(cluster)].append(
data = [24, 84, 124, 131, 134, 336, 365, 365, 391, 398,
        518, 542, 564, 594, 676, 791, 835, 940, 956, 971]
data2 = [791, 956, 676, 124, 564, 84, 24, 365, 594, 940,
         398, 971, 131, 365, 542, 336, 518, 835, 134, 391]


def test2():
    cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
    new_data = []
    for row in cl.getlevel(40):
        print(row)
    print(data)
    # [new_data.extend(_) for _ in cl.getlevel(40)]
    # self.assertEqual(sorted(new_data), sorted(self.__data))


def run(level):
    print('Level = {}'.format(level))
    cluster = HierarchicalClustering(data, lambda x, y: abs(x - y))
    result = cluster.getlevel(level)
    for row in result:
        print(row)
    print(data)


run(40)
# print(len(data))
# test2()

cl = HierarchicalClustering(data, lambda x, y: abs(x - y))
cl.getlevel(40)
print(sorted(data) == sorted(data2))
n_sequence = zip(*n_sequence)
n_sequence = [''.join(i) for i in n_sequence]
n_sequence.sort(key=len, reverse=True)

p_size = 0
new_seq = []
chunk = []
for seq in n_sequence:
    size = len(seq)
    if size == p_size:
        new_seq.append(seq)
    else:
        p_size = size
        if chunk:
            # Cluster the accumulated chunk and keep the resulting groups
            cl = HierarchicalClustering(chunk, lambda x, y: distance(x, y))
            clusters = cl.getlevel(1)
            new_seq += clusters
            print(new_seq)
            chunk = []

print(len(n_sequence))
print(len(identifiers))

out.write(bytes(''.join(identifiers), 'UTF-8'))
out.write(bytes('\n', 'UTF-8'))
out.write(bytes('\n'.join(n_sequence), 'UTF-8'))
out.write(bytes('\n', 'UTF-8'))
out.close()

in_size = os.path.getsize(in_file)
out_size = os.path.getsize(out_file)
def cluster_contacts_by_title(csv_file):

    def score(title1, title2):
        return DISTANCE((title1), (title2))

    all_titles = [
        "Student", "Assistant Professor", "Student Ambassador",
        "Assistant Developer", "Human Resources", "Software Developer",
        "Head, Technical Affairs-Software", "Sofware Engineer",
        "Software Engineer", "Design Secretary", "Telesales Executive",
        "Filmmaker", "Writer", "Data Developer", "Software Developmer",
        "Co-founder", "Assistant Manager",
        "Management Trainee - Operations", "Oracle Database Administrator",
        "Key Account Manager", "Engineering Manager",
        "Talent Acquisition Manager", "Wireless Protocol Test Intern",
        "HR Executive", "IT Company", "Business Development Manager",
        "Member of Technical Staff", "Web Designer", "ECE Student",
        "Intern", "Head of Growth", "SA", "Manager (Technology)",
        "Systems Engineer", "Technical Team Member", "Business Developer",
        "system engineer", "Infrastructure Developer", "Engineer",
        "Mechanical Engineer", "Student Technical Assistant",
        "Senior Software Engineer", "Senior Software Developer",
        "Associate Professor", "Professor", "Software developer",
        "Director - Software Engineering", "Product Manager",
    ]

    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)
    print clusters

    # Build a pairwise score matrix over the titles, taking the minimum
    # distance between the comma-separated parts of the two titles
    score_matrix = []
    min_d = 1000000
    for title1 in all_titles:
        temp = []
        for title2 in all_titles:
            li1 = title1.split(",")
            li2 = title2.split(",")
            for ll1 in li1:
                min_d = 100000
                for ll2 in li2:
                    # print ll1, ll2
                    d = score(ll1, ll2)
                    # print d
                    min_d = min(min_d, d)
                # print "done"
                # print d
                temp.append(min_d)
        score_matrix.append(temp)
    # print score_matrix
    print len(all_titles)

    # Find the closest pair of distinct titles in the score matrix
    i = j = k = l = 0
    mini = 10000
    for l1 in score_matrix:
        j = 0
        for l2 in l1:
            if l2 < mini and i != j:
                mini = l2
                k = i
                l = j
            j = j + 1
        i = i + 1
    # print "%d %d", (k, l)
    # print mini

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]
    # print clusters

    # Round up contacts who are in these clusters and group them together
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]
    separators = ['/', 'and', '&']

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Job Title'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Job Title']]
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    titles.remove(title)
                    titles.extend([
                        title.strip() for title in title.split(separator)
                        if title.strip() != ''
                    ])
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles

    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append(
                        '%s %s ' % (contact['First Name'],
                                    contact['Last Name']))

    return clustered_contacts
def cluster_contacts_by_title(csv_file):
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
    ]
    separators = ['/', 'and', '&']

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]

    # Normalize and/or replace known abbreviations
    # and build up list of common titles
    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Job Title'] == '':
            contacts[i]['Job Titles'] = ['']
            continue
        titles = [contacts[i]['Job Title']]
        for title in titles:
            for separator in separators:
                if title.find(separator) >= 0:
                    titles.remove(title)
                    titles.extend([
                        title.strip() for title in title.split(separator)
                        if title.strip() != ''
                    ])
        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
        contacts[i]['Job Titles'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    print "Scoring....", "\n"

    # Define a scoring function
    def score(title1, title2):
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    print "Clustering contacts by title....", "\n"

    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Job Titles']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append(
                        '%s %s' % (contact['First Name'],
                                   contact['Last Name']))

    return clustered_contacts
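# DISTANCE and DISTANCE_THRESHOLD are defined outside the function above.
# Since score() passes sets of whitespace-separated words, a plausible
# stand-in is Jaccard distance over those word sets; this is an assumption
# for illustration, not necessarily the original project's choice.
def jaccard_distance(a, b):
    # 1 - |intersection| / |union| for two sets of tokens.
    if not a and not b:
        return 0.0
    return 1.0 - len(a & b) / float(len(a | b))

DISTANCE = jaccard_distance
DISTANCE_THRESHOLD = 0.5  # illustrative threshold, tune for the data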
def run(level):
    print('Level = {}'.format(level))
    cluster = HierarchicalClustering(data, lambda x, y: abs(x - y))
    result = cluster.getlevel(level)
    for row in result:
        print(row)