def mClustering(dps, t):
    """
    Find the power-of-two interval in which cohesion stops changing quickly.

    For k = 1, 2, ... run k-means with v = 2**(k-1) and 2v = 2**k clusters
    and compute the rate of change
        |cohesion(v) - cohesion(2v)| / (cohesion(v) * v).
    The loop stops at the first k whose rate of change is below ``t``.

    dps: data points (passed to ``len`` and to ``km.kmeans``)
    t: theta, the rate-of-change threshold
    return: the left boundary of the desired range, 2**(k-2) for the
            stopping k; clamped to 1 when the loop already stops at k = 1,
            because 2**(-1) is not a usable number of clusters.

    If 2**k exceeds the number of data points before the threshold is
    reached, a message is printed and SystemExit is raised.
    """
    num_of_data_points = len(dps)
    k = 0  # exponent of the upper cluster count, 2**k
    while True:
        k += 1
        upper = 2 ** k       # 2v
        lower = upper // 2   # v == 2**(k-1)
        if upper > num_of_data_points:
            # One pre-formatted argument prints identically on Python 2 and 3.
            print("Special case (2v > N), k* = n: %s" % num_of_data_points)
            # Same exception quit() raises, without relying on the site module.
            raise SystemExit
        _, cohesion_v = km.kmeans(dps, lower)
        _, cohesion_2v = km.kmeans(dps, upper)
        rate_of_chg = abs(cohesion_v - cohesion_2v) / (cohesion_v * lower)
        if rate_of_chg < t:
            break
    # The flat interval is [2**(k-1), 2**k]; report the start of the one
    # before it, but never go below a single cluster.
    return 2 ** max(k - 2, 0)
def binarySearch(left, dps, t):
    """
    Perform binary search among the range [x, y] = [left, 2*left], where z
    is the midpoint, narrowing the range until x and y are adjacent.

    At every step k-means is run for z and y and the rate of change
        |cohesion(y) - cohesion(z)| / (cohesion(z) * (y - z))
    is compared with ``t``: below ``t`` the search switches to [x, z],
    otherwise it stays at [z, y].

    left: left boundary of desired range (an integer number of clusters)
    dps: data points (passed to ``km.kmeans``)
    t: theta, the rate-of-change threshold
    return: k-star, whichever of the two final adjacent boundaries has the
            smaller cohesion value (the right one on a tie).
    """
    right = 2 * left
    # Checking the width up front keeps midpoint strictly between the bounds,
    # so (right - midpoint) is never zero, even when left == 1.
    while right - left > 1:
        midpoint = (right + left) // 2  # floor division on Python 2 and 3
        _, cohesion_m = km.kmeans(dps, midpoint)
        _, cohesion_r = km.kmeans(dps, right)
        rate_of_chg = abs(cohesion_r - cohesion_m) / (cohesion_m * (right - midpoint))
        if rate_of_chg < t:
            right = midpoint  # [z,y] is not qualified => switch to [x,z]
        else:
            # [z,y] is qualified => stay at [z,y]. Equality lands here too,
            # so one of the bounds always moves and the loop terminates.
            left = midpoint
    # Bounds are adjacent: output whichever gives better cohesion,
    # in other words, smaller diameter.
    _, cohesion_l = km.kmeans(dps, left)
    _, cohesion_r = km.kmeans(dps, right)
    if cohesion_l < cohesion_r:
        return left
    return right