def getBestKValue(zipcodes, maxClusters):
    """Pick the number of clusters for ``zipcodes`` using the gap statistic.

    Args:
        zipcodes: Data set passed unchanged to ``gap.gap_statistic``.
        maxClusters: Exclusive upper bound on the candidate k values; the
            candidates are ``range(2, maxClusters)``.

    Returns:
        ``maxClusters`` itself when it equals 1, 1 when the gap statistic
        produced no results, otherwise the value chosen by
        ``gap.find_optimal_k``.
    """
    # With a single allowed cluster there is nothing to search for.
    if maxClusters == 1:
        return maxClusters

    gaps, s_k, K = gap.gap_statistic(
        zipcodes, refs=None, B=10, K=range(2, maxClusters), N_init=10)

    # Keep the explicit length check: ``gaps`` may be an array-like whose
    # truth value is ambiguous, so ``not gaps`` would not be a safe rewrite.
    if len(gaps) == 0:
        # Function-call form prints the same text on Python 2 and Python 3;
        # the former ``print 1`` statement was a SyntaxError on Python 3.
        print(1)
        return 1

    return gap.find_optimal_k(gaps, s_k, K)
def main():
    """Cluster SOM centroids with k-means, choosing k via the gap statistic.

    Command-line arguments:
        sys.argv[1]: Path to a text file holding one centroid per line,
            written as comma-separated floats.
        sys.argv[2]: Path of the file the cluster centroids are written to.

    Exits with an error message when no centroids can be read from the
    input path (file missing or containing no data rows).
    """
    file_path = sys.argv[1]
    output_path = sys.argv[2]

    # Load the SOM centroids: each non-blank line is one centroid.
    som_centroids = []
    if os.path.exists(file_path):
        # ``with`` guarantees the handle is closed even if parsing fails.
        with open(file_path, 'r') as in_file:
            for line in in_file:
                # Skip blank lines (e.g. a trailing newline) rather than
                # failing inside float().
                if not line.strip():
                    continue
                som_centroids.append([float(i) for i in line.split(",")])

    # Without data the gap statistic and k-means cannot run; stop with a
    # clear message instead of an obscure failure further down.
    if not som_centroids:
        sys.exit("No SOM centroids could be read from " + file_path)

    data = np.array(som_centroids)

    # Compute the gap statistic and use it to find the best k-value.
    gaps, s_k, K = gap.gap_statistic(
        data, refs=None, B=10, K=range(1, len(som_centroids)), N_init=10)
    bestKValue = gap.find_optimal_k(gaps, s_k, K)

    # Print message to user.
    print("Optimal K is " + str(bestKValue))

    # Perform k-means clustering.
    kmeans = KMeans(n_clusters=bestKValue, random_state=0).fit(data)

    # Write the k-means cluster centroids; the context manager flushes and
    # closes the output file once print_centroids returns.
    with open(output_path, 'w') as out_file:
        print_centroids(kmeans.cluster_centers_, som_centroids, out_file)

    # Print message to user.
    print("Clustering complete.")
def eval(data, refs=None, nrefs=20, ks=range(2, 100), n_init=10):
    """
    Return the optimal number of clusters for ``data`` via the Gap statistic.

    The computation is delegated to the ``gapkmean`` package: the arguments
    are forwarded to ``gap.gap_statistic`` and its three results are handed
    to ``gap.find_optimal_k``, whose return value is returned unchanged.

    Parameters
    ----------
    data : an nxm dataset.
    refs : optional precomputed reference distributions, given as an
        (n,m,k) scipy array; forwarded as ``refs``.
    nrefs : number of reference distributions to generate automatically
        (uniform within the bounding box of ``data``) when ``refs`` is not
        supplied; forwarded as ``B``.
    ks : the k-values for which the statistic is computed; forwarded as
        ``K``.
    n_init : forwarded as ``N_init``.

    Reference
    ---------
    https://pypi.python.org/pypi/gapkmean/1.0
    """
    # Imported lazily so the dependency is only needed when this is called.
    from gap import gap

    gap_values, deviations, candidate_ks = gap.gap_statistic(
        data,
        refs=refs,
        B=nrefs,
        K=ks,
        N_init=n_init,
    )
    return gap.find_optimal_k(gap_values, deviations, candidate_ks)