def getBestKValue(zipcodes, maxClusters):
    """Return the number of clusters selected by the gap statistic.

    Args:
        zipcodes: The data set handed unchanged to ``gap.gap_statistic``.
        maxClusters: Exclusive upper bound on the candidate cluster counts;
            the candidates evaluated are ``range(2, maxClusters)``.
            NOTE(review): ``maxClusters`` itself is never evaluated --
            confirm whether the bound was meant to be inclusive.

    Returns:
        ``maxClusters`` when it equals 1, ``1`` when ``gap.gap_statistic``
        yields no gap values, otherwise the result of
        ``gap.find_optimal_k``.
    """
    # With a single allowed cluster there is nothing to search for.
    if maxClusters == 1:
        return maxClusters

    gaps, s_k, K = gap.gap_statistic(zipcodes,
                                     refs=None,
                                     B=10,
                                     K=range(2, maxClusters),
                                     N_init=10)

    # len() rather than truthiness: ``gaps`` may be an array-like whose
    # truth value is ambiguous.
    if len(gaps) == 0:
        # Function-call form prints the same text on Python 2 and Python 3;
        # the former ``print 1`` statement was a SyntaxError on Python 3.
        print(1)
        return 1

    return gap.find_optimal_k(gaps, s_k, K)
Example #2
0
def main():
    """Cluster SOM centroids from a file and write the k-means centres.

    Command-line arguments (read from ``sys.argv``):
        1. Path of the input file; every non-blank line holds one centroid
           as comma-separated floats.
        2. Path of the output file that is opened for writing and handed to
           ``print_centroids``.

    When the input path does not exist, no clustering is performed and no
    output file is created.  The final "Clustering complete." message is
    printed in either case.
    """
    file_path = sys.argv[1]
    output_path = sys.argv[2]

    # Obtain the SOM centroids and cluster them.
    som_centroids = []
    if os.path.exists(file_path):
        # Read one centroid per line.  The context manager guarantees the
        # handle is closed even if a line fails to parse.
        with open(file_path, 'r') as centroid_file:
            for line in centroid_file:
                # Skip blank lines (e.g. a trailing newline at the end of
                # the file); float() cannot parse them.
                if not line.strip():
                    continue
                som_centroids.append(
                    [float(value) for value in line.split(",")])

        # Build the array once and reuse it for both library calls.
        data = np.array(som_centroids)

        # Compute the gap statistic and use it to find the best k-value.
        gaps, s_k, K = gap.gap_statistic(data,
                                         refs=None,
                                         B=10,
                                         K=range(1, len(som_centroids)),
                                         N_init=10)
        bestKValue = gap.find_optimal_k(gaps, s_k, K)

        # Print message to user.
        print("Optimal K is " + str(bestKValue))

        # Perform k-means clustering (fixed seed keeps runs reproducible).
        kmeans = KMeans(n_clusters=bestKValue, random_state=0).fit(data)

        # Write the k-means cluster centres; closing the file via ``with``
        # ensures the output is flushed to disk.
        with open(output_path, 'w') as output_file:
            print_centroids(kmeans.cluster_centers_, som_centroids,
                            output_file)

    # Print message to user.
    print("Clustering complete.")
Example #3
0
def eval(data, refs=None, nrefs=20, ks=range(2, 100), n_init=10):
    """
    Choose a cluster count for an nxm dataset via the Gap statistic.

    Thin wrapper around the ``gap`` module of the third-party gapkmean
    package (not part of the standard library): the output of
    ``gap.gap_statistic`` is passed straight to ``gap.find_optimal_k`` and
    that result is returned.

    Parameters
    ----------
    data : the nxm dataset to analyse.
    refs : optional precomputed reference distributions as an (n,m,k)
        scipy array; forwarded as ``refs``.
    nrefs : number of reference distributions to generate automatically
        (uniform within the bounding box of data) when ``refs`` is not
        given; forwarded as ``B``.
    ks : the k-values for which the statistic is computed; forwarded as
        ``K``.
    n_init : forwarded as ``N_init``.

    Returns
    -------
    Whatever ``gap.find_optimal_k`` reports as the optimal k.

    Reference
    ---------
    https://pypi.python.org/pypi/gapkmean/1.0

    """
    from gap import gap

    statistic_options = dict(refs=refs, B=nrefs, K=ks, N_init=n_init)
    gap_values, s_values, tested_ks = gap.gap_statistic(data,
                                                        **statistic_options)
    return gap.find_optimal_k(gap_values, s_values, tested_ks)