"""Elbow-method helper for choosing 'k' in k-means clustering.

This script uses the elbow method to help identify a good value of 'k'
to use for k-means clustering.

@author: Chris McCormick
"""

from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from simsearch import SimSearch
import numpy as np
import matplotlib.pyplot as plt

# Restore the previously saved SimSearch objects (and their corpus) from the
# './mhc_corpus/' directory. SimSearch.load returns a pair; it is unpacked
# here into `ksearch` and `ssearch`.
print('Loading the saved SimSearch and corpus...')
ksearch, ssearch = SimSearch.load(save_dir='./mhc_corpus/')

# The dataset to be clustered is taken straight from the similarity index.
# NOTE: per the original author's note, the index is stored with all of the
# vectors *already normalized*, so no normalization is applied here.
# NOTE(review): presumably X is a 2-D array with one vector per row -- confirm
# against the SimSearch implementation.
X = ssearch.index.index

# If the vectors ever did need normalizing, a per-row L2 normalization
# (assuming one vector per row) would look like:
#   norms = np.linalg.norm(X, axis=1, keepdims=True)
#   X = X / norms

# Accumulators for the values that will be plotted: `plotx` for the x-axis and
# `ploty1` / `ploty2` for two y-series. They start out empty; whatever fills
# them lies outside this section of the script.
plotx, ploty1, ploty2 = [], [], []