def find_epsilon(feature_file, min_samp, kupa, number_of_features, dest):
    """Estimate a DBSCAN-style epsilon from the knee of a k-distance curve.

    The feature matrix is read with ``np.loadtxt``, standardised with
    ``StandardScaler`` and queried for ``min_samp + 1`` neighbours. Column
    ``min_samp`` of the returned distances is sorted ascending and the knee
    of that curve (convex, increasing) is taken as epsilon. The knee plot is
    written to ``{dest}results/Elbow_{kupa}_Mfcc_{number_of_features}_features.png``.

    Args:
        feature_file: Path (or file object) accepted by ``np.loadtxt``.
        min_samp: Neighbour rank whose distance is used for the curve.
        kupa: Label used in the plot title and output file name.
        number_of_features: Feature count, used only in title / file name.
        dest: Output prefix; it is concatenated directly with ``results/``,
            so it must already end with a path separator.

    Returns:
        The distance value at the knee index.

    Raises:
        ValueError: If ``KneeLocator`` reports no knee.
    """
    X = StandardScaler().fit_transform(np.loadtxt(feature_file))

    neighbors = NearestNeighbors(n_neighbors=min_samp + 1).fit(X)
    distances, _ = neighbors.kneighbors(X)
    distances = np.sort(distances[:, min_samp], axis=0)

    knee = KneeLocator(np.arange(len(distances)),
                       distances,
                       S=1,
                       curve='convex',
                       direction='increasing',
                       interp_method='polynomial')
    if knee.knee is None:
        # Indexing with None would silently return a 2-D view instead of a
        # scalar epsilon, so fail loudly.
        raise ValueError(f"No knee found in the k-distance curve for {kupa}")
    epsilon = distances[knee.knee]

    # plot_knee() opens its own figure; pass the size to it rather than
    # creating a separate figure that is never drawn on or closed.
    knee.plot_knee(figsize=(5, 5))
    plt.title(
        f"Elbow for {kupa}\nMfcc-{number_of_features}-features\nEpsilon={epsilon}"
    )
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.savefig(
        f'{dest}results/Elbow_{kupa}_Mfcc_{number_of_features}_features.png')
    plt.close()
    print(epsilon)
    return epsilon
def feature_selection(data, target, method=c.XGB, verbose=False):
    """Select feature names related to ``target``.

    With ``method == c.COR`` the rows of ``data.corr()`` whose correlation
    with ``target`` lies strictly between 0.2 and 0.8 are returned.
    Otherwise an XGBoost regressor is trained, features are ranked by their
    normalised ``total_cover`` importance, and the features ranked before
    the knee of that curve are returned with ``target`` appended.

    Args:
        data: DataFrame containing the features and the ``target`` column.
        target: Name of the target column.
        method: ``c.COR`` for the correlation filter; any other value uses
            the XGBoost ranking.
        verbose: When true, draw heatmaps and the knee plot.

    Returns:
        A list of column names.
    """
    if method == c.COR:
        corr_matrix = data.corr()
        if verbose:
            sns.heatmap(corr_matrix, cmap='Blues', annot=True)
            plt.show()
        target_corr = corr_matrix[target]
        in_band = (target_corr > 0.2) & (target_corr < 0.8)
        return corr_matrix.loc[in_band].index.tolist()

    booster_params = {
        'eta': 0.05,
        'max_depth': 10,
        'subsample': 1.0,
        'colsample_bytree': 0.7,
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }
    # Work on a deep copy so the caller's frame keeps its target column.
    predictors = data.copy(deep=True)
    response = predictors.pop(target)
    dtrain = xgb.DMatrix(predictors,
                         response,
                         feature_names=predictors.columns.values)
    booster = xgb.train(booster_params, dtrain, num_boost_round=1000)

    scores = booster.get_score(importance_type='total_cover')
    imp = pd.DataFrame(scores, index=range(1)).T
    imp.columns = ["Importance"]
    imp = imp.sort_values(by=["Importance"], ascending=False)
    imp /= imp.sum()
    if verbose:
        sns.heatmap(imp, cmap='Blues', annot=True)
        plt.show()

    # Rank position of each feature, used as the x-axis of the knee search.
    imp["x"] = range(len(imp))
    # online=True usually helps here; drop it if the selection gets worse.
    kneedle = KneeLocator(imp.x,
                          imp.Importance,
                          curve="convex",
                          direction="decreasing",
                          online=True)
    if verbose:
        kneedle.plot_knee()
    selected = imp.iloc[0:kneedle.knee].index.tolist()
    return selected + [target]
def find_eps(dataset, n_neighbors=6):
    """Return the knee of the sorted k-distance curve of ``dataset``.

    For every sample the distance in the last column of an
    ``n_neighbors``-neighbour query is collected; the sorted values are
    plotted and the knee of the curve (convex, increasing) is located with
    ``KneeLocator``. The knee plot is saved as ``Distance_curve.png``.

    Args:
        dataset: Array-like accepted by ``NearestNeighbors.fit``.
        n_neighbors: Number of neighbours queried. Defaults to 6, the value
            that used to be hard-coded.

    Returns:
        The distance value at the knee index.

    Raises:
        ValueError: If ``KneeLocator`` reports no knee.
    """
    neighbors = NearestNeighbors(n_neighbors=n_neighbors).fit(dataset)
    distances, _ = neighbors.kneighbors(dataset)
    distances = np.sort(distances[:, n_neighbors - 1], axis=0)

    # Raw curve; left open so a caller can still plt.show() it.
    plt.figure(figsize=(5, 5))
    plt.plot(distances)
    plt.xlabel("Points")
    plt.ylabel("Distance")

    knee = KneeLocator(np.arange(len(distances)),
                       distances,
                       S=1,
                       curve='convex',
                       direction='increasing',
                       interp_method='polynomial')
    if knee.knee is None:
        # distances[None] would return a reshaped array, not a scalar eps.
        raise ValueError("No knee found in the k-distance curve")

    # plot_knee() creates its own figure, so hand it the size instead of
    # opening an extra, never-used empty figure first.
    knee.plot_knee(figsize=(5, 5))
    plt.xlabel("Points")
    plt.ylabel("Distance")
    plt.savefig("Distance_curve.png", dpi=300)
    return distances[knee.knee]
def knee_filt(data):
    """Group attributes whose log-gap is at or above the knee of its CDF.

    For each ``(v, f)`` pair in ``data`` the score ``log(|v - f| + 1)`` is
    computed. An 80-bin histogram of the scores is accumulated and the knee
    of that cumulative curve (concave, increasing) becomes the threshold.
    Attributes whose score reaches the threshold are grouped under the key
    returned by ``cal_dev`` for their pair.

    Args:
        data: Mapping from attribute to a pair of numeric values.

    Returns:
        ``defaultdict(list)`` mapping ``cal_dev(...)`` results to attributes.
    """
    log_gaps = [np.log(np.abs(v - f) + 1) for v, f in data.values()]
    counts, edges = np.histogram(log_gaps, 80)
    kneedle = KneeLocator(edges[1:],
                          np.cumsum(counts),
                          S=1.0,
                          curve='concave',
                          direction='increasing',
                          online=True)
    kneedle.plot_knee()
    threshold = kneedle.knee

    grouped = defaultdict(list)
    for attr, pair in data.items():
        first, second = pair[0], pair[1]
        if np.log(np.abs(first - second) + 1) >= threshold:
            grouped[cal_dev(first, second)].append(attr)
    return grouped
def Power_decrease_rate(self, Fs, cutoff, plot: bool):
    """Compute the power drop between the filtered maximum and the knee.

    ``self.power`` is low-pass filtered (order 1), sampled at
    ``self.valley_id`` and indexed by the matching ``self.time`` values. A
    knee (concave, decreasing) is searched in the trailing
    ``round(0.1 * n)`` samples. Sets ``self.P_max`` and
    ``self.P_inflection``.

    Args:
        Fs: Sampling frequency forwarded to the low-pass filter.
        cutoff: Cut-off frequency forwarded to the low-pass filter.
        plot: When true, show the knee plot.

    Returns:
        Tuple ``(Pv, delta_P, P_max)`` where ``delta_P`` is
        ``P_max - P_inflection`` and ``Pv`` is ``delta_P`` divided by the
        absolute time gap between the maximum and the knee.
    """
    t_ = self.time[self.valley_id].values
    P_by_lowpass = effectiv_trans.butter_lowpass_filter(data=self.power,
                                                        cutoff=cutoff,
                                                        Fs=Fs,
                                                        order=1)
    P_by_lowpass = P_by_lowpass[self.valley_id]
    P_by_lowpass_series = pd.Series(P_by_lowpass, index=t_)

    # Detect the inflection point in the last 10% of the samples. Both
    # arrays have the same length (the Series above requires it), so one
    # tail size serves both.
    # NOTE(review): when round() yields 0 the slice [-0:] covers the whole
    # series instead of an empty tail - confirm this is acceptable.
    tail = round(len(t_) * 0.1)
    kl = KneeLocator(t_[-tail:],
                     P_by_lowpass[-tail:],
                     curve="concave",
                     direction="decreasing")
    if plot:
        kl.plot_knee()
        plt.show()

    inflection_pt_id = kl.knee
    max_id = P_by_lowpass_series.idxmax()
    self.P_max = max(P_by_lowpass_series)
    self.P_inflection = kl.knee_y

    delta_P = self.P_max - self.P_inflection
    delta_t = abs(max_id - inflection_pt_id)
    Pv = delta_P / delta_t
    return Pv, delta_P, self.P_max
def perform_pca(features, datasetLabel):
    """Fit a full PCA and save the explained-variance elbow plot.

    Fits ``PCA(random_state=120)`` on ``features``, locates the elbow of the
    explained-variance curve (convex, decreasing), prints it, and saves the
    plot to ``plots/dr/pca/<datasetLabel>/variance_pca.png``.

    Args:
        features: 2-D array-like passed to ``PCA.fit``.
        datasetLabel: Sub-directory name used in the output path.

    Returns:
        None.
    """
    data_variance = PCA(random_state=120).fit(features).explained_variance_

    # Size the x-axis from the variance vector itself so it always matches
    # the number of components PCA actually produced.
    component_ids = np.arange(start=1, stop=len(data_variance) + 1)

    kl = KneeLocator(component_ids,
                     data_variance,
                     curve="convex",
                     direction="decreasing")
    print(kl.elbow)

    kl.plot_knee()
    plt.xlabel('Features')
    plt.ylabel('Variance')
    plt.title('Variance vs features')
    plt.grid(True)
    plt.savefig(f'plots/dr/pca/{datasetLabel}/variance_pca.png')
    # Close (not just clear) the figure plot_knee() opened so repeated calls
    # do not accumulate open figures.
    plt.close()
def set_n_pcs(self, min_n_pcs=5):
    """Pick the number of principal components from the variance-ratio knee.

    Reads ``self.adata.uns['pca']['variance_ratio']``, locates the knee of
    ``1 - ratio`` (concave, increasing, sensitivity ``self.pca_s``) and
    stores ``max(knee + 1, min_n_pcs)`` in ``self.n_pcs``. Two figures are
    written under ``self.out``, and the leading ``n_pcs`` columns of
    ``adata.obsm['X_pca']`` are stored as ``adata.obsm['X_pcs']``.

    Args:
        min_n_pcs: Lower bound for the selected number of components.
    """
    variance_ratio = np.array(self.adata.uns['pca']['variance_ratio'])
    pc_index = np.arange(len(variance_ratio))

    # Knee detection on the 0-based component index.
    kneedle = KneeLocator(pc_index,
                          1 - variance_ratio,
                          S=self.pca_s,
                          curve='concave',
                          direction='increasing')
    # Shift to a 1-based count before applying the lower bound.
    self.n_pcs = max(kneedle.knee + 1, min_n_pcs)

    # Variance-ratio curve with the chosen cut-off marked (1-based x-axis).
    fig, ax = plt.subplots()
    ax.plot(pc_index + 1, variance_ratio, '-')
    ax.axvline(x=self.n_pcs, color='red')
    ax.set(xlabel='PC',
           ylabel='PCA variance ratio',
           title='n_pcs={}, S={}'.format(self.n_pcs, self.pca_s))
    fig.savefig(os.path.join(self.out, 'pca/pca_variance_ratio_cutoff.png'))

    # Diagnostic plot produced by kneed itself.
    kneedle.plot_knee()
    plt.savefig(os.path.join(self.out, 'pca/pca_kneedle.png'))

    # Add a representation restricted to the top components.
    top_pcs = self.adata.obsm['X_pca'][:, :self.n_pcs]
    self.adata.obsm['X_pcs'] = top_pcs
def findKneeValue(mu):
    """Estimate epsilon from the knee of the sorted ``mu``-th neighbour distances.

    Uses the feature vectors in ``globals.feature_list_q``. For each vector
    the distance in column ``mu - 1`` of a ``mu``-neighbour query is taken,
    the distances are sorted ascending, and the knee of that curve (convex,
    increasing) is located. The curve, the knee and a dashed line at the
    resulting epsilon are drawn on the current matplotlib figure.

    Args:
        mu: Number of neighbours to query.

    Returns:
        The distance interpolated at the knee position.
    """
    feature_list_np = np.array(globals.feature_list_q)
    nbrs = NearestNeighbors(n_neighbors=mu).fit(feature_list_np)
    dist, _ = nbrs.kneighbors(feature_list_np, return_distance=True)
    sorted_dist = sorted(dist[:, mu - 1])

    # 1-based sample positions, sized from the data itself. They used to be
    # derived from a directory listing, which broke whenever the file count
    # differed from the number of feature vectors.
    sample_ids = list(range(1, len(sorted_dist) + 1))

    kn = KneeLocator(sample_ids,
                     sorted_dist,
                     curve='convex',
                     direction='increasing')
    epsilon = np.interp(kn.knee, sample_ids, sorted_dist)

    kn.plot_knee()
    plt.xlabel('Sample points')
    plt.ylabel('Epsilon')
    plt.plot(sample_ids, sorted_dist)
    plt.hlines(epsilon, plt.xlim()[0], plt.xlim()[1], linestyles='dashed')
    print("Knee is at : {},{}".format(kn.knee, epsilon))
    return epsilon
dimensions=(1, 2, 3, 4))
plt.show()

# K-means
# Elbow method - SSE = sum of squared errors (KMeans inertia), one fit per
# k in 1..14 on x_norm.
sse = []
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_norm)
    sse.append(kmeans.inertia_)

# Locate the elbow of the convex, decreasing SSE curve.
kl = KneeLocator(range(1, 15), sse, curve='convex', direction='decreasing')
nbr_cluster = kl.elbow
# Unbound-method call; same effect as kl.plot_knee().
KneeLocator.plot_knee(kl)
plt.show()
print(nbr_cluster)

# Print the SSE of a 4-cluster fit.
# NOTE(review): n_clusters is hard-coded to 4 rather than nbr_cluster, and
# this fit sets no random_state - confirm both are intended.
km = KMeans(n_clusters=4)
km.fit(x_norm)
print(km.inertia_)

# Plotting the cluster centers and the data points on a 2D plane
plt.scatter(ml_data['PC1'], ml_data['PC2'])
plt.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1], c='red',
# %%
# Locate the elbow of the WSS curve over the tested cluster counts.
kneedle = KneeLocator(
    km_stat["n_clusters"],
    km_stat["wss"],
    curve='convex',
    direction='decreasing',
    S=1.0,
    interp_method="interp1d",
    online=False,
)
print("The number of cluster according to elbow method:", kneedle.knee)
print("The corresponding Within-Cluster-Sum of Squared Errors (WSS):",
      kneedle.knee_y)

# %%
# Plot Number of clusters against Within-Cluster-Sum of Squared Errors
kneedle.plot_knee(figsize=plt_cfg.figsize)
plt.xlabel("Number of clusters")
plt.ylabel("Within-Cluster-Sum of Squared Errors")
# One tick per tested k.
k_ticks = np.arange(min(list_k), max(list_k) + 1)
plt.xticks(k_ticks)
plt.tight_layout()
plt.savefig("results/knee.png")
plt.show()

# %%
# Plot the normalized knee curves
kneedle.plot_knee_normalized(figsize=plt_cfg.figsize)
plt.tight_layout()
plt.savefig("results/knee_normalized.png")
plt.show()

# %% [markdown]
from scipy.interpolate import interp1d

# Load the SSE per cluster count; JSON object keys are strings, so they are
# converted back to ints before sorting.
with open("sse_minibatch.json", "r") as f:
    sse_ = json.load(f)

n_clusters = sorted(int(key) for key in sse_.keys())
sse = {int(key): value for key, value in sse_.items()}
x = n_clusters
y = [sse[count] for count in x]

# Knee of the convex, decreasing SSE curve.
kneedle = KneeLocator(
    x,
    y,
    curve='convex',
    direction='decreasing',
    S=1.0,
    interp_method="polynomial",
    online=True,
)
print(kneedle.knee)
print(kneedle.knee_y)

# Save the raw and the normalized knee plots.
plt.style.use('fivethirtyeight')
wide = (18, 7)
kneedle.plot_knee(figsize=wide)
plt.savefig("knee.png")
kneedle.plot_knee_normalized(figsize=wide)
plt.savefig("knee_normal.png")
#Using df['close'] as the input array for clustering can also give out supports and resistances #X = np.array(df['close']) X = np.delete(X, 0) sum_of_squared_distances = [] K = range(1, 15) for k in K: km = KMeans(n_clusters=k) km = km.fit(X.reshape(-1, 1)) sum_of_squared_distances.append(km.inertia_) kn = KneeLocator(K, sum_of_squared_distances, S=1.0, curve="convex", direction="decreasing") kn.plot_knee() #plt.plot(sum_of_squared_distances) kmeans = KMeans(n_clusters=kn.knee).fit(X.reshape(-1, 1)) c = kmeans.predict(X.reshape(-1, 1)) minmax = [] for i in range(kn.knee): minmax.append([-np.inf, np.inf]) for i in range(len(X)): cluster = c[i] if X[i] > minmax[cluster][0]: minmax[cluster][0] = X[i] if X[i] < minmax[cluster][1]: minmax[cluster][1] = X[i] """" for i in range(len(X)):
def GMM_clustering_R(X_method_df, method, default_cluster_num=None):
    """Check BIC and perform GMM clustering on an embedded dataset (R mclust).

    ``mclustBIC`` is evaluated for 1..11 components. The parametrization
    with the highest BIC is kept, and the knee of its BIC curve (concave,
    increasing) gives the number of components unless
    ``default_cluster_num`` overrides it. ``Mclust`` is then fitted with
    that single component count.

    Args:
        X_method_df: Embedded data passed to ``mclustBIC`` / ``Mclust``.
        method: Embedding class; only ``method.__name__`` is used (titles).
        default_cluster_num: If given, used instead of the detected knee.
            It is looked up in the 1..11 candidate series.

    Returns:
        Tuple ``(method_clusters, method_means, method_z,
        method_uncertainty)``. ``method_means`` is a DataFrame with columns
        ``V1..Vk``; the others are NumPy arrays.

    Raises:
        TypeError: If no knee is found and ``default_cluster_num`` is None.
    """
    # First, import R packages and fix the random seed.
    base = importr('base')
    mclust = importr('mclust')
    ro.r('set.seed(0)')

    # Candidate component counts 1..11 (np.arange excludes the stop value).
    num_components_to_try = pd.Series(np.arange(1, 12))
    with localconverter(ro.default_converter + pandas2ri.converter):
        ro.r('set.seed(0)')
        BIC_method = mclust.mclustBIC(X_method_df, G=num_components_to_try)

    model_names = [
        'EII', 'VII', 'EEI', 'VEI', 'EVI', 'VVI', 'EEE', 'EVE', 'VEE', 'VVE',
        'EEV', 'VEV', 'EVV', 'VVV'
    ]
    sns.set(style="darkgrid")
    BIC_method_df = pd.DataFrame(BIC_method, columns=model_names)
    # Drop parametrizations with NaNs.
    BIC_method_df = BIC_method_df.dropna(axis=1)
    BIC_method_df.plot(marker='o')
    plt.title('GMM BIC on ' + method.__name__)

    # Find the knee of the BIC curve of the best GMM parametrization.
    best_parametrization = BIC_method_df.columns[BIC_method_df.max().argmax()]
    kneedle = KneeLocator(num_components_to_try,
                          BIC_method_df[best_parametrization],
                          S=1,
                          curve='concave',
                          direction='increasing',
                          interp_method='polynomial')
    kneedle.plot_knee()
    plt.title('GMM BIC on ' + method.__name__ + ': Knee Point')
    plt.xlabel('num_GMM_components')
    plt.ylabel('')
    print('Elbow point: {} components with BIC {}'.format(
        kneedle.knee, kneedle.knee_y))

    # Position of the chosen count inside num_components_to_try. The
    # override is checked first so a missing knee cannot crash a call that
    # supplies default_cluster_num.
    if default_cluster_num is not None:
        best_num_components = default_cluster_num - 1
    else:
        best_num_components = kneedle.knee - 1

    # NOTE(review): the extent of this `with` block was reconstructed from a
    # whitespace-collapsed source - confirm which statements belong inside.
    with localconverter(ro.default_converter + pandas2ri.converter):
        ro.r('set.seed(0)')
        mc = mclust.Mclust(X_method_df,
                           G=pd.Series(
                               [num_components_to_try[best_num_components]]))
    print(base.summary(mc))
    # NOTE(review): mc[15] is presumably the uncertainty entry - confirm.
    print('Uncertainty quantiles:',
          np.quantile(mc[15], [0, 0.25, 0.5, 0.75, 1]))

    # Convert the fit once and reuse the resulting dict.
    mc_dict = convert_to_python_dict(mc)
    method_model_name = mc_dict['modelName']
    print(method_model_name)
    param = mc_dict['parameters']
    method_means = np.array(convert_to_python_dict(param)['mean'])
    method_uncertainty = np.array(mc_dict['uncertainty'])
    method_z = np.array(mc_dict['z'])
    method_clusters = np.array(mc_dict['classification'])
    method_means = pd.DataFrame(
        method_means,
        columns=['V' + str(i + 1) for i in range(method_means.shape[1])])
    return method_clusters, method_means, method_z, method_uncertainty
def analysis(STATE,
             method,
             method_kwargs,
             hyperparams_to_test,
             fig,
             spec,
             row,
             precomputed=False,
             separate=False,
             two_cols=False,
             NUM_STATES=1,
             configurations=None,
             default_cluster_num=5):
    """Run the embedding -> clustering -> figure pipeline for one state.

    Steps, in order:
      1. Load the raw data for ``STATE``.
      2. Compute the embeddings with ``choose_dimension`` (or load them when
         ``precomputed``) and pick the dimension at the knee of the
         trustworthiness curve.
      3. Save the chosen embedding as CSV under
         ``configurations['DATA_PATH']/interim``.
      4. Produce the initial 2D/3D visualisations, uncoloured and coloured
         by cosine similarity.
      5. Cluster with ``GMM_clustering_R`` and relabel (or load the stored
         results when ``precomputed``), writing results under ``obj/``.
      6. Add the state to ``fig`` via ``add_state_to_fig``.

    Args:
        STATE: State identifier used for paths and file names.
        method: Embedding class. ``method.__name__`` must be ``Isomap``,
            ``SpectralEmbedding`` or ``LocallyLinearEmbedding``.
        method_kwargs: Extra keyword arguments for ``choose_dimension``.
        hyperparams_to_test: Mapping with an ``'n_components'`` sequence.
        fig, spec, row: Forwarded to ``add_state_to_fig``.
        precomputed: Load stored embeddings / clusters instead of computing.
        separate, two_cols, NUM_STATES: Forwarded to ``add_state_to_fig``;
            ``two_cols`` also swaps the cluster labels for the cosine colours.
        configurations: Mapping providing ``'DATA_PATH'``. The default
            ``None`` is subscripted unconditionally, so a mapping must be
            supplied.
        default_cluster_num: Forwarded to ``GMM_clustering_R``.

    Raises:
        ValueError: If ``method.__name__`` has no matching visualisation.
    """
    # First, define appropriate paths
    SHAPE_PATH, FIGURE_PATH, RAW_DATA_PATH, INCOME_POPULATION_PATH = define_paths(
        STATE)
    # Load the data
    covid_, X, index_X, columns_X = load_data(RAW_DATA_PATH)

    # Do dim red
    print('##################D-RED#################')
    emb_method = method
    if not precomputed:
        errors_results, embeddings_results, trustws_results = choose_dimension(
            X, emb_method, hyperparams_to_test, **method_kwargs)
        save_obj(embeddings_results,
                 STATE + '_embeddings_results' + method.__name__)
        save_obj(errors_results, STATE + '_errors_results' + method.__name__)
        save_obj(trustws_results, STATE + '_trustws_result' + method.__name__)
    else:
        embeddings_results = load_obj(STATE + '_embeddings_results' +
                                      method.__name__)
        errors_results = load_obj(STATE + '_errors_results' + method.__name__)
        trustws_results = load_obj(STATE + '_trustws_result' + method.__name__)

    candidate_dims = hyperparams_to_test['n_components']
    several_dims = len(candidate_dims) > 1

    if several_dims and (errors_results['n_components'][0] is not None):
        plt.plot(candidate_dims, errors_results['n_components'])

    if several_dims:
        kneedle = KneeLocator(candidate_dims,
                              np.array(trustws_results['n_components']),
                              S=1,
                              curve='concave',
                              direction='increasing',
                              interp_method='polynomial',
                              online=False)
        kneedle.plot_knee()
        plt.title(emb_method.__name__ + ' trustworthiness')
        plt.xlabel('n_components')
        plt.ylabel('trustworhiness')
        # Position of the knee among the candidates. np.asarray makes the
        # element-wise comparison work for plain lists as well as arrays.
        good_dim = int(
            np.squeeze(np.where(np.asarray(candidate_dims) == kneedle.knee)))
    else:
        good_dim = 0

    # Save the dataframe with optimal dim
    X_method = embeddings_results['n_components'][
        good_dim]  # pick the best (knee point) n_components
    X_method_df = pd.DataFrame(
        X_method,
        columns=['Mode {}'.format(i) for i in range(X_method.shape[1])])
    X_method_df.to_csv(
        os.path.join(
            configurations['DATA_PATH'], 'interim',
            method.__name__ + str(X_method.shape[1]) + 'D_' + STATE + '.csv'))
    print('Saving optimal embedding. Method: ', method.__name__, 'shape: ',
          X_method_df.shape)

    print('##################INITIAL VIZ#################')
    # Find the 2D and 3D embeddings and continuous colors based on that
    filename_initial = os.path.join(FIGURE_PATH, 'initial_' + method.__name__)
    if method.__name__ == 'Isomap':
        viz = viz_Isomap
    elif method.__name__ == 'SpectralEmbedding':
        viz = viz_SE
    elif method.__name__ == 'LocallyLinearEmbedding':
        viz = viz_LLE
    else:
        # Without this branch `viz` stays unbound and the call below fails
        # with an unrelated-looking error.
        raise ValueError('No visualisation available for method ' +
                         method.__name__)

    if precomputed:
        load_path = os.path.join('obj', STATE)
        save_path = None
    else:
        load_path = None
        save_path = os.path.join('obj', STATE)

    X_2D_emb, X_3D_emb = viz(X,
                             colors=None,
                             filename=filename_initial,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)
    cos_colors = find_cos_similarity(X_2D_emb)
    # Color the manifold continuously
    filename_initial_colored = os.path.join(
        FIGURE_PATH, 'initial_' + method.__name__ + '_colored')
    X_2D_emb, X_3D_emb = viz(X,
                             colors=cos_colors,
                             filename=filename_initial_colored,
                             cbar=None,
                             alpha=0.5,
                             load_path=load_path,
                             save_path=save_path)

    print('##################GMM CLUSTERING#################')
    # Import R for clustering
    importr('base')
    importr('mclust')
    ro.r('set.seed(1)')

    if not precomputed:
        clusters, means, z, uncertainty = GMM_clustering_R(
            X_method_df, method, default_cluster_num=default_cluster_num
        )  # could change this to 5 to be consistent across states to auto-id clust
        avg_per_clust = create_avg_df(clusters, index_X, covid_)
        reordered_clusters, reordered_means, reordered_z, reordered_uncertainty = relabel_clusters(
            clusters.astype('int'), avg_per_clust, means, z, uncertainty)
        reordered_avg_per_clust = create_avg_df(reordered_clusters, index_X,
                                                covid_)
        # Save
        np.save(
            os.path.join('obj', STATE + '_reordered_clusters.npy'),
            reordered_clusters,
        )
        reordered_means.to_csv(
            os.path.join('obj', STATE + '_reordered_means.csv'))
        reordered_z.to_csv(os.path.join('obj', STATE + '_reordered_z.csv'))
        np.save(os.path.join('obj', STATE + '_reordered_uncertainty.npy'),
                reordered_uncertainty)
        reordered_avg_per_clust.to_csv(
            os.path.join('obj', STATE + '_reordered_avg_per_clust.csv'))
    else:
        reordered_clusters = np.load(
            os.path.join('obj', STATE + '_reordered_clusters.npy'))
        reordered_means = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_means.csv'),
                                      index_col=0)
        reordered_z = pd.read_csv(os.path.join('obj',
                                               STATE + '_reordered_z.csv'),
                                  index_col=0)
        reordered_uncertainty = np.load(
            os.path.join('obj', STATE + '_reordered_uncertainty.npy'))
        reordered_avg_per_clust = pd.read_csv(os.path.join(
            'obj', STATE + '_reordered_avg_per_clust.csv'),
                                              index_col=0)

    # Save the data for Dennis (for only this method)
    index_with_blocks_and_save(STATE, X_method_df, X_2D_emb, X_3D_emb,
                               reordered_clusters, reordered_z,
                               reordered_uncertainty, index_X, emb_method)

    N_TIMESERIES = 5
    # The returned values are not used below; the call is kept in case the
    # helper has side effects.
    closest_to_mean_samples, closest_to_mean_block_ids = find_closest_time_series(
        X_method_df, reordered_means, covid_, index_X, n=N_TIMESERIES)

    print('##################FINAL VIZ#################')
    sns.set(style="whitegrid")
    if two_cols:
        reordered_clusters = cos_colors  # Change colors
    add_state_to_fig(STATE,
                     fig,
                     spec,
                     row,
                     NUM_STATES,
                     X,
                     reordered_clusters,
                     index_X,
                     reordered_avg_per_clust,
                     load_path=load_path,
                     save_path=save_path,
                     separate=separate,
                     two_cols=two_cols,
                     configurations=configurations)
def clustering():
    """Cluster the ``price`` column and plot per-cluster pressure/support.

    Reads the module-level ``df`` and ``target``. KMeans is fitted for
    k = 1..14, the knee of the inertia curve gives the number of clusters,
    and each cluster's maximum ("Pressure") and minimum ("Support") are
    drawn as short dashed lines centred on the first sample that has
    exactly that value. Both figures are rendered through Streamlit.
    """
    st.write(df)
    X = np.array(df["price"])
    samples = X.reshape(-1, 1)

    # Elbow search over k = 1..14.
    K = range(1, 15)
    sum_of_squared_distances = [
        KMeans(n_clusters=k).fit(samples).inertia_ for k in K
    ]
    kn = KneeLocator(K,
                     sum_of_squared_distances,
                     S=1.0,
                     curve="convex",
                     direction="decreasing")
    kn.plot_knee(figsize=(7, 3))
    st.set_option('deprecation.showPyplotGlobalUse', False)
    st.subheader("Search Number of Regime")
    st.pyplot()

    st.subheader("Plotting in Reallity")
    with st.spinner("Loading Chart..."):
        kmeans = KMeans(n_clusters=kn.knee).fit(samples)
        labels = kmeans.predict(samples)

        # Per-cluster extremes; the +/-inf sentinels survive only for a
        # cluster that received no sample.
        highs = np.full(kn.knee, -np.inf)
        lows = np.full(kn.knee, np.inf)
        for value, label in zip(X, labels):
            highs[label] = max(highs[label], value)
            lows[label] = min(lows[label], value)

        plt.figure(figsize=(11, 5), dpi=30)
        plt.title("Clustering Pressure/Support of {}".format(target),
                  fontsize=20)
        plt.ylabel("price")

        # One scatter call with per-point colours. The palette wraps around
        # so more than eight clusters cannot raise IndexError.
        colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
        point_colors = [colors[label % len(colors)] for label in labels]
        plt.scatter(np.arange(len(X)), X, c=point_colors, s=20, marker="o")

        for levels, caption, line_color in ((np.sort(highs), "Pressure", "red"),
                                            (np.sort(lows), "Support", "b")):
            for level in levels:
                # Anchor each line at the first sample equal to the level,
                # so the x-position always belongs to the level being drawn.
                hits = np.flatnonzero(X == level)
                if hits.size == 0:
                    continue
                anchor = hits[0]
                plt.hlines(level,
                           xmin=anchor - 10,
                           xmax=anchor + 10,
                           colors=line_color,
                           linestyle="--")
                plt.text(anchor - 15,
                         level,
                         "{}= {:.2f}".format(caption, level),
                         fontsize=13)
        st.pyplot()
def identify_single_knee_point(x, y, plot=False):
    """Locate the knee(s) of a convex, increasing curve with sensitivity 5.

    Args:
        x: x-values of the curve.
        y: y-values of the curve.
        plot: When true, draw the knee plot with ``plot_knee``.

    Returns:
        The locator's ``all_knees`` attribute.
    """
    locator = KneeLocator(x, y, S=5, direction="increasing", curve='convex')
    if not plot:
        return locator.all_knees
    locator.plot_knee()
    return locator.all_knees
from kneed import KneeLocator

df = pd.read_csv("../ulabox_orders_with_categories_partials_2017.csv")

# %%
# Category-share columns used as clustering features.
dfp = df[["Fresh%", "Food%", "Drinks%", "Home%", "Beauty%", "Health%", "Baby%", "Pets%"]]

# Inertia (sum of squared distances) for k = 1..10.
ssd = []
ks = range(1, 11)
for k in ks:
    km = KMeans(n_clusters=k).fit(dfp)
    ssd.append(km.inertia_)

# Knee of the convex, decreasing inertia curve.
kneedle = KneeLocator(ks, ssd, curve="convex", direction="decreasing", S=1.0)
kneedle.plot_knee()
plt.show()

k = round(kneedle.knee)
print(f"Number of clusters suggested by knee method: {k}")

# %%
# Final model on the same feature columns with the suggested k.
kmeans = KMeans(n_clusters=k).fit(dfp)

#%%
sns.histplot(data=df, x="total_items", hue=kmeans.labels_, multiple="stack")
plt.show()

#%%
sns.displot(data=df, x=kmeans.labels_, y="discount%", palette='rainbow')
plt.show()
minPts = 2 * dim
kneighbours = minPts - 1

# Build the k-distance graph and let kneed pick the epsilon at its knee.
nearest_neighbors = NearestNeighbors(n_neighbors=kneighbours)
neighbors = nearest_neighbors.fit(df_pca)
distances, indices = neighbors.kneighbors(df_pca)
# The query returns kneighbours columns, so the last one is column
# kneighbours - 1.
distances = np.sort(distances[:, -1], axis=0)

i = np.arange(distances.shape[0])
knee = KneeLocator(i,
                   distances,
                   curve='convex',
                   direction='increasing',
                   S=1,
                   interp_method='polynomial')
knee.plot_knee(figsize=[13.5, 9])
plt.xlabel("Points")
plt.ylabel("Distance")
plt.title('K-distance Graph', fontsize=15)
optimal_eps = distances[knee.knee]
plt.show()

for report_line in ('min_samples=' + str(minPts),
                    'n_neighbours=' + str(kneighbours),
                    'eps=' + str(optimal_eps)):
    print(report_line)

# Cluster with the detected epsilon and score the labelling.
dbscan = DBSCAN(eps=optimal_eps, min_samples=minPts)
dbscan.fit(df_pca)
dbscan_labels = dbscan.labels_
silhouette = silhouette_score(df_pca, dbscan_labels)