def clustering(fname="clustering.png"): # Create side-by-side axes grid _, axes = plt.subplots(ncols=2, figsize=(18,6)) X, y = make_blobs(centers=7) # Add K-Elbow to the left oz = KElbowVisualizer(MiniBatchKMeans(), k=(3,12), ax=axes[0]) oz.fit(X, y) oz.finalize() # Add SilhouetteVisualizer to the right oz = SilhouetteVisualizer(Birch(n_clusters=5), ax=axes[1]) oz.fit(X, y) oz.finalize() # Save figure path = os.path.join(FIGURES, fname) plt.tight_layout() plt.savefig(path)
def makeK(d, ilist, title):
    d = np.array(d)
    kk = pd.DataFrame({
        'Variance': d[:, 0],
        'Skewness': d[:, 1],
        'Kurtosis': d[:, 2]
    })
    K = 20
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, K))
    visualizer.fit(kk)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
    kIdx = visualizer.elbow_value_
    model = KMeans(n_clusters=kIdx).fit(kk)

    # scatter plot (Axes3D(fig) is deprecated; use a 3d projection instead)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    cmap = plt.get_cmap('gnuplot')
    clr = [cmap(i) for i in np.linspace(0, 1, kIdx)]
    for i in range(kIdx):
        ind = (model.labels_ == i)
        ax.scatter(d[ind, 2], d[ind, 1], d[ind, 0],
                   s=30, c=clr[i], label='Cluster %d' % i)
    ax.set_xlabel("Kurtosis")
    ax.set_ylabel("Skew")
    ax.set_zlabel("Variance")
    plt.title(title + ': KMeans clustering with K=%d' % kIdx)
    plt.legend()
    plt.savefig(title + "clustersnoises.png")
    plt.show()

    d = pd.DataFrame(
        {
            'Variance': d[:, 0],
            'Skewness': d[:, 1],
            'Kurtosis': d[:, 2],
            'Alpha': d[:, 3],
            'Beta': d[:, 4],
            "Psi": d[:, 5],
            "Cluster": model.labels_
        },
        index=ilist)
    return d
def elbow_test(self, df, k_values):
    # extract
    var_list = self.variable_list
    path_out = self.save_path
    # check if a directory for elbow test exists
    path_out = os.path.join(path_out, 'Elbow test results')
    if not os.path.exists(path_out):
        os.makedirs(path_out)

    # based on distortion score
    plt.figure()
    elbow_k = cluster.KMeans()
    visualizer = KElbowVisualizer(elbow_k, k=(min(k_values), max(k_values)))
    visualizer.fit(df[var_list])
    visualizer.show(outpath=os.path.join(path_out, "kelbow_kmeans.jpg"))
    # the located elbow is exposed as `elbow_value_`; there is no `knee_value`
    optimal_k = {'Distortion score': visualizer.elbow_value_}

    # based on calinski_harabasz
    plt.figure()
    visualizer = KElbowVisualizer(elbow_k,
                                  k=(min(k_values), max(k_values)),
                                  metric='calinski_harabasz',
                                  timings=False,
                                  locate_elbow=True)
    visualizer.fit(df[var_list])
    visualizer.show(
        outpath=os.path.join(path_out, "kelbow_calinski-harabasz.jpg"))
    optimal_k['Calinski-Harabasz score'] = visualizer.elbow_value_

    # based on silhouette score
    plt.figure()
    visualizer = KElbowVisualizer(elbow_k,
                                  k=(min(k_values), max(k_values)),
                                  metric='silhouette',
                                  timings=False,
                                  locate_elbow=True)
    visualizer.fit(df[var_list])
    visualizer.show(outpath=os.path.join(path_out, "silhouette.jpg"))
    optimal_k['Silhouette score'] = visualizer.elbow_value_

    # set the optimal values of k
    self.optimal_k = optimal_k
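# A minimal sketch (synthetic data, not from the pipeline above) of reading
# the located elbow back from yellowbrick: the attribute is `elbow_value_`,
# with `elbow_score_` for its score; there is no `knee_value` attribute.
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer

X_demo, _ = make_blobs(centers=5, random_state=0)
viz = KElbowVisualizer(KMeans(), k=(2, 10))
viz.fit(X_demo)
print(viz.elbow_value_)  # None if no elbow was located
print(viz.elbow_score_)  # score at the located elbow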
clf = PCA(random_state=0)
print(clf)
results = clf.fit_transform(X_train)

# n_jobs was removed from KMeans in scikit-learn 1.0; it parallelizes internally
model = KMeans(random_state=0)
# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure (poof() is a deprecated alias of show())
visualizer.show(outpath="charts/income.k-means.PCA.KElbowVisualizer.png")

model = KMeans(n_clusters=4, random_state=0)
visualizer = InterclusterDistance(model)
visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")
# Finally, build the RFM dataframe
last_date = order_payment['order_delivered_carrier_date'].max() + timedelta(
    days=1)
rfm = order_payment.groupby('customer_id').agg({
    'order_delivered_carrier_date': lambda x: (last_date - x.max()).days,
    'order_id': lambda x: len(x),
    'payment_value': 'sum'
})
rfm.dropna(inplace=True)

std = StandardScaler()
x_std = std.fit_transform(rfm)

model = KMeans()
visualizer = KElbowVisualizer(model, k=(4, 12))
visualizer.fit(x_std)
visualizer.show()

model_k = KMeans(n_clusters=4)
kmeans = model_k.fit(x_std)
rfm['cluster'] = kmeans.labels_
rfm.columns = ['Recency', 'Frequency', 'MonetaryValue', 'cluster']
rfm.head()
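# A hedged variant of the final fit above: instead of hardcoding
# n_clusters=4, reuse the elbow the visualizer located (falling back to 4
# when no elbow was found). Assumes `visualizer` and `x_std` from the
# snippet above.
chosen_k = visualizer.elbow_value_ or 4
kmeans = KMeans(n_clusters=chosen_k).fit(x_std)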
h5f.close()

h5f = h5py.File(
    '/models/mccikpc2/CPI-analysis/cnn/model_t5_epochs_50_dense64_3a_aux.h5',
    'r')
test_idx = h5f['test_idx'][:]
h5f.close()

model = KMeans()
plt.ion()
plt.show()

# elbow plot
plt.figure()
visualizer = KElbowVisualizer(model, k=(2, 30), timings=True, verbose=1)
visualizer.fit(cod1[test_idx])

# silhouette plot
plt.figure()
visualizer = KElbowVisualizer(model,
                              k=(2, 30),
                              metric='silhouette',
                              timings=True,
                              verbose=1)
visualizer.fit(cod1[test_idx])

# gaussian mixtures BIC & AIC
# https://jakevdp.github.io/PythonDataScienceHandbook/05.12-gaussian-mixtures.html
plt.figure()
n_components = np.arange(1, 21)
models = [
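    # Plausible continuation (an assumption, following the handbook linked
    # above; assumes `from sklearn.mixture import GaussianMixture`):
    GaussianMixture(n, covariance_type='full',
                    random_state=0).fit(cod1[test_idx])
    for n in n_components
]
plt.plot(n_components, [m.bic(cod1[test_idx]) for m in models], label='BIC')
plt.plot(n_components, [m.aic(cod1[test_idx]) for m in models], label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components')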
# # centroids = model.cluster_centers_  # cluster centers
# kmeans_plot(X_train, centroids)
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from yellowbrick.cluster import KElbowVisualizer

xmin = np.min(X_train[:, 0])
ymin = np.min(X_train[:, 1])
xmax = np.max(X_train[:, 0])
ymax = np.max(X_train[:, 1])

kmeans = KElbowVisualizer(KMeans(), k=(2, 10))
# kmeans = KMeans(n_clusters=n_clusters, init='random', n_init=1, random_state=0, max_iter=100)
kmeans.fit(X_train)
n_clusters = kmeans.elbow_value_

kmeans = KMeans(n_clusters=n_clusters,
                init='random',
                n_init=1,
                random_state=0,
                max_iter=100)
kmeans.fit(X_train)
y_kmeans = kmeans.predict(X_train)  # cluster index for each observation
centers = kmeans.cluster_centers_  # cluster center coordinates

fig, ax = plt.subplots()
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_kmeans, s=5, cmap='summer')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100, alpha=0.5)

from scipy.spatial import Voronoi, voronoi_plot_2d
from scipy.spatial import ConvexHull, convex_hull_plot_2d
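# A sketch of a plausible next step given the Voronoi import above (an
# assumption; the original snippet ends at the imports): draw the Voronoi
# tessellation of the fitted cluster centers over the scatter plot. Needs a
# handful of non-collinear centers for qhull to succeed.
vor = Voronoi(centers)
voronoi_plot_2d(vor, ax=ax, show_vertices=False, line_colors='gray')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
plt.show()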
#plt.plot(K, distortions, 'm*-') #plt.title('Elbow Method with distortion') #plt.xlabel('Value of k') #plt.ylabel('Distortion') #plt.vlines(4,0,25000,colors='red',linestyles ="dashed") #plt.grid() #plt.show() ############################################################################### #elbow 2 from yellowbrick.cluster import KElbowVisualizer model = kmeans visualizer = KElbowVisualizer(model, k=(4, 12)) plt.figure(11) visualizer.fit(X) visualizer.show() # # # # #X= df.loc[df.index,['Position','Count']].to_numpy() #num_clusters = 3 #kmeans = KMeans(n_clusters = num_clusters).fit(X) #labels = kmeans.labels_ #n_clusters_ = kmeans.cluster_centers_ # # ################# 1 #distortions = []
plt.close()
print('finished Part {}: data: {} PCA'.format(p, c))

######################
# K-Means Clustering Baseline
######################
fig = plt.figure(figsize=(17, 8))
fig.suptitle('Part: {} Clustering Baseline data: {}'.format(p, c), size=16)

# plot distortion: mean sum of squared distances to centers
ax1 = plt.subplot(1, 3, 1)
model = KMeans(n_init=1000)
visualizer = KElbowVisualizer(model, k=(2, 12), timings=False,
                              metric='distortion', ax=ax1)
visualizer.fit(data_p_Base[areas])  # Fit the data to the visualizer
#visualizer.show()

# plot silhouette: mean ratio of intra-cluster and nearest-cluster distance
ax2 = plt.subplot(1, 3, 2)
model = KMeans(n_init=1000)
visualizer = KElbowVisualizer(model, k=(2, 12), timings=False,
                              metric='silhouette', ax=ax2)
visualizer.fit(data_p_Base[areas])  # Fit the data to the visualizer
#visualizer.show()

# plot calinski_harabasz: ratio of within to between cluster dispersion
ax3 = plt.subplot(1, 3, 3)
model = KMeans(n_init=1000)
visualizer = KElbowVisualizer(model, k=(2, 12), timings=False,
                              metric='calinski_harabasz', ax=ax3)
visualizer.fit(np.array(data_p_Base[areas]))  # Fit the data to the visualizer
#visualizer.show()
#%%
# Instantiate the clustering model and visualizer
if ask_user('Determine optimal number of clusters with elbow'):
    print('Determining the optimal number of clusters for KMeans!')
    vectorizer = TfidfVectorizer(stop_words=my_stopwords, max_features=300)
    X = vectorizer.fit_transform(documentstxtclean)
    Xdf = pd.DataFrame(X.toarray())
    model = KMeans()
    visualizer = KElbowVisualizer(model,
                                  k=range(2, 51, 1),
                                  metric='calinski_harabasz',
                                  timings=False)
    visualizer.fit(Xdf)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
else:
    print('Optimal number of clusters for KMeans not determined!')

#%% Add the determined clusters to the incident data
# The elbow result shows a breakpoint at 10 and between 18 and 22
if ask_user('Add clusters to the data'):
    print('Cluster assignments are being added to the dataset!')
    sse = {}
    k = 12
    kmeans = KMeans(n_clusters=k, max_iter=50).fit(Xdf)
    data['clusters'] = kmeans.labels_
    sse[k] = kmeans.inertia_  # Inertia: Sum of distances of samples to their closest cluster center
else:
# clusters = 3
kmeans3 = KMeans(n_clusters=3, init='k-means++')
pred3 = kmeans3.fit_predict(selected)
print(pred3)
kmeans3.cluster_centers_

# clusters = 4
kmeans4 = KMeans(n_clusters=4, init='k-means++')
pred4 = kmeans4.fit_predict(selected)
kmeans4.cluster_centers_

# clusters = 5
kmeans5 = KMeans(n_clusters=5, init='k-means++')
pred5 = kmeans5.fit_predict(selected)
kmeans5.cluster_centers_

# correlation
c = selected.corr()

# visualization (one figure per labelling, so the plots don't overdraw)
array = selected.to_numpy()
plt.figure()
plt.scatter(array[:, 0], array[:, 1], c=pred3, cmap='rainbow')
plt.figure()
plt.scatter(array[:, 0], array[:, 1], c=pred4, cmap='rainbow')
plt.figure()
plt.scatter(array[:, 0], array[:, 1], c=pred5, cmap='rainbow')

# elbow with yellowbrick
from yellowbrick.cluster import KElbowVisualizer

model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 9))
visualizer.fit(array)
visualizer.show()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str,
                        required=True)  # directory we are reading
    parser.add_argument("--write_file", type=str, required=True)
    parser.add_argument("--stopwords", default=None,
                        type=str)  # stopwords file name
    parser.add_argument(
        "--min_threshold", type=int, default=10
    )  # cluster the word whose appearance count is larger than this threshold
    parser.add_argument(
        "--min_num_words", type=int, default=3
    )  # discard the word whose appearance count is less than this threshold
    parser.add_argument("--start", default=0, type=int)
    parser.add_argument("--end", default=-1, type=int)
    args = parser.parse_args()

    if args.stopwords:
        with open(args.stopwords, encoding="utf-8") as f:
            stopwords = f.readlines()
        stopwords = [st.strip() for st in stopwords]
    else:
        stopwords = []

    rootdir = args.input_file
    writefile = args.write_file
    list_dir = os.listdir(rootdir)
    if args.end == -1:
        args.end = len(list_dir)

    none_cluster = []
    discard_word = []
    for file_index, file_name in enumerate(tqdm(list_dir)):
        if not (file_index >= args.start and file_index < args.end):
            continue
        write_mode = "a"
        labels = []
        vectors = []
        if file_name[:-4] != '' and ".txt" in file_name:
            with open(os.path.join(rootdir, file_name)) as f:
                line = f.readline()
                while line:
                    line = line.split("\t")
                    labels.append(line[0])
                    vectors.append([
                        line[0],
                        np.array(line[1].split(" "), dtype='float'),
                        np.array(line[2].split(" "), dtype='float')
                    ])  # [[label, vector_src, vector_tgt]]
                    line = f.readline()
            if len(vectors) < args.min_num_words:
                discard_word.append(file_name[:-4])
                continue
            if len(vectors) <= args.min_threshold or checkspecial(
                    file_name[:-4]) or file_name[:-4] in stopwords:
                cluster_src, cluster_tgt, cluster_label, cluster_entropy = get_mean_vector(
                    vectors, labels)
            else:
                vectors_src_all = np.vstack(list(map(lambda x: x[1], vectors)))
                model = KElbowVisualizer(KMeans(), k=(1, 8))
                model.fit(vectors_src_all)
                if model.elbow_value_ is None:
                    none_cluster.append(file_name[:-4])
                    cluster_src, cluster_tgt, cluster_label, cluster_entropy = get_mean_vector(
                        vectors, labels)
                else:
                    cluster_src, cluster_tgt, cluster_label, cluster_entropy = get_muti_mean_vector(
                        vectors_src_all, vectors, labels, model.elbow_value_)
            write_file(cluster_src, cluster_tgt, cluster_label,
                       cluster_entropy, write_mode, file_name, writefile)
    print("Number of None in clustering:", len(none_cluster))
    print("Number of words that have been discarded:", len(discard_word))
    print("List of files that have not been clustered:", none_cluster)
    print("List of words that have been discarded:", discard_word)
def main():
    """
    Using k-means for some data exploration and a potential solution for the
    license prediction problem
    """
    os.chdir('../../../all_files_generated')
    current_dir = os.getcwd()

    data_pickles_dir = os.path.join(current_dir, 'data_pickles')
    elbow_method_files_dir = os.path.join(current_dir, 'elbow_method_files')

    x_train_path = os.path.join(data_pickles_dir, 'x_train.pickle')
    x_validation_path = os.path.join(data_pickles_dir, 'x_validation.pickle')
    x_test_path = os.path.join(data_pickles_dir, 'x_test.pickle')
    y_train_path = os.path.join(data_pickles_dir, 'y_train.pickle')
    y_validation_path = os.path.join(data_pickles_dir, 'y_validation.pickle')
    y_test_path = os.path.join(data_pickles_dir, 'y_test.pickle')

    # read in all pickle files that may be required
    with open(x_train_path, 'rb') as data:
        x_train = pickle.load(data)
    with open(x_validation_path, 'rb') as data:
        x_validation = pickle.load(data)
    with open(x_test_path, 'rb') as data:
        x_test = pickle.load(data)
    with open(y_train_path, 'rb') as data:
        y_train = pickle.load(data)
    with open(y_validation_path, 'rb') as data:
        y_validation = pickle.load(data)
    with open(y_test_path, 'rb') as data:
        y_test = pickle.load(data)

    # combine all datasets
    x_train = sparse.vstack(
        (x_train, x_validation, x_test))  # scipy.sparse.csr matrix
    # Series.append was removed in pandas 2.0; pd.concat is the replacement
    y_train = pd.concat(
        [y_train, pd.Series(y_validation), pd.Series(y_test)])  # pandas series

    use_yellowbrick = False
    if use_yellowbrick:
        license_classifier = KMeans()
        visualizer = KElbowVisualizer(license_classifier, k=(2, 100))
        visualizer.fit(x_train)
        visualizer.show()
    else:
        inertia = []
        k = range(2, 100)
        for i in k:
            license_classifier = KMeans(n_clusters=i)
            license_classifier.fit(x_train)
            inertia.append(license_classifier.inertia_)

        plt.plot(k, inertia)
        plt.xlabel('K')
        plt.ylabel('Inertia')
        plt.title('Elbow Method')
        elbow_method_path = os.path.join(
            elbow_method_files_dir, 'k_means_clustering_elbow_method.png')
        plt.savefig(elbow_method_path)
        plt.show()
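# A hedged alternative for the manual branch above: the third-party `kneed`
# package (an extra dependency, not used elsewhere in this file) locates the
# elbow programmatically from the inertia curve. Assumes `k` and `inertia`
# as computed in the else-branch above.
from kneed import KneeLocator

kl = KneeLocator(list(k), inertia, curve='convex', direction='decreasing')
print('elbow at k =', kl.elbow)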
# Instantiate the clustering model and visualizer
"""
finding optimal k using silhouette score
"""
# silhouette_score = {}
k_range = list(range(4, 20))  # specify the range of k
for k in k_range:
    clusterer = KMeans(n_clusters=k, init='k-means++', random_state=20)
    # compute cluster centers and predict the cluster index for each sample
    cluster_labels = clusterer.fit_predict(df_kmean.values)
    #print(cluster_labels)
    # the silhouette score gives the average value over all the samples;
    # this gives a perspective on density and separation of the formed clusters
    silhouette_avg = silhouette_score(df_kmean.values, cluster_labels)
    #silhouette_score[k] = silhouette_avg  # to store k and its corresponding silhouette score
    print(f"for k cluster={k}, the av silhouette_score is {silhouette_avg}")
# we can see that for k >= 13 we get a good score
# I will use k=15 as the elbow method, because when I compared the dissimilarity
# and similarity performance using optimal_k_compare, I saw that k=15 gives a
# better result
"""
finding optimal k using visualization (elbow)
"""
model = KMeans(init='k-means++')
visualizer = KElbowVisualizer(model, k=(4, 100), timings=False)  # k is the number of clusters
visualizer.fit(df_kmean.values)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure
plt.xlabel("Number of points in node (or index of point if no parenthesis).") plt.ylabel('Distance') plt.savefig('Figures/Cluster_analysis/Dendrogram_hierarch_clustering.png') plt.close() ## Elbow Plot '''Uses the Within-Cluster Sum-of-Squares and selects the model with the lowest sum of squares ''' ### Initialize model and load package from yellowbrick.cluster import KElbowVisualizer sns.set(style = 'whitegrid', font_scale = 1.5) model = KMeans() ### Make the Elbow plot visualizer = KElbowVisualizer(model, k=(2,15), timings= True, size = (1500,900)) visualizer.fit(df[vars_tot]) visualizer.show('Figures/Cluster_analysis/WCSS.png') plt.clf() '''OLD fig, ax = plt.subplots(figsize=(20,12)) ax.set(ylabel='Within-Cluster Sum-of-Squares (1e20)', xlabel = 'Number of Clusters') ax.plot(np.arange(2,12,1),intertia / 1e20) plt.xticks(np.arange(2,12,1)) plt.tight_layout() fig.savefig('Figures/Cluster_analysis/WCSS.png') ''' '''NOTE There is a clear elbow at Five clusters. Proceed. ''' ## Silhouette plot ### Initialize model
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans_model = kmeans.fit(df)
    ssd.append(kmeans_model.inertia_)

plt.plot(K, ssd, "bx-")
plt.xlabel("Residual sum of squared distances for different K values")
plt.title("Elbow Method for the Optimal Number of Clusters")
plt.show()
# When choosing the optimal number of clusters from the plot, focus on the
# sharpest bend (the elbow) and create that many clusters. (example = 3)

visu = KElbowVisualizer(kmeans, k=(2, 20))
visu_fit = visu.fit(df)
visu_fit.show()  # poof() is a deprecated alias of show()
# the visualizer picks the best number of clusters and marks it on the plot

# Final Model
kmeans = KMeans(n_clusters=4)
kmeans_model = kmeans.fit(df)
print(kmeans_model)
kumeler = kmeans_model.labels_
kume = pd.DataFrame({"Eyaletler": df.index, "Kumeler": kumeler})
print(kume)
df["Kume_No"] = kumeler
print(df)
from warnings import filterwarnings
filterwarnings('ignore')

from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
import pandas as pd

df = pd.read_csv("USArrests.csv", sep=',').copy()
df.index = df.iloc[:, 0]
df = df.iloc[:, 1:5]
# `del df.index.name` is not supported by newer pandas; assign None instead
df.index.name = None

kmeans = KMeans()
visualizer = KElbowVisualizer(kmeans, k=(2, 20))
visualizer.fit(df)
visualizer.show()  # poof() is a deprecated alias of show()

kmeans = KMeans(n_clusters=4)
k_fit = kmeans.fit(df)
kumeler = k_fit.labels_
pd.DataFrame({"Eyaletler": df.index, "Kumeler": kumeler})
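# A hedged refinement (not in the original): the USArrests variables sit on
# very different scales (Assault in the hundreds, Murder in single digits),
# so standardizing before K-Means usually gives more balanced distances.
from sklearn.preprocessing import StandardScaler

df_scaled = StandardScaler().fit_transform(df)
visualizer = KElbowVisualizer(KMeans(), k=(2, 20))
visualizer.fit(df_scaled)
visualizer.show()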
def cluster_category_data(df,
                          scale_data='minmax',
                          dim_red_method='som',
                          use_elbow_method=True,
                          cluster_method='hierarchical',
                          n_clusters=None,
                          verbose=1,
                          perplexity=None):
    """
    :param df: dataframe containing all the columns belonging to a category to be used in clustering
    :param scale_data: method to be used to scale the dataset
    :param dim_red_method: options are 'som', 'umap', 'tsne', None. If None, do clustering directly.
    :param use_elbow_method: if True, the elbow method is used to find the optimum number of
        clusters. If False, n_clusters needs to be specified
    :param cluster_method: options are 'kmeans' and 'hierarchical'. In either case kmeans is used
        for the elbow method (because of the time required).
    :param n_clusters: If use_elbow_method is False, n_clusters needs to be given.
    :param verbose: If True, output the progress in the clustering process
    :param perplexity: If the method used is TSNE, perplexity needs to be specified
    """
    t = time.time()
    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        X = df.values
    if verbose:
        print(f'number of features = {df.shape[1]}')

    if dim_red_method == 'som':
        if verbose:
            print(
                'Self Organising Maps is being used for dimensionality reduction...'
            )
        opt_k = 2
        max_s = -1
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X,
                                    normalization='var',
                                    initialization='pca',
                                    mapsize=mapsize)
            sm.train(n_job=1,
                     verbose=False,
                     train_rough_len=100,
                     train_finetune_len=500)
            if use_elbow_method:
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    # fall back to the k with the best silhouette score
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        if s > ms:
                            ms = s
                            elbow = k
            else:
                elbow = n_clusters
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            clabels = []
            for i in range(X.shape[0]):
                clabels.append(labels[x[i]])
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            max_s = max(s_score, max_s)
            if (max_s == s_score):
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            if (max_s > s_score):
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(
                f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s,3)}'
            )
            print(f'time taken = {round(time.time()-t,1)}')
        return opt_labels, opt_k

    elif dim_red_method:
        if dim_red_method == 'umap':
            print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2,
                                  n_neighbors=5,
                                  min_dist=0.0001,
                                  metric='euclidean',
                                  random_state=1,
                                  spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            print('t-SNE embedding is done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(elbow).fit_predict(embedding)
        if verbose:
            s_score = silhouette_score(X, opt_labels)
            print(
                f'number of clusters = {elbow} and silhouette_score = {s_score}'
            )
        return opt_labels, elbow

    else:
        if use_elbow_method:
model = KElbowVisualizer(KMeans(), k=20, timings=False) elbow = model.fit(X).elbow_value_ else: elbow = n_clusters if cluster_method == 'kmeans': opt_labels = KMeans(elbow).fit_predict(X) elif cluster_method == 'hierarchical': opt_labels = AgglomerativeClustering(elbow).fit_predict(X) print(f'silhouette score = {round(silhouette_score(X,opt_labels),3)}') return opt_labels, elbow
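# A minimal usage sketch for cluster_category_data (an assumption: `df_cat`
# is a hypothetical numeric dataframe for one category; dim_red_method=None
# exercises the direct-clustering branch above).
labels, k = cluster_category_data(df_cat,
                                  scale_data='standard',
                                  dim_red_method=None,
                                  use_elbow_method=True,
                                  cluster_method='kmeans')
print(f'found {k} clusters')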
def home2(request):
    global articles
    global sujets
    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    yesterday2 = today - datetime.timedelta(days=2)
    aujourd = '"' + str(today) + '"'
    yestday = '"' + str(yesterday) + '"'
    yestday2 = '"' + str(yesterday2) + '"'
    query = "today"
    url = "https://rapidapi.p.rapidapi.com/api/search/NewsSearchAPI"
    # accumulate results across the three dates; initializing the frame
    # inside the loop would keep only the last date's articles
    detaille = pd.DataFrame()
    for date in [aujourd, yestday, yestday2]:
        print(date)
        querystring = {
            "pageSize": "100",
            "q": query,
            "autoCorrect": "true",
            "pageNumber": "1",
            "toPublishedDate": "null",
            "withThumbnails": "true",
            "fromPublishedDate": date,
            "safeSearch": "true"
        }
        headers = {
            'x-rapidapi-host':
            "contextualwebsearch-websearch-v1.p.rapidapi.com",
            'x-rapidapi-key':
            "a089200dbamshd00bb86da392cd7p19dd23jsn694f8679b489"
        }
        response = requests.request("GET",
                                    url,
                                    headers=headers,
                                    params=querystring)
        detaille1 = json_normalize(response.json(), 'value')
        detaille = pd.concat([detaille, detaille1])
    detaille.reset_index(drop=True, inplace=True)
    detaille_article = detaille[~(detaille.id.duplicated())]
    detaille_article = detaille_article[~detaille_article.title.isna()]
    detaille_article = detaille_article[~(detaille_article.body.str.isspace())]
    # the title is included twice, presumably to up-weight it in the tf-idf
    detaille_article.loc[:, 'complet'] = detaille_article["title"] + " " + \
        detaille_article["title"] + " " + detaille_article['body']

    tfidf = TfidfVectorizer(tokenizer=extract_entite_nomme)
    dtm = tfidf.fit_transform(detaille_article.complet)
    x = dtm.toarray()

    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(2, 40))
    visualizer.fit(dtm)  # Fit the data to the visualizer
    visualizer.show()
    nombre_cluster = visualizer.elbow_value_

    k_means = KMeans(n_clusters=nombre_cluster, random_state=42)
    k_means.fit(dtm)
    all_data = [i for i in range(detaille_article.id.size)]
    m_clusters = k_means.labels_.tolist()
    centers = np.array(k_means.cluster_centers_)
    closest_data = []
    for i in range(nombre_cluster):
        center_vec = centers[i]
        data_idx_within_i_cluster = [
            idx for idx, clu_num in enumerate(m_clusters) if clu_num == i
        ]
        one_cluster_tf_matrix = np.zeros(
            (len(data_idx_within_i_cluster), centers.shape[1]))
        for row_num, data_idx in enumerate(data_idx_within_i_cluster):
            one_row = x[data_idx]
            one_cluster_tf_matrix[row_num] = one_row
        closest, _ = pairwise_distances_argmin_min([center_vec],
                                                   one_cluster_tf_matrix)
        closest_idx_in_one_cluster_tf_matrix = closest[0]
        closest_data_row_num = data_idx_within_i_cluster[
            closest_idx_in_one_cluster_tf_matrix]
        data_id = all_data[closest_data_row_num]
        closest_data.append(data_id)
    closest_data = list(set(closest_data))
    detaille_article['id_cluster'] = k_means.labels_
    entities = {}
    for k in detaille_article.groupby("id_cluster").count().id.nlargest(
            20).index:
        for i in range(nombre_cluster):
            if (detaille_article.loc[closest_data[i], 'id_cluster'] == k):
                doc = nlp(detaille_article.loc[closest_data[i], 'title'])
                entity = "nothing"
                nombre_entity = 0
                if not (doc.ents):
                    doc = nlp(detaille_article.loc[closest_data[i], 'body'])
                for ent in doc.ents:
                    if ((len(ent.text) > 2) & (ent.label_ not in [
                            'DATE', 'TIME', 'CARDINAL', 'ORDINAL', 'PERCENT',
                            'QUANTITY'
                    ])):
                        if (detaille_article[detaille_article.loc[:,
                                                                  'id_cluster']
                                             == k].complet.str.contains(
                                                 ent.text,
                                                 flags=re.IGNORECASE,
                                                 regex=True).sum() >
                                nombre_entity):
                            entity = ent.text
                            nombre_entity = detaille_article[
                                detaille_article.loc[:, 'id_cluster'] ==
                                k].body.str.contains(ent.text,
                                                     flags=re.IGNORECASE,
                                                     regex=True).sum()
                if entity != 'nothing':
entities[k] = entity detaille_article.sort_values("datePublished", axis=0, ascending=False, inplace=True) detaille_article.rename(columns={ 'image.url': 'thumbnail', 'provider.name': 'source' }, inplace=True) articles = detaille_article sujets = entities print(entities) json_records = detaille_article.reset_index().to_json(orient='records') data = [] data = json.loads(json_records) context = {'d': data, 'e': entities} return render(request, 'html2.html', context)
def update_graph(element_column, data_dict, selectedData, mode): if element_column is None: raise dash.exceptions.PreventUpdate if selectedData != None and mode == 'select-mode': df = pd.DataFrame.from_dict(data_dict, 'columns') x, y = vislogprob.logprob(df[element_column]) X = np.array([x,y]) visualizer = KElbowVisualizer(KMeans(), k=(1, 8)) visualizer.fit(X.transpose()) originalData = pd.DataFrame() originalData.insert(0, 'Relative Frequency (%)', x) originalData.insert(1, 'Value', y[::-1]) selected_x = [] for point in selectedData['points']: selected_x.append(point['x']) max_prob = np.max(selected_x)*0.01 originalData['Class'] = originalData.apply( lambda row: 'Anomalous Sample' if row['Relative Frequency (%)'] <= max_prob else 'Background Sample', axis=1) probgraf_fig = px.scatter(x=originalData['Relative Frequency (%)']*100, y=originalData.Value, color=originalData.Class, log_y=True, log_x=True, labels={'x':'Relative Frequency (%) ', 'y':str(element_column)+''}) probgraf_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h", legend=dict(x=-.1, y=1.2)) cluster_fig = px.line(x=visualizer.k_values_, y=visualizer.k_scores_, labels={'x':'Number of K clusters', 'y':'Distortion Score'}, range_y=[-5, np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_)/3]) cluster_fig.update_traces(mode="markers+lines", hovertemplate=None) cluster_fig.add_shape(dict(type='line', x0=visualizer.elbow_value_, y0=-np.mean(visualizer.k_scores_), x1=visualizer.elbow_value_, y1=np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_), line=dict(dash='dashdot', color='#EF553B'))) cluster_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h") merged_df = df.merge(originalData, left_on=element_column, right_on='Value') merged_df = merged_df.drop(axis=1, labels=['Value', 'Relative Frequency (%)']) merged_df.drop_duplicates(inplace=True) merged_df.sort_values(axis=0, by=element_column, inplace=True) cluster_columns = [{"name": i, "id": i} for i in merged_df.columns] return probgraf_fig, cluster_fig, merged_df.to_dict('records'), cluster_columns else: df = pd.DataFrame.from_dict(data_dict, 'columns') x, y = vislogprob.logprob(df[element_column]) X = np.array([x,y]) visualizer = KElbowVisualizer(KMeans(), k=(1, 8)) visualizer.fit(X.transpose()) df_clustered = vislogprob.clustered_df(X.transpose(), visualizer.elbow_value_) probgraf_fig = px.scatter(x=df_clustered['Relative Frequency (%)'], y=df_clustered.Value, color=df_clustered.Class, log_y=True, log_x=True, labels={'x':'Relative Frequency (%) ', 'y':str(element_column)+''}) probgraf_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h", legend=dict(x=-.1, y=1.2)) cluster_fig = px.line(x=visualizer.k_values_, y=visualizer.k_scores_, labels={'x':'Number of K clusters', 'y':'Distortion Score'}, range_y=[-5, np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_)/3]) cluster_fig.update_traces(mode="markers+lines", hovertemplate=None) cluster_fig.add_shape(dict(type='line', x0=visualizer.elbow_value_, y0=-np.mean(visualizer.k_scores_), x1=visualizer.elbow_value_, y1=np.max(visualizer.k_scores_)+np.mean(visualizer.k_scores_), line=dict(dash='dashdot', color='#EF553B'))) cluster_fig.update_layout(margin={'l': 10, 'b': 10, 't': 10, 'r': 10}, paper_bgcolor='#f9f9f9', legend_orientation="h") merged_df = df.merge(df_clustered, left_on=element_column, right_on='Value') merged_df = merged_df.drop(axis=1, 
labels=['Value', 'Relative Frequency (%)']) merged_df.drop_duplicates(inplace=True) merged_df.sort_values(axis=0, by=element_column, inplace=True) cluster_columns = [{"name": i, "id": i} for i in merged_df.columns] return probgraf_fig, cluster_fig, merged_df.to_dict('records'), cluster_columns
df['a'] = df['a'].astype(object)
dummies = pd.get_dummies(df['a'], prefix='a')
bcd = df.iloc[:, 2:5]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(bcd)
X_scaled = pd.DataFrame(x_scaled, columns=bcd.columns)
X_scaled = pd.concat([X_scaled, dummies], axis=1)

# Elbow method, variant 1 (yellowbrick)
plt.figure(figsize=(12, 9))
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 5))
visualizer.fit(X_scaled)
visualizer.show()

# Elbow method, variant 2 (manual)
SSE = []  # sum of squared errors for each k
for k in range(1, 5):
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(X_scaled)
    SSE.append(estimator.inertia_)  # inertia_ is the total within-cluster SSE
X = range(1, 5)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X, SSE, 'o-')
plt.show()

model = MiniBatchKMeans(n_clusters=2)
np.random.seed(5) X = np.array(read_csv('lab2.xlsx', 0)) # Y = np.array([i for i in range(1, 57)]) # normalize the data attributes normalized_X = MinMaxScaler().fit_transform(X) pca = PCA(n_components=2) pca.fit(normalized_X) pca_X = pca.transform(normalized_X) print("pca_X") print(pca_X) model = KMeans() visualizer = KElbowVisualizer(model, k=(2, 12)) visualizer.fit(pca_X) # visualizer.show() kmeans = KMeans(n_clusters=visualizer.elbow_value_) kmeans.fit(pca_X) y_kmeans = kmeans.predict(pca_X) print(len(y_kmeans)) plt.subplot(121) plt.scatter(pca_X[:, 0], pca_X[:, 1], c=y_kmeans, cmap="viridis") plt.title("K-means") # plt.show() # kmeans = KMeans(pca_X) # kmeans.run() # kmeans.plot(column_1_number=0, column_2_number=1)
#print(df.head(10))
for k in range(2, 15):
    kmeans = cluster.KMeans(n_clusters=k)
    y_km = kmeans.fit_predict(x_scaled)  # fit once and predict
    clusters = kmeans.cluster_centers_
    #print(clusters)
    unique, counts = np.unique(y_km, return_counts=True)
    print(f"Cluster counts for k = {k}: ", dict(zip(unique, counts)))

model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 15))
visualizer.fit(x_scaled)
visualizer.show()

visualizer2 = KElbowVisualizer(model,
                               k=(2, 15),
                               metric='calinski_harabasz',
                               timings=False)
visualizer2.fit(x_scaled)
visualizer2.show()

#print(kmeans)
#labels = kmeans.predict(df)
#print(labels)
#df_final["cluster"] = labels.tolist()
#print(df_final)
df.head()
print(df.head())

# separate out data on a per-class basis
# (rows whose Label equals the literal string 'Label' are repeated header rows)
a = df.loc[df['Label'] != 'Label']
print("printing Classes from the CSV\n")
print(a['Label'].unique().tolist())

subset1 = df[df.Label == 'MalwareActivity']
subset2 = df.loc[df['Label'] == 'MalwareAttack']
subset3 = df.loc[df['Label'] == 'Benign']
print("printing subsets")

# list_classes is assumed to be defined earlier with the three class names
labellist = [subset1, subset2, subset3]
for ele, category in zip(labellist, list_classes):
    print(ele)
    series = ele.iloc[:, 0:4].values
    my_title = "Elbow Method for K-means clustering for {}".format(category)
    model = KMeans()
    visualizer = KElbowVisualizer(model, k=(1, 10), title=my_title)
    visualizer.fit(series)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure
# IU - International University of Applied Science
# Machine Learning - Unsupervised Machine Learning
# Course Code: DLBDSMLUSL01

# Elbow criterion

#%% import libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

#%% create sample data
X = np.random.rand(50, 2)
Y = 2 + np.random.rand(50, 2)
Z = np.concatenate((X, Y))

#%% create a k-Means model and an Elbow-Visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 8), timings=True)

#%% fit the visualizer and show the plot
visualizer.fit(Z)
visualizer.show()
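#%% a hedged follow-up (not in the original): read the located elbow back
# and fit the final model with it (elbow_value_ is None if no elbow found)
k_opt = visualizer.elbow_value_ or 2
final_model = KMeans(n_clusters=k_opt).fit(Z)
print('optimal k:', k_opt)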
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import KElbowVisualizer

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs,
                     n_samples=N_SAMPLES,
                     n_features=N_FEATURES,
                     shuffle=SHUFFLE)

if __name__ == '__main__':
    # Make 8 blobs dataset
    X, y = make_blobs(centers=8)

    # Instantiate the clustering model and visualizer
    visualizer = KElbowVisualizer(MiniBatchKMeans(), k=(4, 12))

    visualizer.fit(X)  # Fit the training data to the visualizer
    # Draw/show the data (poof() is a deprecated alias of show())
    visualizer.show(outpath="images/elbow.png")
from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
from yellowbrick.cluster import KElbowVisualizer

# prepare dataset, filling null values with 0
df = pd.read_excel("Deloitte Team 1 Debtor segmentation.xlsx")
df = df.fillna(0)
df_copy = df.copy()
df5 = pd.DataFrame(df,
                   columns=[
                       'Amount Due', 'Active Bucket', '30<60', '61<90',
                       '91<120', '121+'
                   ])

model = KModes()
visualizer = KElbowVisualizer(model, k=(1, 7))
visualizer.fit(df5)  # Fit the data to the visualizer
visualizer.show()

# binning by manually dividing the scale of amount into buckets and attaching labels to each bucket
df['Amount Due bin'] = pd.cut(df['Amount Due'],
                              [-1, 0, 1000, 3000, 5000, 10000, 50000, 1000000],
                              labels=[
                                  '0', '1-1000', '1000-3000', '3000-5000',
                                  '5000-10000', '10000-50000', '50000-1000000'
                              ])
df = df.drop('Amount Due', axis=1)
df['Active Bucket bin'] = pd.cut(df['Active Bucket'],
                                 [-1, 0, 1000, 3000, 5000, 10000, 20000],
import pandas as pd
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from matplotlib import pyplot as plt

data = pd.read_csv("ClusterPlot.csv")

model = KMeans()
elbow_visualizer = KElbowVisualizer(model, k=(1, 10), timings=False)
elbow_visualizer.fit(data)
print("Number of Clusters: ", elbow_visualizer.elbow_value_)
elbow_visualizer.show()

x = data.copy()
kmeans = KMeans(elbow_visualizer.elbow_value_)
clusters = x.copy()
clusters["cluster_pred"] = kmeans.fit_predict(x)  # fit once and predict

plt.scatter(data["V1"], data["V2"], c=clusters["cluster_pred"], cmap="rainbow")
plt.xlabel("V1")
plt.ylabel("V2")
plt.show()
def explore_KMeans_clustering(
    df,
    num_cols=None,
    n_clusters=range(3, 5),
    include_silhouette=True,
    include_PCA=True,
    random_state=None,
):
    """create, fit and plot KMeans clustering on the dataset

    Parameters
    ----------
    df : pandas.DataFrame
        the dataset, should be transformed with StandardScaler
    num_cols : list, optional
        list of numeric column names, in case of None, get all numeric columns
    n_clusters : list, optional
        list of n_clusters hyperparams, by default range(3, 5)
    include_silhouette : bool, optional
        whether Silhouette plots should be generated, by default True
    include_PCA : bool, optional
        whether PCA plots should be generated, by default True
    random_state : int, optional
        a number that determines random number generation for centroid
        initialization, by default None

    Returns
    -------
    dict
        a dictionary with key=type of plot, value=list of plots

    Examples
    -------
    >>> original_df = pd.read_csv("/data/menu.csv")
    >>> numeric_features = eda.get_numeric_columns(original_df)
    >>> numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
    >>> preprocessor = make_column_transformer(
    >>>     (numeric_transformer, numeric_features)
    >>> )
    >>> df = pd.DataFrame(
    >>>     data=preprocessor.fit_transform(original_df), columns=numeric_features
    >>> )
    >>> explore_KMeans_clustering(df)
    """
    if num_cols is None:
        num_cols = get_numeric_columns(df)
    else:
        _verify_numeric_cols(df, num_cols)

    x = df[num_cols]
    results = {}

    if 1 in n_clusters:
        raise ValueError("n_cluster cannot be 1")

    print("------------------------")
    print("K-MEANS CLUSTERING")
    print("------------------------")

    if len(n_clusters) > 1:
        print("Generating KElbow plot for KMeans.")
        # visualize using KElbowVisualizer
        kmeans = KMeans(random_state=random_state)
        plt.clf()
        fig, ax = plt.subplots()
        elbow_visualizer = KElbowVisualizer(kmeans, k=n_clusters, ax=ax)
        elbow_visualizer.fit(x)  # Fit the data to the visualizer
        elbow_visualizer.show()
        plt.close()
        elbow_visualizer.k = elbow_visualizer.elbow_value_  # fix printing issue
        results["KElbow"] = fig
    else:
        results["KElbow"] = None

    # visualize using SilhouetteVisualizer
    print("Generating Silhouette & PCA plots")
    silhouette_plots = []
    pca_plots = []
    for k in n_clusters:
        print(f"Number of clusters: {k}")
        kmeans = KMeans(k, random_state=random_state)
        if include_silhouette:
            fig, ax = plt.subplots()
            s_visualizer = SilhouetteVisualizer(kmeans,
                                                colors="yellowbrick",
                                                ax=ax)
            s_visualizer.fit(x)  # Fit the data to the visualizer
            s_visualizer.show()
            silhouette_plots.append(fig)
            plt.close()
        else:
            silhouette_plots.append(None)

        # PCA plots
        if include_PCA:
            labels = kmeans.fit_predict(x)
            pca_fig = plot_pca_clusters(x, labels, random_state=random_state)
            pca_plots.append(pca_fig)
        else:
            pca_plots.append(None)

    results["Silhouette"] = silhouette_plots
    results["PCA"] = pca_plots
    return results
dataset.describe(include="all")

features = dataset.iloc[:, 0:7]
target = dataset.iloc[:, -1]
'''
print('----- features')
print(features)
print('----- target')
print(target)
exit()
'''

model = KMeans()
visualizer = KElbowVisualizer(model, k=(1, 10))
visualizer.fit(features)  # Fit the data to the visualizer
visualizer.show()  # Draw/show the data (poof() is a deprecated alias)

kmeans = KMeans(n_clusters=3)
cluster_labels = kmeans.fit_predict(features)  # fit once and predict
kmeans.cluster_centers_

silhouette_avg = metrics.silhouette_score(features, cluster_labels)
print('silhouette coefficient for the above clustering = ', silhouette_avg)


def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
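    # Standard completion of the purity formula (an assumption; the original
    # snippet is truncated here). Assumes numpy is imported as np: purity is
    # the fraction of samples assigned to the majority true class of their
    # cluster.
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix)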
def elbow(): X, _ = make_blobs(centers=8, n_features=12, shuffle=True) oz = KElbowVisualizer(KMeans(), k=(4, 12), ax=newfig()) oz.fit(X) savefig(oz, "elbow")