# Assumed shared imports for the snippets below. Throughout this file the
# aliases appear to be: EM = sklearn.mixture.GaussianMixture, KM = KMeans,
# ICA = FastICA, RCA = a random-projection transformer, and
# sil_score = silhouette_score; the exact originals are not shown, so treat
# these as a best guess.
import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn import metrics
from sklearn.mixture import GaussianMixture as EM
from sklearn.cluster import KMeans
from sklearn.cluster import KMeans as KM
from sklearn.decomposition import PCA, FastICA as ICA
from sklearn.random_projection import GaussianRandomProjection as RCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (silhouette_score, silhouette_score as sil_score,
                             f1_score, homogeneity_score, completeness_score,
                             adjusted_mutual_info_score,
                             normalized_mutual_info_score)


def em(tx, ty, rx, ry, add="", times=5):
    errs = []

    # this is what we will compare to; GaussianMixture expects 2-D input,
    # so treat the labels as a single column
    ry = np.asarray(ry).reshape(-1, 1)
    checker = EM(n_components=2)
    checker.fit(ry)
    truth = checker.predict(ry)

    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}

        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set

        # Here we make the arguably awful assumption that, for a given cluster,
        # all values in that cluster "should", in a perfect world, belong to
        # one class or the other, meaning that, say, cluster "3" should really
        # be all 0s in our truth, or all 1s there.
        #
        # So clusters is a dict of lists, where each list contains all items
        # in a single cluster.
        for index, val in enumerate(result):
            clusters[val].append(index)

        # Then we take each cluster, find the mean of that cluster's
        # counterparts in our "truth", and round it to find out whether that
        # cluster should be a 1 or a 0.
        mapper = {
            x: round(sum(truth[v] for v in clusters[x]) /
                     float(len(clusters[x]))) if clusters[x] else 0
            for x in range(i)
        }

        # The processed list holds the results of this, so if cluster 3 was
        # found to be of value 1, then for each value in clusters[3],
        # processed[value] == 1 holds.
        processed = np.array([mapper[val] for val in result])
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))

    plot([0, times, min(errs) - .1, max(errs) + .1],
         [range(2, times), errs, "ro"],
         "Number of Clusters", "Error Rate",
         "Expectation Maximization Error", "EM" + add)

    # wrap the 1-D cluster predictions as column vectors so they can be
    # appended to the feature matrices
    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    newtx = np.append(tx, td, 1)
    newrx = np.append(rx, rd, 1)
    nn(newtx, ty, newrx, ry, add="onEM" + add)
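# The plot() helper called above is not defined in these snippets. A minimal
# sketch consistent with the call sites (axis bounds, one [x, y, style]
# series, axis labels, a title, and a file-name stem) might look like this;
# the exact original signature is an assumption.
def plot(axis, series, x_label, y_label, title, name):
    """Hypothetical helper: draw one series and save the figure as <name>.png."""
    plt.figure()
    plt.axis(axis)  # [xmin, xmax, ymin, ymax]
    plt.plot(series[0], series[1], series[2])
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.savefig(name + ".png")
    plt.close()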
def oneem(tx, ty, rx, ry, add="", times=5):
    scores = []
    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our data
    scores.append(clf.predict_proba(tx))
    scores.append(clf.predict_proba(rx))
    return scores
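# oneem() returns the soft cluster-membership probabilities for the training
# and test sets. One plausible use, mirroring the hard-label version in em()
# above, is to append them as extra features before retraining the network
# (nn() is the neural-net helper assumed by these snippets):
#
# train_proba, test_proba = oneem(tx, ty, rx, ry, times=5)
# newtx = np.append(tx, train_proba, 1)
# newrx = np.append(rx, test_proba, 1)
# nn(newtx, ty, newrx, ry, add="onEMproba")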
def part4_mnist():
    # relies on the tensorflow input_data import shown later in this file
    mnist = input_data.read_data_sets("data/")
    X = mnist.train.images
    y = mnist.train.labels

    # One cluster for each digit
    k = 10

    # Run the EM algorithm on the MNIST training images.
    expectation_maximization = EM(n_components=k,
                                  max_iter=10,
                                  init_params='kmeans',
                                  covariance_type='diag',
                                  verbose=1,
                                  verbose_interval=1).fit(X)
    means = expectation_maximization.means_
    covs = expectation_maximization.covariances_

    fig, ax = plt.subplots(1, k, figsize=(8, 1))
    for i in range(k):
        ax[i].imshow(means[i].reshape(28, 28), cmap='gray')
    plt.show()
    sample(means, covs, 0)
def myem(X, y, nameappendix, krange):
    for n_clusters in krange:
        fig = plt.gcf()
        ax = fig.add_subplot(111)

        # Initialize the clusterer with n_clusters value and a random
        # generator seed of 10 for reproducibility.
        clusterer = EM(n_components=n_clusters, random_state=10).fit(X)
        labels = clusterer.predict(X)
        print("NMI score: %.6f" % normalized_mutual_info_score(y, labels))

        # Plot showing the actual clusters formed
        colors = plt.get_cmap('Spectral')(labels.astype(float) / n_clusters)
        plt.scatter(X[:, 3], X[:, 5], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters: draw white circles at cluster centers
        centers = clusterer.means_
        plt.scatter(centers[:, 3], centers[:, 5], marker='o', c="white",
                    alpha=1, s=200, edgecolor='k')
        for i, c in enumerate(centers):
            ax.scatter(c[3], c[5], marker='$%d$' % i, alpha=1, s=50,
                       edgecolor='k')

        ax.set_title("Clustering Visualization")
        ax.set_xlabel("1st feature: Pressure X4")
        ax.set_ylabel("2nd feature: Pressure X5")
        plt.suptitle("Analysis for EM Clustering for " + str(n_clusters) +
                     " Clusters", fontsize=14, fontweight='bold')
        plt.savefig('img/em' + nameappendix + '.png')
        plt.show()
def em_experiment(X, y, title, folder=""):
    cluster_range = list(np.arange(2, 11, 1))
    sil_scores, homo_scores, sse_scores, ami_scores, bic_scores = (
        [] for i in range(5))
    completeness_scores = []

    for k in cluster_range:
        em = EM(n_components=k).fit(X)
        em_labels = em.predict(X)
        sil_scores.append(sil_score(X, em_labels))
        # Note: GaussianMixture.score() is the average log-likelihood,
        # not a sum of squared errors, despite the variable name.
        sse_scores.append(em.score(X))
        homo_scores.append(homogeneity_score(y, em_labels))
        completeness_scores.append(completeness_score(y, em_labels))
        ami_scores.append(adjusted_mutual_info_score(y, em_labels))
        bic_scores.append(em.bic(X))

    plt.plot(cluster_range, sil_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Score for EM: ' + title)
    plt.savefig(folder + '/EMSIL.png')
    plt.close()

    plt.plot(cluster_range, homo_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.savefig(folder + '/EMHOMOGENEITY.png')
    plt.close()

    plt.plot(cluster_range, completeness_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Completeness Score')
    plt.title('Completeness Score for EM: ' + title)
    plt.savefig(folder + '/EMCompletness.png')
    plt.close()

    plt.plot(cluster_range, sse_scores)
    plt.xlabel('No. Components')
    plt.ylabel('Avg Log-Likelihood')
    plt.title('Log-Likelihood Scores EM: ' + title)
    plt.savefig(folder + '/EMSSE.png')
    plt.close()

    plt.plot(cluster_range, ami_scores)
    plt.xlabel('No. Components')
    plt.ylabel('AMI Score')
    plt.title('Adjusted Mutual Information Scores EM: ' + title)
    plt.savefig(folder + '/EMAMI.png')
    plt.close()

    plt.plot(cluster_range, bic_scores)
    plt.xlabel('No. Components')
    plt.ylabel('BIC Score')
    plt.title('BIC Scores EM: ' + title)
    plt.savefig(folder + '/EMBIC.png')
    plt.close()
def run_EM(X, y, title):
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Silhouette')
    plt.title(title + ' Exp Max Silhouette')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('F1 Score')
    plt.title(title + ' Exp Max F1')
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()
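# cluster_predictions() is assumed by run_EM() above but not defined in these
# snippets. From its use (turning unsupervised cluster labels into class
# predictions scorable with f1_score, via the "y_mode_vote" name), a
# majority-vote sketch:
def cluster_predictions(y, cluster_labels):
    """Hypothetical helper: map each cluster to its most common true label."""
    y = np.asarray(y).ravel()
    pred = np.zeros_like(y)
    for cluster in np.unique(cluster_labels):
        mask = cluster_labels == cluster
        # every point in this cluster predicts the cluster's modal class
        values, counts = np.unique(y[mask], return_counts=True)
        pred[mask] = values[np.argmax(counts)]
    return pred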
def credit_risk_data():
    data_X = credit_data.drop([
        'credit_amount', 'other_parties', 'purpose', 'own_telephone',
        'foreign_worker'
    ], axis=1)
    data_y = credit_data[['class']]
    features_to_encode = [
        'personal_status', 'checking_status', 'credit_history',
        'savings_status', 'employment', 'property_magnitude',
        'other_payment_plans', 'housing', 'job', 'class'
    ]
    enc = my_encoder()
    enc.fit(data_X, features_to_encode)
    X_train = enc.transform(data_X)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Credit Data")
    run_ICA(X_train, "Credit Data")
    run_RCA(X_train, "Credit Data")

    pca_credit = PCA(n_components=3, random_state=5).fit_transform(X_train)
    ica_credit = ICA(n_components=2, random_state=5).fit_transform(X_train)
    rca_credit = RCA(n_components=29, random_state=5).fit_transform(X_train)

    run_kmeans(pca_credit, X_train, "KMEANS")
    run_kmeans(ica_credit, X_train, "KMEANS")
    run_kmeans(rca_credit, X_train, "KMEANS")
    run_EM(pca_credit, X_train, 'PCA Credit Risk Data')
    run_EM(ica_credit, X_train, 'ICA Credit Risk Data')
    run_EM(rca_credit, X_train, 'RCA Credit Risk Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)
    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    print('Silhouette Score: %.3f' % score)
    # kmeans_silhoutte_analysis(X_train)
    elbow_function(X_train)
    run_kmeans(X_train, y_km, "KMEANS")

    em = EM(n_components=2, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")
    # evaluate_EM(em, X_train, y_em)

    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y,
                                                        test_size=0.2,
                                                        random_state=0)
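# my_encoder is assumed above but not defined in these snippets. A minimal
# sketch consistent with its fit/transform usage (one-hot encode the named
# categorical columns and pass the rest through) could be:
class my_encoder:
    """Hypothetical helper: one-hot encode selected columns of a DataFrame."""

    def fit(self, df, features_to_encode):
        self.features_to_encode = [c for c in features_to_encode
                                   if c in df.columns]
        # remember the full dummy-encoded column set for consistent transforms
        self.columns = pd.get_dummies(df,
                                      columns=self.features_to_encode).columns
        return self

    def transform(self, df):
        out = pd.get_dummies(df, columns=self.features_to_encode)
        # align to the fit-time columns, filling unseen dummies with 0
        return out.reindex(columns=self.columns, fill_value=0)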
def chess_game_data():
    data_X = game_data.drop([
        'id', 'created_at', 'increment_code', 'black_id', 'white_id', 'moves'
    ], axis=1)
    data_y = game_data[['winner']]
    gd = data_X[:1000]
    features_to_encode = [
        'rated', 'victory_status', 'winner', 'opening_eco', 'opening_name'
    ]
    enc = my_encoder()
    enc.fit(gd, features_to_encode)
    X_train = enc.transform(gd)
    # X_test = enc.transform(X_test)

    run_PCA(X_train, "Chess Data")
    run_ICA(X_train, "Chess Data")
    run_RCA(X_train, "Chess Data")

    pca_chess = PCA(random_state=5).fit_transform(X_train)
    # ica_chess = ICA(random_state=5).fit_transform(X_train)
    rca_chess = RCA(n_components=60, random_state=5).fit_transform(X_train)

    run_kmeans(pca_chess, X_train, "KMEANS")
    # run_kmeans(ica_chess, X_train, "KMEANS")
    run_kmeans(rca_chess, X_train, "KMEANS")
    run_EM(pca_chess, X_train, 'PCA Chess Game Data')
    # run_EM(ica_chess, X_train, 'ICA Chess Game Data')
    run_EM(rca_chess, X_train, 'RCA Chess Game Data')

    km = KMeans(n_clusters=3, random_state=0)
    y_km = km.fit_predict(X_train)
    score = silhouette_score(X_train, km.labels_, metric='euclidean')
    print('Silhouette Score: %.3f' % score)
    # kmeans_silhoutte_analysis(X_train)
    run_kmeans(X_train, y_km, "KMEANS")
    elbow_function(X_train)

    em = EM(n_components=4, covariance_type='spherical', random_state=100)
    y_em = em.fit_predict(X_train)
    plot_EM(em, X_train)
    run_EM(X_train, y_em, "EM")

    X_train, X_test, y_train, y_test = train_test_split(data_X, data_y,
                                                        test_size=0.2,
                                                        random_state=0)
def em_analysis():
    X_p, Y_p, _ = get_phishing_data()
    # run_EM(X_p, Y_p, 'Phishing Data')
    em = EM(n_components=30, covariance_type='diag', n_init=1,
            warm_start=True, random_state=100)
    evaluate_EM(em, X_p, Y_p)
    df = pd.DataFrame(em.means_)
    df.to_csv("Phishing EM Component Means.csv")

    X_v, Y_v, _ = get_vocal_data()
    # run_EM(X_v, Y_v, 'Vocal Data')
    em = EM(n_components=52, covariance_type='diag', n_init=1,
            warm_start=True, random_state=100)
    evaluate_EM(em, X_v, Y_v)
    df = pd.DataFrame(em.means_)
    df.to_csv("Vocal EM Component Means.csv")
def main():
    df = pd.read_csv("../Dataset/winequality-white.csv", delimiter=";")
    seed = 200
    np.random.seed(seed)

    # binarize quality: <= 6 -> 0, > 6 -> 1
    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1

    X = np.array(df.iloc[:, 0:-1])
    wine_Y = np.array(df.iloc[:, -1])
    standardScalerX = StandardScaler()
    wine_x = standardScalerX.fit_transform(X)

    # k-means labels as an extra feature for the neural net
    km = KMeans(n_clusters=2, random_state=200).fit(wine_x)
    km_labels = km.labels_
    df['KM'] = km_labels
    df = df.drop(columns='quality')
    knn_X = np.array(df.values, dtype='int64')
    X_train, X_test, y_train, y_test = train_test_split(np.array(knn_X),
                                                        np.array(wine_Y),
                                                        test_size=0.30)
    learner = MLPClassifier(hidden_layer_sizes=(22,), activation='relu',
                            learning_rate_init=0.0051, random_state=seed)
    evaluate(learner, X_train, X_test, y_train, y_test, title="Kmeans.png")

    # EM labels as an extra feature for the neural net
    em = EM(n_components=2, random_state=200).fit(wine_x)
    em_labels = em.predict(wine_x)
    df['EM'] = em_labels
    df = df.drop(columns='KM')
    em_X = np.array(df.values, dtype='int64')
    X_train, X_test, y_train, y_test = train_test_split(np.array(em_X),
                                                        np.array(wine_Y),
                                                        test_size=0.30)
    learner = MLPClassifier(hidden_layer_sizes=(22,), activation='relu',
                            learning_rate_init=0.0051, random_state=seed)
    evaluate(learner, X_train, X_test, y_train, y_test, title="EM.png")
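# evaluate() is assumed above but not defined in these snippets. A minimal
# sketch matching the call signature (train the learner, report test metrics,
# and save a figure under the given title) might be:
def evaluate(learner, X_train, X_test, y_train, y_test, title="learner.png"):
    """Hypothetical helper: fit, score, and save a confusion-matrix plot."""
    learner.fit(X_train, y_train)
    y_pred = learner.predict(X_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print("Accuracy: {:.3f}  F1: {:.3f}".format(acc, f1))
    cm = metrics.confusion_matrix(y_test, y_pred)
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.savefig(title)
    plt.close()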
def run_EM(X, y, title):
    kdist = list(np.arange(2, 100, 5))
    sil_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='spherical',
                random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        # y_mode_vote = cluster_predictions(y, labels)
        # f1_scores.append(f1_score(y, y_mode_vote))
        # homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Silhouette Analysis for EM: ' + title)
    plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
def plotIt():
    iris = sklearn.datasets.load_iris()
    X = iris['data'][:, 0:2]  # reduce dimensions so we can plot what happens
    k = 3
    means, covs, priors, llh = em_algorithm(X, 3, 100, 0.001)

    fig, ax = plt.subplots(1, 1, figsize=(8, 3))
    llhs = []
    for i in range(1):
        _, _, _, llh = em_algorithm(X, 3, 100)
        llhs.append(llh)
    ax.plot(llhs, 'bx')
    fig.canvas.draw()
    # plt.show()

    expectation_maximization = EM(n_components=3,
                                  init_params='random',
                                  covariance_type='diag',
                                  verbose=2,
                                  verbose_interval=1).fit(X)
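# em_algorithm() is a hand-rolled EM routine used above but not included in
# these snippets. A minimal diagonal-covariance Gaussian-mixture sketch with a
# matching signature (returning means, covariances, priors, and the final
# log-likelihood) could be:
from scipy.stats import multivariate_normal


def em_algorithm(X, k, max_iter, tol=1e-3):
    """Hypothetical EM for a k-component diagonal-covariance GMM."""
    n, d = X.shape
    rng = np.random.default_rng(0)
    means = X[rng.choice(n, k, replace=False)]  # init at random data points
    covs = np.ones((k, d)) * X.var(axis=0)      # diagonal covariances
    priors = np.full(k, 1.0 / k)
    llh = -np.inf

    for _ in range(max_iter):
        # E-step: responsibilities r[i, j] = P(component j | point i)
        dens = np.column_stack([
            priors[j] * multivariate_normal.pdf(X, means[j], np.diag(covs[j]))
            for j in range(k)])
        total = dens.sum(axis=1, keepdims=True)
        r = dens / total
        new_llh = np.log(total).sum()

        # M-step: re-estimate parameters from the responsibilities
        nk = r.sum(axis=0)
        means = (r.T @ X) / nk[:, None]
        for j in range(k):
            diff = X - means[j]
            covs[j] = (r[:, j, None] * diff ** 2).sum(axis=0) / nk[j] + 1e-6
        priors = nk / n

        if new_llh - llh < tol:  # converged
            llh = new_llh
            break
        llh = new_llh

    return means, covs, priors, llh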
def run_EM(X, y, title):
    # kdist = [2,3,4,5]
    # kdist = list(range(2,51))
    kdist = list(np.arange(2, 20, 2))
    sil_scores = []
    f1_scores = []
    homo_scores = []
    train_times = []
    aic_scores = []
    bic_scores = []

    for k in kdist:
        start_time = timeit.default_timer()
        em = EM(n_components=k, covariance_type='diag', n_init=1,
                warm_start=True, random_state=100).fit(X)
        end_time = timeit.default_timer()
        train_times.append(end_time - start_time)

        labels = em.predict(X)
        sil_scores.append(sil_score(X, labels))
        y_mode_vote = cluster_predictions(y, labels)
        f1_scores.append(f1_score(y, y_mode_vote))
        homo_scores.append(homogeneity_score(y, labels))
        aic_scores.append(em.aic(X))
        bic_scores.append(em.bic(X))

    # elbow curve for silhouette score
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, sil_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Avg Silhouette Score')
    plt.title('Elbow Plot for EM: ' + title)
    plt.show()

    # plot homogeneity scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, homo_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Homogeneity Score')
    plt.title('Homogeneity Scores EM: ' + title)
    plt.show()

    # plot f1 scores
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, f1_scores)
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores EM: ' + title)
    plt.show()

    # plot model AIC and BIC
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('No. Distributions')
    plt.ylabel('Model Complexity Score')
    plt.title('EM Model Complexity: ' + title)
    plt.legend(loc="best")
    plt.show()
print("Log-likelihood Lower Bound: {:.2f}".format(em.lower_bound_)) print("F1 Score: "+"{:.2f}".format(f1)) print("Accuracy: "+"{:.2f}".format(accuracy)+" AUC: "+"{:.2f}".format(auc)) print("Precision: "+"{:.2f}".format(precision)+" Recall: "+"{:.2f}".format(recall)) print("*****************************************************") plt.figure() plot_confusion_matrix(cm, classes=["0","1"], title='Confusion Matrix') plt.show() # In[67]: ttX,ttY,bankX,bankY = import_data() run_EM(ttX,ttY,'Titanic Data') em = EM(n_components=24,covariance_type='diag',n_init=1,warm_start=True,random_state=100) evaluate_EM(em,ttX,ttY) df = pd.DataFrame(em.means_) df.to_csv("Titanic EM Component Means.csv") # In[37]: ttX,ttY,bankX,bankY = import_data() X_train, X_test, y_train, y_test = train_test_split(np.array(bankX),np.array(bankY), test_size=0.25) run_EM(X_train,y_train,'Banking Data') em = EM(n_components=41,covariance_type='diag',n_init=1,warm_start=True,random_state=100) evaluate_EM(em,bankX,bankY) df = pd.DataFrame(em.means_) df.to_csv("Banking EM Component Means.csv")
def main():
    seed = 200
    df = pd.read_csv("../Dataset/winequality-white.csv", delimiter=";")
    np.random.seed(seed)

    # load wine data and binarize quality
    lowquality = df.loc[df['quality'] <= 6].index
    highquality = df.loc[df['quality'] > 6].index
    df.iloc[lowquality, df.columns.get_loc('quality')] = 0
    df.iloc[highquality, df.columns.get_loc('quality')] = 1
    X = np.array(df.iloc[:, 0:-1])
    wine_Y = np.array(df.iloc[:, -1])
    standardScalerX = StandardScaler()
    wine_x = standardScalerX.fit_transform(X)

    # run k-means to find the best k
    kmeans_experiment(wine_x, wine_Y, 'Wine Data', folder="part1_wineplots")

    # Plot k-means wine clusters
    reduced_data = PCA(n_components=2).fit_transform(wine_x)
    kmeans = KMeans(init='k-means++', n_clusters=2, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in the mesh using the last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Plot the centroids as a white X (k-means only)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
                linewidths=3, color='w', zorder=10)
    plt.title('K-means clustering on the wine dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_wineplots/kmeans_cluster.png')
    plt.close()

    # run EM to find the best number of components
    em_experiment(wine_x, wine_Y, 'Wine Data', folder="part1_wineplots")

    # Plot EM wine clusters (the estimator is EM despite the variable name)
    reduced_data = PCA(n_components=2).fit_transform(wine_x)
    kmeans = EM(n_components=2, n_init=10)
    kmeans.fit(reduced_data)
    h = .02
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    plt.title('EM clustering on the wine dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_wineplots/em_cluster.png')
    plt.close()

    # Load digits
    df_digits = pd.read_csv("../Dataset/pendigits.csv", header=None)
    np.random.seed(seed)
    X = np.array(df_digits.iloc[:, 0:-1])
    Y = np.array(df_digits.iloc[:, -1])
    standardScalerX = StandardScaler()
    digits_x = standardScalerX.fit_transform(X)

    # run k-means to find the best k
    kmeans_experiment(digits_x, Y, 'Digits Data', folder="part1_digitsplots")

    # Plot k-means digit clusters
    reduced_data = PCA(n_components=2).fit_transform(digits_x)
    kmeans = KMeans(init='k-means++', n_clusters=9, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
                linewidths=3, color='w', zorder=10)
    plt.title('K-means clustering on the digits dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_digitsplots/kmeans_cluster.png')
    plt.close()

    # run EM to find the best number of components
    em_experiment(digits_x, Y, 'Digits Data', folder="part1_digitsplots")

    # Plot EM digit clusters
    reduced_data = PCA(n_components=2).fit_transform(digits_x)
    kmeans = EM(n_components=8, n_init=10)
    kmeans.fit(reduced_data)
    h = .02
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    plt.title('EM clustering on the digits dataset')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.savefig('part1_digitsplots/em_cluster.png')
    plt.close()
"{:.2f}".format(auc)) print("Precision: " + "{:.2f}".format(precision) + " Recall: " + "{:.2f}".format(recall)) print("*****************************************************") plt.figure() plot_confusion_matrix(cm, classes=["0", "1"], title='Confusion Matrix') plt.show() # In[ ]: phishX, phishY, bankX, bankY = import_data() run_EM(phishX, phishY, 'Phishing Data') em = EM(n_components=24, covariance_type='diag', n_init=1, warm_start=True, random_state=100) evaluate_EM(em, phishX, phishY) df = pd.DataFrame(em.means_) df.to_csv("Phishing EM Component Means.csv") # In[ ]: phishX, phishY, bankX, bankY = import_data() X_train, X_test, y_train, y_test = train_test_split(np.array(bankX), np.array(bankY), test_size=0.25) run_EM(X_train, y_train, 'Banking Data') em = EM(n_components=41, covariance_type='diag',
from tensorflow.examples.tutorials.mnist import input_data
from scipy.stats import multivariate_normal
import numpy as np

mnist = input_data.read_data_sets("data/")
X = mnist.train.images
y = mnist.train.labels

# One cluster for each digit
k = 10

# Run the EM algorithm on the MNIST training images.
expectation_maximization = EM(n_components=k,
                              max_iter=10,
                              init_params='kmeans',
                              covariance_type='diag',
                              verbose=1,
                              verbose_interval=1).fit(X)
means = expectation_maximization.means_
covs = expectation_maximization.covariances_

fig, ax = plt.subplots(1, k, figsize=(8, 1))
for i in range(k):
    ax[i].imshow(means[i].reshape(28, 28), cmap='gray')
plt.show()


def sample(means, covs, num):
    # The body of sample() is cut off in the source. A plausible
    # reconstruction, given the diagonal covariances fitted above: draw one
    # image from component `num` and display it.
    draw = multivariate_normal.rvs(mean=means[num], cov=np.diag(covs[num]))
    plt.imshow(draw.reshape(28, 28), cmap='gray')
    plt.show()
def em(x, n_classes, min_iterations=25):
    # n_init is the number of EM restarts; the best fit is kept
    em = EM(n_components=n_classes, n_init=min_iterations).fit(x)
    return em.predict(x)
def em(tx, ty, rx, ry, reduced_data, add="", times=5, dataset="", alg=""):
    clf = EM(n_components=times)
    clf.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # centroids = clf.cluster_centers_
    # plt.scatter(centroids[:, 0], centroids[:, 1],
    #             marker='x', s=169, linewidths=3,
    #             color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()

    clf = EM(n_components=times)
    clf.fit(tx)  # fit it to our data
    test = clf.predict(tx)
    result = clf.predict(rx)

    checker = EM(n_components=times)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)

    td = np.reshape(test, (test.size, 1))
    rd = np.reshape(result, (result.size, 1))
    # newtx = np.append(tx, td, 1)
    # newrx = np.append(rx, rd, 1)
    myNN(test, ty, result, ry, alg="EM_" + alg)

    errs = []
    scores = []

    # this is what we will compare to
    checker = EM(n_components=2)
    ry = ry.reshape(-1, 1)
    checker.fit(ry)
    truth = checker.predict(ry)

    adj_rand = []
    v_meas = []
    mutual_info = []
    adj_mutual_info = []

    # so we do this a bunch of times
    for i in range(2, times):
        clusters = {x: [] for x in range(i)}

        # create a clusterer
        clf = EM(n_components=i)
        clf.fit(tx)  # fit it to our data
        test = clf.predict(tx)
        result = clf.predict(rx)  # and test it on the testing set

        for index, val in enumerate(result):
            clusters[val].append(index)

        mapper = {
            x: round(sum(truth[v] for v in clusters[x]) /
                     float(len(clusters[x]))) if clusters[x] else 0
            for x in range(i)
        }
        processed = np.array([mapper[val] for val in result])
        errs.append(sum((processed - truth) ** 2) / float(len(ry)))
        scores.append(clf.score(tx, ty))
        adj_rand.append(metrics.adjusted_rand_score(ry.ravel(), result))
        v_meas.append(metrics.v_measure_score(ry.ravel(), result))
        mutual_info.append(metrics.fowlkes_mallows_score(ry.ravel(), result))
        adj_mutual_info.append(metrics.homogeneity_score(ry.ravel(), result))

    # plot([0, times, min(scores) - .1, max(scores) + .1],
    #      [range(2, times), scores, "-"],
    #      "Number of Clusters", "Log Likelihood",
    #      dataset + ": EM Log Likelihood - " + alg, dataset + "_EM_" + alg)

    # other metrics
    plt.figure()
    plt.title(dataset + ": EM Clustering measures - " + alg)
    plt.xlabel('Number of clusters')
    plt.ylabel('Score value')
    plt.plot(range(2, times), adj_rand, label="Adjusted Random")
    plt.plot(range(2, times), v_meas, label="V Measure")
    plt.plot(range(2, times), mutual_info, label="Fowlkes Mallows Score")
    plt.plot(range(2, times), adj_mutual_info, label="Homogeneity Score")
    plt.legend()
    plt.savefig("EMMetrics" + dataset + "_" + alg + ".png")

    kmeans = KM(n_clusters=2)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired, aspect='auto', origin='lower')
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=169,
                linewidths=3, color='w', zorder=10)
    plt.title(dataset + ': EM clustering (' + alg + '-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
def dimensionality_reduction_analysis():
    X_p, Y_p, df_phish = get_phishing_data()
    run_PCA(X_p, Y_p, "Phishing Data")
    run_ICA(X_p, Y_p, "Phishing Data")
    run_RCA(X_p, Y_p, "Phishing Data")
    imp_phish, topcols_phish = run_RFC(X_p, Y_p, df_original=df_phish)

    pca_phish = PCA(n_components=32, random_state=5).fit_transform(X_p)
    ica_phish = ICA(n_components=32, random_state=5).fit_transform(X_p)
    rca_phish = RCA(n_components=32, random_state=5).fit_transform(X_p)
    rfc_phish = df_phish[topcols_phish]
    rfc_phish = np.array(rfc_phish.values, dtype='int64')[:, :32]

    # run_kmeans(pca_phish, Y_p, 'PCA Phishing Data')
    run_kmeans(ica_phish, Y_p, 'ICA Phishing Data')
    run_kmeans(rca_phish, Y_p, 'RCA Phishing Data')
    run_kmeans(rfc_phish, Y_p, 'RFC Phishing Data')

    evaluate_kmeans(KMeans(n_clusters=14, n_init=10, random_state=100,
                           n_jobs=-1), pca_phish, Y_p, title="PCA")
    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100,
                           n_jobs=-1), ica_phish, Y_p, title="ICA")
    evaluate_kmeans(KMeans(n_clusters=10, n_init=10, random_state=100,
                           n_jobs=-1), rca_phish, Y_p, title="RCA")
    evaluate_kmeans(KMeans(n_clusters=2, n_init=10, random_state=100,
                           n_jobs=-1), rfc_phish, Y_p, title="RFC")

    run_EM(pca_phish, Y_p, 'PCA Phishing Data')
    run_EM(ica_phish, Y_p, 'ICA Phishing Data')
    run_EM(rca_phish, Y_p, 'RCA Phishing Data')
    run_EM(rfc_phish, Y_p, 'RFC Phishing Data')

    evaluate_EM(EM(n_components=67, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                pca_phish, Y_p, title="PCA")
    evaluate_EM(EM(n_components=64, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                ica_phish, Y_p, title="ICA")
    evaluate_EM(EM(n_components=64, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                rca_phish, Y_p, title="RCA")
    evaluate_EM(EM(n_components=32, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                rfc_phish, Y_p, title="RFC")

    X_v, Y_v, df_vocal = get_vocal_data()
    run_PCA(X_v, Y_v, "Phone Me Data")
    run_ICA(X_v, Y_v, "Phone Me Data")
    run_RCA(X_v, Y_v, "Phone Me Data")
    imp_vocal, topcols_vocal = run_RFC(X_v, Y_v, df_original=df_vocal)

    pca_vocal = PCA(n_components=4, random_state=5).fit_transform(X_v)
    ica_vocal = ICA(n_components=4, random_state=5).fit_transform(X_v)
    rca_vocal = RCA(n_components=4, random_state=5).fit_transform(X_v)
    rfc_vocal = df_vocal[topcols_vocal]
    rfc_vocal = np.array(rfc_vocal.values, dtype='int64')[:, :4]

    run_kmeans(pca_vocal, Y_v, 'PCA Phone Me Data')
    run_kmeans(ica_vocal, Y_v, 'ICA Phone Me Data')
    run_kmeans(rca_vocal, Y_v, 'RCA Phone Me Data')
    run_kmeans(rfc_vocal, Y_v, 'RFC Phone Me Data')

    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100,
                           n_jobs=-1), pca_vocal, Y_v, title="PCA")
    evaluate_kmeans(KMeans(n_clusters=10, n_init=10, random_state=100,
                           n_jobs=-1), ica_vocal, Y_v, title="ICA")
    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100,
                           n_jobs=-1), rca_vocal, Y_v, title="RCA")
    evaluate_kmeans(KMeans(n_clusters=12, n_init=10, random_state=100,
                           n_jobs=-1), rfc_vocal, Y_v, title="RFC")

    run_EM(pca_vocal, Y_v, 'PCA Phone Me Data')
    run_EM(ica_vocal, Y_v, 'ICA Phone Me Data')
    run_EM(rca_vocal, Y_v, 'RCA Phone Me Data')
    run_EM(rfc_vocal, Y_v, 'RFC Phone Me Data')

    evaluate_EM(EM(n_components=58, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                pca_vocal, Y_v, title="PCA")
    evaluate_EM(EM(n_components=52, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
                ica_vocal, Y_v, title="ICA")
    evaluate_EM(EM(n_components=56, covariance_type='diag', n_init=1,
                   warm_start=True, random_state=100),
rca_vocal, Y_v, title="RCA") evaluate_EM(EM(n_components=48, covariance_type='diag', n_init=1, warm_start=True, random_state=100), rfc_vocal, Y_v, title="RFC") # Comparing With NN # Original print("Original") X_train, X_test, y_train, y_test = train_test_split(np.array(X_p), np.array(Y_p), test_size=0.20) full_est = MLPClassifier(hidden_layer_sizes=(50, ), solver='adam', activation='logistic', learning_rate_init=0.01, random_state=100) train_samp_full, NN_train_score_full, NN_fit_time_full, NN_pred_time_full = plot_learning_curve( full_est, X_train, y_train, title="Neural Net Phishing: Full") final_classifier_evaluation(full_est, X_train, X_test, y_train, y_test) # PCA print("PCA") X_train, X_test, y_train, y_test = train_test_split(np.array(pca_phish), np.array(Y_p), test_size=0.20) pca_est = MLPClassifier(hidden_layer_sizes=(50, ), solver='adam', activation='logistic', learning_rate_init=0.01, random_state=100) train_samp_pca, NN_train_score_pca, NN_fit_time_pca, NN_pred_time_pca = plot_learning_curve( pca_est, X_train, y_train, title="Neural Net Phishing: PCA") final_classifier_evaluation(pca_est, X_train, X_test, y_train, y_test) # ICA print("ICA") X_train, X_test, y_train, y_test = train_test_split(np.array(ica_phish), np.array(Y_p), test_size=0.20) ica_est = MLPClassifier(hidden_layer_sizes=(50, ), solver='adam', activation='logistic', learning_rate_init=0.01, random_state=100) train_samp_ica, NN_train_score_ica, NN_fit_time_ica, NN_pred_time_ica = plot_learning_curve( ica_est, X_train, y_train, title="Neural Net Phishing: ICA") final_classifier_evaluation(ica_est, X_train, X_test, y_train, y_test) # Randomised Projection print("RCA") X_train, X_test, y_train, y_test = train_test_split(np.array(rca_phish), np.array(Y_p), test_size=0.20) rca_est = MLPClassifier(hidden_layer_sizes=(50, ), solver='adam', activation='logistic', learning_rate_init=0.01, random_state=100) train_samp_rca, NN_train_score_rca, NN_fit_time_rca, NN_pred_time_rca = plot_learning_curve( rca_est, X_train, y_train, title="Neural Net Phishing: RCA") final_classifier_evaluation(rca_est, X_train, X_test, y_train, y_test) # RFC print("RFC") X_train, X_test, y_train, y_test = train_test_split(np.array(rfc_phish), np.array(Y_p), test_size=0.20) rfc_est = MLPClassifier(hidden_layer_sizes=(50, ), solver='adam', activation='logistic', learning_rate_init=0.01, random_state=100) train_samp_rfc, NN_train_score_rfc, NN_fit_time_rfc, NN_pred_time_rfc = plot_learning_curve( rfc_est, X_train, y_train, title="Neural Net Phishing: RFC") final_classifier_evaluation(rfc_est, X_train, X_test, y_train, y_test) compare_fit_time(train_samp_full, NN_fit_time_full, NN_fit_time_pca, NN_fit_time_ica, NN_fit_time_rca, NN_fit_time_rfc, 'Phishing Dataset') compare_pred_time(train_samp_full, NN_pred_time_full, NN_pred_time_pca, NN_pred_time_ica, NN_pred_time_rca, NN_pred_time_rfc, 'Phishing Dataset') compare_learn_time(train_samp_full, NN_train_score_full, NN_train_score_pca, NN_train_score_ica, NN_train_score_rca, NN_train_score_rfc, 'Phishing Dataset') print("Training Clustered Label") # Training NN on Projected data with cluster labels km = KMeans(n_clusters=2, n_init=10, random_state=100, n_jobs=-1).fit(X_p) km_labels = km.labels_ em = EM(n_components=30, covariance_type='diag', n_init=1, warm_start=True, random_state=100).fit(X_p) em_labels = em.predict(X_p) clust_full = addclusters(X_p, km_labels, em_labels) clust_pca = addclusters(pca_phish, km_labels, em_labels) clust_ica = addclusters(ica_phish, km_labels, em_labels) clust_rca = 
    clust_rfc = addclusters(rfc_phish, km_labels, em_labels)

    print("Training Clustered - Original")
    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_full),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    full_est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                             activation='logistic', learning_rate_init=0.01,
                             random_state=100)
    train_samp_full, NN_train_score_full, NN_fit_time_full, NN_pred_time_full = plot_learning_curve(
        full_est, X_train, y_train,
        title="Neural Net Phishing with Clusters: Full")
    final_classifier_evaluation(full_est, X_train, X_test, y_train, y_test)

    print("Training Clustered - PCA")
    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_pca),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    pca_est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                            activation='logistic', learning_rate_init=0.01,
                            random_state=100)
    train_samp_pca, NN_train_score_pca, NN_fit_time_pca, NN_pred_time_pca = plot_learning_curve(
        pca_est, X_train, y_train,
        title="Neural Net Phishing with Clusters: PCA")
    final_classifier_evaluation(pca_est, X_train, X_test, y_train, y_test)

    print("Training Clustered - ICA")
    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_ica),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    ica_est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                            activation='logistic', learning_rate_init=0.01,
                            random_state=100)
    train_samp_ica, NN_train_score_ica, NN_fit_time_ica, NN_pred_time_ica = plot_learning_curve(
        ica_est, X_train, y_train,
        title="Neural Net Phishing with Clusters: ICA")
    final_classifier_evaluation(ica_est, X_train, X_test, y_train, y_test)

    print("Training Clustered - RCA")
    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_rca),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    rca_est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                            activation='logistic', learning_rate_init=0.01,
                            random_state=100)
    train_samp_rca, NN_train_score_rca, NN_fit_time_rca, NN_pred_time_rca = plot_learning_curve(
        rca_est, X_train, y_train,
        title="Neural Net Phishing with Clusters: RCA")
    final_classifier_evaluation(rca_est, X_train, X_test, y_train, y_test)

    print("Training Clustered - RFC")
    X_train, X_test, y_train, y_test = train_test_split(np.array(clust_rfc),
                                                        np.array(Y_p),
                                                        test_size=0.20)
    rfc_est = MLPClassifier(hidden_layer_sizes=(50,), solver='adam',
                            activation='logistic', learning_rate_init=0.01,
                            random_state=100)
    train_samp_rfc, NN_train_score_rfc, NN_fit_time_rfc, NN_pred_time_rfc = plot_learning_curve(
        rfc_est, X_train, y_train,
        title="Neural Net Phishing with Clusters: RFC")
    final_classifier_evaluation(rfc_est, X_train, X_test, y_train, y_test)

    compare_fit_time(train_samp_full, NN_fit_time_full, NN_fit_time_pca,
                     NN_fit_time_ica, NN_fit_time_rca, NN_fit_time_rfc,
                     'Phishing Dataset')
    compare_pred_time(train_samp_full, NN_pred_time_full, NN_pred_time_pca,
                      NN_pred_time_ica, NN_pred_time_rca, NN_pred_time_rfc,
                      'Phishing Dataset')
    compare_learn_time(train_samp_full, NN_train_score_full,
                       NN_train_score_pca, NN_train_score_ica,
                       NN_train_score_rca, NN_train_score_rfc,
                       'Phishing Dataset')
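# addclusters() is assumed above but not defined in these snippets. Given its
# use (augmenting a feature matrix with the k-means and EM cluster labels), a
# minimal sketch:
def addclusters(X, km_labels, em_labels):
    """Hypothetical helper: append cluster labels as two extra feature columns."""
    X = np.asarray(X)
    km_col = np.asarray(km_labels).reshape(-1, 1)
    em_col = np.asarray(em_labels).reshape(-1, 1)
    return np.hstack((X, km_col, em_col))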
    plt.show()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(kdist, aic_scores, label='AIC')
    ax.plot(kdist, bic_scores, label='BIC')
    plt.grid(True)
    plt.xlabel('# Clusters')
    plt.ylabel('Model Complexity Score')
    plt.title(title + ' Exp Max Model Complexity')
    plt.legend(loc="best")
    plt.show()


run_EM(diabetes_X, diabetes_Y, 'Diabetes Data')
em = EM(n_components=24, covariance_type='diag', warm_start=True,
        random_state=100)

X_train, X_test, y_train, y_test = train_test_split(np.array(creditX),
                                                    np.array(creditY),
                                                    test_size=0.25)
run_EM(X_train, y_train, 'Credit Data')
em = EM(n_components=41, covariance_type='diag', warm_start=True,
        random_state=100)


def run_PCA(X, y, title):
    pca = PCA(random_state=5).fit(X)  # for all components
    cum_var = np.cumsum(pca.explained_variance_ratio_)
    fig, ax1 = plt.subplots()
    ax1.plot(list(range(len(pca.singular_values_))),
             pca.singular_values_, 'm-')