# Imports used below
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold

from data_cleaner import create_dataset


def plot_accuracy(seed):
    colors = ['navy', 'darkorange']
    target_names = ['High Risk', 'Low Risk']

    def make_ellipses(gmm, ax):
        # Draw a covariance ellipse for each mixture component; each
        # covariance type stores its parameters in a different shape.
        for n, color in enumerate(colors):
            if gmm.covariance_type == 'full':
                covariances = gmm.covariances_[n][:2, :2]
            elif gmm.covariance_type == 'tied':
                covariances = gmm.covariances_[:2, :2]
            elif gmm.covariance_type == 'diag':
                covariances = np.diag(gmm.covariances_[n][:2])
            elif gmm.covariance_type == 'spherical':
                covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
            v, w = np.linalg.eigh(covariances)
            u = w[0] / np.linalg.norm(w[0])
            angle = np.arctan2(u[1], u[0])
            angle = 180 * angle / np.pi  # convert to degrees
            v = 2. * np.sqrt(2.) * np.sqrt(v)
            ell = mpl.patches.Ellipse(gmm.means_[n, :2], v[0], v[1],
                                      angle=180 + angle, color=color)
            ell.set_clip_box(ax.bbox)
            ell.set_alpha(0.5)
            ax.add_artist(ell)
            ax.set_aspect('equal', 'datalim')

    X, y = create_dataset()

    # Split into non-overlapping training (90%) and test (10%) sets by
    # taking only the first fold of a 10-fold stratified split.
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    train_index, test_index = next(iter(skf.split(X, y)))

    X_train = X[train_index]
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]

    n_classes = len(np.unique(y_train))

    # Try GMMs using different types of covariances.
    estimators = {
        cov_type: GaussianMixture(n_components=n_classes,
                                  covariance_type=cov_type,
                                  max_iter=20, random_state=0)
        for cov_type in ['spherical', 'diag', 'tied', 'full']
    }
    n_estimators = len(estimators)

    plt.figure(figsize=(3 * n_estimators // 2, 6))
    plt.subplots_adjust(bottom=.05, top=0.95, hspace=.15, wspace=.05,
                        left=.02, right=.98)

    for index, (name, estimator) in enumerate(estimators.items()):
        # Since we have class labels for the training data, we can
        # initialize the GMM parameters in a supervised manner.
        estimator.means_init = np.array(
            [X_train[y_train == i].mean(axis=0) for i in range(n_classes)])

        # Train the other parameters using the EM algorithm.
        estimator.fit(X_train)

        h = plt.subplot(2, n_estimators // 2, index + 1)
        make_ellipses(estimator, h)

        for n, color in enumerate(colors):
            data = X[y == n]
            plt.scatter(data[:, 0], data[:, 1], s=2, color=color,
                        label=target_names[n])
        # Plot the test data with crosses.
        for n, color in enumerate(colors):
            data = X_test[y_test == n]
            plt.scatter(data[:, 0], data[:, 1], s=20 * 8, marker='x',
                        color=color)

        y_train_pred = estimator.predict(X_train)
        train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
        plt.text(0.05, 0.9, 'Train accuracy: %.1f' % train_accuracy,
                 transform=h.transAxes, fontsize=18)

        y_test_pred = estimator.predict(X_test)
        test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
        plt.text(0.05, 0.8, 'Test accuracy: %.1f' % test_accuracy,
                 transform=h.transAxes, fontsize=18)

        plt.title('GMM: K=2, Cv-type = {}'.format(name))

    plt.legend(scatterpoints=40, loc='lower right', prop=dict(size=25))
    plt.show()
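# The plot_gmms fragment below calls a draw_ellipse helper that is not part
# of this excerpt. A minimal sketch of what such a helper could look like,
# with the signature inferred from the call site (position, covariance,
# plus matplotlib patch keyword arguments):
def draw_ellipse(position, covariance, ax=None, **kwargs):
    # Draw an ellipse at a given position with a given covariance.
    ax = ax or plt.gca()
    if covariance.shape == (2, 2):
        # Principal axes of the covariance give the orientation and radii.
        U, s, _ = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        # Spherical/diagonal case: axis-aligned radii, no rotation.
        angle = 0
        width, height = 2 * np.sqrt(covariance)
    # Draw contours at 1-4 standard deviations.
    for nsig in range(1, 5):
        ax.add_patch(mpl.patches.Ellipse(position, nsig * width,
                                         nsig * height, angle=angle,
                                         **kwargs))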
        draw_ellipse(mean, cov, alpha=weights * w_factor, edgecolor=color,
                     facecolor=color, linewidth=4)
    ax.set(title='Plot of the best fitting GMM')
    plt.show()


if __name__ == '__main__':
    # Create the data set.
    seed = 56
    np.random.seed(seed)
    X, y = create_dataset()

    # Select the best GMM by two-level (outer/inner) cross-validation.
    n_components_range = range(1, 3)
    cv_types = ['full']
    models = create_gmm_models(cv_types, n_components_range)
    best_gmm, score = my_cv(X, y, models, K_out=10, K_in=10, seed=seed)

    # Refit the winning model on all data and extract its parameters.
    best_gmm.fit(X)
    clf = best_gmm.predict(X)
    cent = best_gmm.means_
    covars = best_gmm.covariances_

    plot_accuracy(seed)
    plot_gmms(cv_types, n_components_range, best_gmm)


# Cluster validity
def ard(data, K):
    # Average relative density of each observation based on its K nearest
    # neighbours (Euclidean distance).
    d = lambda x, y: np.sqrt(((x - y)**2).sum())
    densities = kde(data, K)
    N = len(data)
    ards = np.zeros(N)
    for i, obs in enumerate(data):
        distances = np.zeros(N)
        for j, obs2 in enumerate(data):
            if i == j:
                continue
            distances[j] = d(obs, obs2)
        distances.sort()
        # After sorting, distances[0] is the zero self-distance, so skip it
        # and sum the distances to the K true nearest neighbours.
        ards[i] = distances[1:K + 1].sum()
    ards = densities / ards * K
    ards.sort()
    plt.bar(np.linspace(1, N, N), ards)
    plt.title("KNN average relative density")
    plt.xlabel("Observation")
    plt.ylabel("ARD")
    plt.show()


if __name__ == "__main__":
    data, target = create_dataset()
    # gdk(data)
    ard(data, 10)
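# The kde helper called in ard above is not shown in this excerpt. A
# plausible minimal version, assuming it returns the K-nearest-neighbour
# density 1 / (average distance to the K nearest neighbours):
from scipy.spatial import distance

def kde(data, K):
    # Pairwise Euclidean distances, with self-distances masked out.
    D = distance.squareform(distance.pdist(data))
    np.fill_diagonal(D, np.inf)
    D.sort(axis=1)
    # Inverse of the mean distance to the K nearest neighbours.
    return 1.0 / D[:, :K].mean(axis=1)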
from data_cleaner import create_dataset
from sklearn import metrics
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from scipy.spatial import distance
from scipy.cluster import hierarchy
from matplotlib.pyplot import figure, title, plot, ylim, legend, show
import numpy as np
import matplotlib.pyplot as plt

data, y_true = create_dataset()

# One score per linkage type.
Rand = np.zeros((4, ))
Jaccard = np.zeros((4, ))
NMI = np.zeros((4, ))
KCluster = np.zeros((4, ))

linkage_list = ["complete", "average", "single", "ward"]
for i, link in enumerate(linkage_list):
    model = AgglomerativeClustering(n_clusters=2, affinity='euclidean',
                                    linkage=link)
    y_pred = model.fit_predict(data)
    Rand[i] = metrics.adjusted_rand_score(y_true, y_pred)
    Jaccard[i] = metrics.jaccard_score(y_true, y_pred)
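# The scipy hierarchy/distance imports above are unused in this excerpt;
# a minimal sketch of the dendrogram they suggest (the linkage method is
# chosen here purely for illustration):
Z = hierarchy.linkage(distance.pdist(data), method="ward")
plt.figure()
hierarchy.dendrogram(Z, truncate_mode="level", p=5)
plt.title("Dendrogram (ward linkage)")
plt.show()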
def plot_bic(n_components_range):
    # Load the data set (seeded for reproducibility).
    np.random.seed(56)
    X, y = create_dataset()

    # Fit a Gaussian mixture with EM for every covariance type and number
    # of components, keeping the model with the lowest BIC.
    lowest_bic = np.inf
    bic = []
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            gmm = mixture.GaussianMixture(n_components=n_components,
                                          covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm

    bic = np.array(bic)
    color_iter = itertools.cycle(
        ['navy', 'turquoise', 'cornflowerblue', 'darkorange'])
    clf = best_gmm
    bars = []

    # Plot the BIC scores.
    plt.figure(figsize=(8, 6))
    spl = plt.subplot(2, 1, 1)
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(n_components_range) + .2 * (i - 2)
        bars.append(
            plt.bar(xpos,
                    bic[i * len(n_components_range):
                        (i + 1) * len(n_components_range)],
                    width=.2, color=color))
    plt.xticks(n_components_range)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    plt.title('BIC score per model')
    # Mark the winning model with a star.
    xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 + \
        .2 * np.floor(bic.argmin() / len(n_components_range))
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.legend([b[0] for b in bars], cv_types)

    # Plot the winner.
    splot = plt.subplot(2, 1, 2)
    Y_ = clf.predict(X)
    for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                               color_iter)):
        v, w = linalg.eigh(cov)
        if not np.any(Y_ == i):
            continue
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)

        # Plot an ellipse to show the Gaussian component.
        angle = np.arctan2(w[0][1], w[0][0])
        angle = 180. * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        ell = mpl.patches.Ellipse(mean, v[0], v[1], angle=180. + angle,
                                  color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(.5)
        splot.add_artist(ell)

    plt.xticks(())
    plt.yticks(())
    plt.title('Selected GMM: full model, 2 components')
    plt.subplots_adjust(hspace=.35, bottom=.02)
    plt.show()
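# For reference, the score minimised above is
#     BIC = -2 * ln(L) + p * ln(N)
# where L is the maximised likelihood, p the number of free parameters and
# N the number of observations. Example invocation (the range here is
# illustrative; the main script above uses range(1, 3)):
#
#     plot_bic(range(1, 7))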
from data_cleaner import create_dataset
import numpy as np
from apyori import apriori

data, pred = create_dataset()
data = np.asarray(data)

# Binarize the attributes: an item is "present" in an observation when its
# value exceeds the attribute's mean.
X = np.zeros(data.shape)
means = np.mean(data, axis=0)
for i, obs in enumerate(data):
    X[i] = obs > means

label = ["PV", "TA", "GD", "VE", "TE", "PS", "NW", "JA"]

print(X)


def mat2transactions(X, labels=[]):
    # Convert a binary matrix to a list of transactions, optionally mapping
    # column indices to attribute labels.
    T = []
    for i in range(X.shape[0]):
        l = np.nonzero(X[i, :])[0].tolist()
        if labels:
            l = [labels[j] for j in l]
        T.append(l)
    return T
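# A minimal usage sketch for the helper above; the support and confidence
# thresholds are illustrative, not taken from the original analysis:
T = mat2transactions(X, labels=label)
rules = apriori(T, min_support=0.3, min_confidence=0.6)
for record in rules:
    for stat in record.ordered_statistics:
        print(list(stat.items_base), '->', list(stat.items_add),
              'confidence: %.2f' % stat.confidence)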