# NOTE(review): `alpha` is not read anywhere in this section; presumably a
# transparency level consumed by later plotting code - confirm.
alpha = 0.3

# Push the whole colour theme into matplotlib's global defaults in one call.
# Keys are applied in the order written here.
plt.rcParams.update({
    'axes.prop_cycle': cycler('color', ACTIVE_COLORS),
    'text.color': LINE_COLOR,
    'patch.edgecolor': LINE_COLOR,
    'patch.facecolor': FILL_COLOR,
    'axes.facecolor': my_palette['white'],
    'axes.edgecolor': my_palette['grey'],
    'axes.labelcolor': my_palette['grey'],
    'xtick.color': my_palette['grey'],
    'ytick.color': my_palette['grey'],
    'grid.color': my_palette['light grey'],
    'boxplot.boxprops.color': FILL_COLOR,
    'boxplot.capprops.color': LINE_COLOR,
    'boxplot.flierprops.color': my_palette['pink'],
    'boxplot.flierprops.markeredgecolor': FILL_COLOR,
    'boxplot.flierprops.markerfacecolor': FILL_COLOR,
    'boxplot.whiskerprops.color': LINE_COLOR,
    'boxplot.meanprops.color': my_palette['purple'],
    'boxplot.meanprops.markeredgecolor': my_palette['purple'],
    'boxplot.meanprops.markerfacecolor': my_palette['purple'],
    'boxplot.medianprops.color': my_palette['green'],
})
# The colour cycle is assigned a second time, exactly as in the original
# statement sequence.
plt.rcParams['axes.prop_cycle'] = cycler('color', ACTIVE_COLORS)

# Train/test accuracy comparison, saved under graphsDir.
plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(['Train', 'Test'], accuracies, ylabel='Accuracy')
plt.suptitle('HFCR Accuracy Comparison')
plt.savefig(graphsDir + 'HFCR Accuracy Comparison')
# Folder that receives the pattern-mining summary charts.
graphsDir = './Results/Pattern Mining/'
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the folder between the check and the call.
os.makedirs(graphsDir, exist_ok=True)

bin_strategies = ['Uniform', 'Quantile', 'Kmeans']
n_bins = [3, 5, 10]

# Hard-coded results: one series per bin count, each list holding one value
# per entry of bin_strategies (the x categories passed to the chart).
values = {
    'with 3 bins': [18200, 20500, 20000],
    'with 5 bins': [18200, 12000, 18000],
    'with 10 bins': [12500, 5400, 10500],
}
plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(bin_strategies, values, ylabel='Number of Patterns')
plt.suptitle('Number of Patterns for a support of 0.01')
plt.savefig(graphsDir + 'HFCR Number of Patterns')

# Same layout as above, now holding the lift values that are charted.
values = {
    'with 3 bins': [30, 59, 29],
    'with 5 bins': [45, 81, 40],
    'with 10 bins': [52, 50, 40],
}
plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(bin_strategies, values, ylabel='Lift Score')
plt.suptitle('Lift of the top 10% for a support of 0.01')
plt.savefig(graphsDir + 'HFCR Lift Score')

# Categories for the next chart (its code is outside this section).
bin_strategies = ['0.25', '0.57']
# Compare class balance before and after under-sampling, over-sampling and
# SMOTE. `target_count` is presumably the value_counts() of DEATH_EVENT -
# confirm against the code that precedes this section.
# NOTE: the `1 - ind_min_class` arithmetic assumes exactly two classes.
min_class = target_count.idxmin()
ind_min_class = target_count.index.get_loc(min_class)

# ind_min_class is a *position* (from get_loc), so it must be read with .iloc.
# Plain target_count[...] on an integer-labelled Series is a label lookup and
# only gave the right answer when labels and positions happened to coincide.
print('Minority class:', target_count.iloc[ind_min_class])
print('Majority class:', target_count.iloc[1 - ind_min_class])
print(
    'Proportion:',
    round(target_count.iloc[ind_min_class] / target_count.iloc[1 - ind_min_class], 2),
    ': 1')

RANDOM_STATE = 42
values = {
    'Original': [
        target_count.values[ind_min_class],
        target_count.values[1 - ind_min_class],
    ]
}

df_class_min = unbal[unbal['DEATH_EVENT'] == min_class]
df_class_max = unbal[unbal['DEATH_EVENT'] != min_class]

# Under-sampling: draw as many majority rows as there are minority rows.
df_under = df_class_max.sample(len(df_class_min))
values['UnderSample'] = [target_count.values[ind_min_class], len(df_under)]

# Over-sampling: draw minority rows with replacement up to the majority size.
df_over = df_class_min.sample(len(df_class_max), replace=True)
values['OverSample'] = [len(df_over), target_count.values[1 - ind_min_class]]

# SMOTE: pop() removes the target column from `unbal` in place, leaving only
# the feature columns for X.
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
y = unbal.pop('DEATH_EVENT').values
X = unbal.values
# fit_resample is the supported name; fit_sample was deprecated and later
# removed from imbalanced-learn.
smote_X, smote_y = smote.fit_resample(X, y)
smote_target_count = pd.Series(smote_y).value_counts()
values['SMOTE'] = [
    smote_target_count.values[ind_min_class],
    smote_target_count.values[1 - ind_min_class],
]

fig = plt.figure()
ds.multiple_bar_chart(
    [target_count.index[ind_min_class], target_count.index[1 - ind_min_class]],
    values,
    title='Target',
    xlabel='frequency',
    ylabel='Class balance')
plt.savefig(graphsDir + 'HFCR Balancing - Class Balanced')
# Chart how many records of each class land in the train and test parts of
# every stratified fold.

# pop() removes the target column from `data` in place; X is what remains.
y: np.ndarray = data.pop('DEATH_EVENT').values
X: np.ndarray = data.values
labels: np.ndarray = pd.unique(y)


def _count_other_than(classes, excluded):
    """Return how many entries of *classes* are not equal to *excluded*."""
    return len(classes) - np.count_nonzero(classes == excluded)


skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
splitIterator = iter(skf.split(X, y))
splitCounter = 1
for trn_rows, tst_rows in splitIterator:
    trnX, trnY = X[trn_rows], y[trn_rows]
    tstX, tstY = X[tst_rows], y[tst_rows]
    # Each entry is [records that are not `negative`, records that are not
    # `positive`], lining up with the [positive, negative] categories below.
    values[f'Train Split {splitCounter}'] = [
        _count_other_than(trnY, negative),
        _count_other_than(trnY, positive),
    ]
    values[f'Test Split {splitCounter}'] = [
        _count_other_than(tstY, negative),
        _count_other_than(tstY, positive),
    ]
    splitCounter += 1

plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(
    [positive, negative], values, title='Data distribution per dataset')
plt.suptitle('HFCR Training Strategies')
plt.savefig(graphsDir + 'HFCR Training Strategies')
centers = ds.compute_centroids(data, labels) mse.append(ds.compute_mse(data.values, labels, centers)) mae.append(ds.compute_mae(data.values, labels, centers)) sc.append(silhouette_score(data, labels)) db.append(davies_bouldin_score(data, labels)) ds.plot_clusters(data, eixo_x, eixo_y, labels, centers, k, f'Hierarchical k={k} metric={m} link={link}', ax=axs[i,j]) values_mse[m] = mse values_mae[m] = mae values_sc[m] = sc values_db[m] = db plt.suptitle('QOT Clustering - Metric (Hierarchical) after PCA') plt.savefig(subDir + 'QOT Clustering - Metric (Hierarchical) after PCA') print('QOT Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB after PCA') _, ax = plt.subplots(1, 4, figsize=(10, 3), squeeze=False) ds.multiple_bar_chart(LINKS, values_mse, title=f'Hierarchical MSE', xlabel='metric', ylabel='MSE', ax=ax[0, 0]) ds.multiple_bar_chart(LINKS, values_mae, title=f'Hierarchical MAE', xlabel='metric', ylabel='MAE', ax=ax[0, 1]) ds.multiple_bar_chart(LINKS, values_sc, title=f'Hierarchical SC', xlabel='metric', ylabel='SC', ax=ax[0, 2], percentage=True) ds.multiple_bar_chart(LINKS, values_db, title=f'Hierarchical DB', xlabel='metric', ylabel='DB', ax=ax[0, 3]) plt.suptitle('QOT Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB after PCA') plt.savefig(subDir + 'QOT Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB after PCA') count += 1 plt.close("all") plt.clf() features_file.close()
ax=axs[0, k], title='Overfitting for dist = %s' % (d), xlabel='K Neighbours', ylabel='accuracy', percentage=True) plt.suptitle('QOT Overfitting - KNN') plt.savefig(subDir + 'QOT Overfitting - KNN') clf = knn = KNeighborsClassifier(n_neighbors=best[0], metric=best[1]) clf.fit(trnX, trnY) prd_trn = clf.predict(trnX) prd_tst = clf.predict(tstX) ds.plot_evaluation_results(["negative", "positive"], trnY, prd_trn, tstY, prd_tst) plt.suptitle('QOT KNN - ' + key + '- Performance & Confusion matrix - %d neighbors and %s' % (best[0], best[1])) plt.savefig(subDir + 'QOT KNN - ' + key + ' - Performance & Confusion matrix') plt.close("all") plt.figure(figsize=(7, 7)) ds.multiple_bar_chart(['Train', 'Test'], best_accuracies, ylabel='Accuracy') plt.suptitle('QOT Sampling & Feature Selection') plt.savefig(graphsDir + 'QOT Sampling & Feature Selection') plt.figure(figsize=(7, 7)) ds.multiple_bar_chart(['Train', 'Test'], recalls, ylabel='Recall') plt.suptitle('QOT Recall Comparison') plt.savefig(graphsDir + 'QOT Recall Comparison')
break if (last_accuracy > best_accuracy and best_accuracy != -1): best_accuracy = last_accuracy last_accuracy = -1 count += offset offset -= 1 elif (best_accuracy == -1): best_accuracy = last_accuracy count += 1 else: count += 1 offset -= 1 plt.figure(figsize=(7, 7)) ds.multiple_bar_chart(['Train', 'Test'], values_by_criteria, ylabel='Accuracy') plt.suptitle('HFCR Gradient Boosting Criteria') plt.savefig(subDir + 'HFCR Gradient Boosting Criteria') plt.style.use('dslabs.mplstyle') my_palette = { 'yellow': '#ECD474', 'pale orange': '#E9AE4E', 'salmon': '#E2A36B', 'orange': '#F79522', 'dark orange': '#D7725E', 'pale acqua': '#92C4AF', 'acqua': '#64B29E', 'marine': '#3D9EA9',
# Box-plot colours and the default colour cycle, applied in one call.
plt.rcParams.update({
    'boxplot.boxprops.color': FILL_COLOR,
    'boxplot.capprops.color': LINE_COLOR,
    'boxplot.flierprops.color': my_palette['pink'],
    'boxplot.flierprops.markeredgecolor': FILL_COLOR,
    'boxplot.flierprops.markerfacecolor': FILL_COLOR,
    'boxplot.whiskerprops.color': LINE_COLOR,
    'boxplot.meanprops.color': my_palette['purple'],
    'boxplot.meanprops.markeredgecolor': my_palette['purple'],
    'boxplot.meanprops.markerfacecolor': my_palette['purple'],
    'boxplot.medianprops.color': my_palette['green'],
    'axes.prop_cycle': cycler('color', ACTIVE_COLORS),
})

# One train/test bar chart per score; each is titled and saved under
# graphsDir as 'HFCR <score> Comparison'.
for _series, _score in (
        (accuracies, 'Accuracy'),
        (recalls, 'Recall'),
        (specificities, 'Specificity')):
    plt.figure(figsize=(7, 7))
    ds.multiple_bar_chart(['Train', 'Test'], _series, ylabel=_score)
    plt.suptitle(f'HFCR {_score} Comparison')
    plt.savefig(graphsDir + f'HFCR {_score} Comparison')

# The precision chart is only drawn here; any title/save for it lies outside
# this section.
plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(['Train', 'Test'], precisions, ylabel='Precision')
_SCORE_NAMES = ('MSE', 'MAE', 'SC', 'DB')


def _new_scores():
    """Return one empty series per clustering quality score."""
    return {name: [] for name in _SCORE_NAMES}


def _record(scores, mse, mae, sc, db):
    """Append one observation to each of the four score series."""
    for name, value in zip(_SCORE_NAMES, (mse, mae, sc, db)):
        scores[name].append(value)


def _score_partition(data, labels, centers):
    """Return (MSE, MAE, silhouette, Davies-Bouldin) for *labels* on *data*."""
    return (
        ds.compute_mse(data.values, labels, centers),
        ds.compute_mae(data.values, labels, centers),
        silhouette_score(data, labels),
        davies_bouldin_score(data, labels),
    )


def _new_grid(rows, cols):
    """Open a figure holding a rows x cols grid of 5x5-inch axes."""
    _, axs = plt.subplots(
        rows, cols, figsize=(cols * 5, rows * 5), squeeze=False)
    return axs


def _save_figure(name, subDir):
    """Title the current figure *name* and save it as ``subDir + name``."""
    plt.suptitle(name)
    plt.savefig(subDir + name)


def _plot_score_summary(plot_fn, x_values, scores, label, xlabel, name,
                        subDir):
    """Draw the four score series side by side with *plot_fn* and save them."""
    print(name)
    _, ax = plt.subplots(1, 4, figsize=(10, 3), squeeze=False)
    for col, score in enumerate(_SCORE_NAMES):
        # Only the SC panel is passed percentage=True.
        extra = {'percentage': True} if score == 'SC' else {}
        plot_fn(x_values, scores[score], title=f'{label} {score}',
                xlabel=xlabel, ylabel=score, ax=ax[0, col], **extra)
    _save_figure(name, subDir)


def pca_function(data, subDir):
    """Run PCA on the HFCR data and chart clusterings of the first two PCs.

    The function fits a PCA, plots the explained variance ratio and the
    projected records, then clusters the two leading components with
    K-Means, Gaussian mixtures (EM), DBSCAN (over eps values and over
    distance metrics) and agglomerative clustering (over k and over
    metric/linkage pairs). Every family gets a grid of cluster plots and a
    four-panel MSE / MAE / SC / DB summary, each saved with
    ``plt.savefig(subDir + <figure name>)``.

    Args:
        data: DataFrame holding a 'DEATH_EVENT' column. That column is
            removed from the caller's frame in place (``data.pop``).
        subDir: String prefix concatenated in front of every file name.

    Returns:
        None. Output is the saved figures plus progress lines on stdout;
        all figures are closed before returning.
    """
    data.pop('DEATH_EVENT')
    variables = data.columns.values
    # Column positions of the two original variables used in the scatter
    # plots.
    feat_x, feat_y = 4, 7

    plt.figure()
    plt.xlabel(variables[feat_x])
    plt.ylabel(variables[feat_y])
    plt.scatter(data.iloc[:, feat_x], data.iloc[:, feat_y])

    print('HFCR Feature Extraction - PCA')
    mean = (data.mean(axis=0)).tolist()
    centered_data = data - mean
    pca = PCA()
    pca.fit(centered_data)

    # PLOT EXPLAINED VARIANCE RATIO
    plt.figure(figsize=(4, 4))
    plt.title('Explained variance ratio')
    plt.xlabel('PC')
    plt.ylabel('ratio')
    x_values = [str(i) for i in range(1, len(pca.components_) + 1)]
    bwidth = 0.5
    ax = plt.gca()
    ax.set_xticklabels(x_values)
    ax.set_ylim(0.0, 1.0)
    ax.bar(x_values, pca.explained_variance_ratio_, width=bwidth)
    ax.plot(pca.explained_variance_ratio_)
    for i, v in enumerate(pca.explained_variance_ratio_):
        ax.text(i, v + 0.05, f'{v*100:.1f}', ha='center', fontweight='bold')
    plt.suptitle('HFCR Feature Extraction - PCA')
    plt.savefig(subDir + 'HFCR Feature Extraction - PCA')

    print('HFCR Feature Extraction - PCA 2')
    # NOTE(review): the PCA was fitted on centered_data but the raw frame is
    # transformed, so the projection is offset by a constant. Left as is,
    # because the hard-coded eps values below were chosen on this projection
    # - confirm whether that is intended.
    transf = pca.transform(data)
    _, axs = plt.subplots(1, 2, figsize=(2 * 5, 1 * 5), squeeze=False)
    axs[0, 0].set_xlabel(variables[feat_x])
    axs[0, 0].set_ylabel(variables[feat_y])
    axs[0, 0].scatter(data.iloc[:, feat_x], data.iloc[:, feat_y])
    axs[0, 1].set_xlabel('PC1')
    axs[0, 1].set_ylabel('PC2')
    axs[0, 1].scatter(transf[:, 0], transf[:, 1])
    plt.suptitle('HFCR Feature Extraction - PCA')
    # Saved under its own name: reusing 'HFCR Feature Extraction - PCA'
    # overwrote the explained-variance figure written just above.
    plt.savefig(subDir + 'HFCR Feature Extraction - PCA 2')

    print('Clustering after PCA')
    projected = pd.DataFrame(transf[:, :2], columns=['PC1', 'PC2'])
    pc_x, pc_y = 0, 1
    N_CLUSTERS = [2, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
    rows, cols = ds.choose_grid(len(N_CLUSTERS))

    # Throughout, subplot n sits at divmod(n, cols). The previous hand-kept
    # cursor gave the same cells when every run was plotted, but drifted
    # (and could index past the grid) when a DBSCAN run was skipped.

    # ---- K-Means ----
    name = 'HFCR Clustering - K-Means after PCA'
    print(name)
    scores = _new_scores()
    axs = _new_grid(rows, cols)
    for n, k in enumerate(N_CLUSTERS):
        estimator = KMeans(n_clusters=k)
        estimator.fit(projected)
        # The MSE series stores the estimator's inertia_ here, not
        # ds.compute_mse.
        _record(
            scores,
            estimator.inertia_,
            ds.compute_mae(projected.values, estimator.labels_,
                           estimator.cluster_centers_),
            silhouette_score(projected, estimator.labels_),
            davies_bouldin_score(projected, estimator.labels_))
        ds.plot_clusters(projected, pc_x, pc_y,
                         estimator.labels_.astype(float),
                         estimator.cluster_centers_, k, f'KMeans k={k}',
                         ax=axs[divmod(n, cols)])
    _save_figure(name, subDir)
    _plot_score_summary(
        ds.plot_line, N_CLUSTERS, scores, 'KMeans', 'k',
        'HFCR Clustering - K-Means after PCA MSE vs MAE vs SC vs DB after PCA',
        subDir)

    # ---- Expectation-Maximization ----
    name = 'HFCR Clustering - Expectation-Maximization after PCA'
    print(name)
    scores = _new_scores()
    axs = _new_grid(rows, cols)
    for n, k in enumerate(N_CLUSTERS):
        estimator = GaussianMixture(n_components=k)
        estimator.fit(projected)
        labels = estimator.predict(projected)
        _record(scores, *_score_partition(projected, labels, estimator.means_))
        ds.plot_clusters(projected, pc_x, pc_y, labels.astype(float),
                         estimator.means_, k, f'EM k={k}',
                         ax=axs[divmod(n, cols)])
    _save_figure(name, subDir)
    _plot_score_summary(
        ds.plot_line, N_CLUSTERS, scores, 'EM', 'k',
        'HFCR Clustering - Expectation-Maximization MSE vs MAE vs SC vs DB after PCA',
        subDir)

    # ---- DBSCAN over eps ----
    name = 'HFCR Clustering - EPS (Density-based) after PCA'
    print(name)
    EPS = [2.5, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    scores = _new_scores()
    rows, cols = ds.choose_grid(len(EPS))
    axs = _new_grid(rows, cols)
    for n, eps in enumerate(EPS):
        estimator = DBSCAN(eps=eps, min_samples=2)
        estimator.fit(projected)
        labels = estimator.labels_
        # Label -1 is excluded from the cluster count.
        k = len(set(labels)) - (1 if -1 in labels else 0)
        if k > 1:
            centers = ds.compute_centroids(projected, labels)
            _record(scores, *_score_partition(projected, labels, centers))
            ds.plot_clusters(projected, pc_x, pc_y, labels.astype(float),
                             estimator.components_, k,
                             f'DBSCAN eps={eps} k={k}',
                             ax=axs[divmod(n, cols)])
        else:
            # Fewer than two clusters: record zeros and leave the cell empty.
            _record(scores, 0, 0, 0, 0)
    _save_figure(name, subDir)
    _plot_score_summary(
        ds.plot_line, EPS, scores, 'DBSCAN', 'eps',
        'HFCR Clustering - EPS (Density-based) MSE vs MAE vs SC vs DB after PCA',
        subDir)

    # ---- DBSCAN over distance metrics ----
    name = 'HFCR Clustering - Metric (Density-based) after PCA'
    print(name)
    METRICS = ['euclidean', 'cityblock', 'chebyshev', 'cosine', 'jaccard']
    distances = [
        np.mean(np.mean(squareform(pdist(projected.values, metric=m))))
        for m in METRICS
    ]
    print('AVG distances among records', distances)
    # The averages are only reported; the eps used per metric is hand-picked.
    distances = [80, 50, 80, 0.0005, 0.0009]
    print('CHOSEN EPS', distances)
    scores = _new_scores()
    rows, cols = ds.choose_grid(len(METRICS))
    axs = _new_grid(rows, cols)
    for n, (m, eps) in enumerate(zip(METRICS, distances)):
        estimator = DBSCAN(eps=eps, min_samples=2, metric=m)
        estimator.fit(projected)
        labels = estimator.labels_
        k = len(set(labels)) - (1 if -1 in labels else 0)
        if k > 1:
            centers = ds.compute_centroids(projected, labels)
            _record(scores, *_score_partition(projected, labels, centers))
            ds.plot_clusters(projected, pc_x, pc_y, labels.astype(float),
                             estimator.components_, k,
                             f'DBSCAN metric={m} eps={eps:.2f} k={k}',
                             ax=axs[divmod(n, cols)])
        else:
            print(k)
            _record(scores, 0, 0, 0, 0)
    _save_figure(name, subDir)
    _plot_score_summary(
        ds.bar_chart, METRICS, scores, 'DBSCAN', 'metric',
        'HFCR Clustering - Metric (Density-based) MSE vs MAE vs SC vs DB after PCA',
        subDir)

    # ---- Agglomerative clustering over k ----
    name = 'HFCR Clustering - Hierarchical after PCA'
    print(name)
    scores = _new_scores()
    rows, cols = ds.choose_grid(len(N_CLUSTERS))
    axs = _new_grid(rows, cols)
    for n, k in enumerate(N_CLUSTERS):
        estimator = AgglomerativeClustering(n_clusters=k)
        estimator.fit(projected)
        labels = estimator.labels_
        centers = ds.compute_centroids(projected, labels)
        _record(scores, *_score_partition(projected, labels, centers))
        ds.plot_clusters(projected, pc_x, pc_y, labels, centers, k,
                         f'Hierarchical k={k}', ax=axs[divmod(n, cols)])
    _save_figure(name, subDir)
    _plot_score_summary(
        ds.plot_line, N_CLUSTERS, scores, 'Hierarchical', 'k',
        'HFCR Clustering - Hierarchical MSE vs MAE vs SC vs DB after PCA',
        subDir)

    # ---- Agglomerative clustering over metric x linkage ----
    name = 'HFCR Clustering - Metric (Hierarchical) after PCA'
    print(name)
    LINKS = ['complete', 'average']
    k = 3
    # One {metric: [value per linkage]} mapping per score, which is the
    # structure handed to ds.multiple_bar_chart below.
    by_metric = {score: {} for score in _SCORE_NAMES}
    axs = _new_grid(len(METRICS), len(LINKS))
    for i, m in enumerate(METRICS):
        per_link = _new_scores()
        for j, link in enumerate(LINKS):
            # NOTE(review): newer scikit-learn renames `affinity` to
            # `metric`; kept so the call still works on the version this
            # project was written against - confirm before upgrading.
            estimator = AgglomerativeClustering(
                n_clusters=k, linkage=link, affinity=m)
            estimator.fit(projected)
            labels = estimator.labels_
            centers = ds.compute_centroids(projected, labels)
            _record(per_link, *_score_partition(projected, labels, centers))
            ds.plot_clusters(projected, pc_x, pc_y, labels, centers, k,
                             f'Hierarchical k={k} metric={m} link={link}',
                             ax=axs[i, j])
        for score in _SCORE_NAMES:
            by_metric[score][m] = per_link[score]
    _save_figure(name, subDir)
    _plot_score_summary(
        ds.multiple_bar_chart, LINKS, by_metric, 'Hierarchical', 'metric',
        'HFCR Clustering - Metric (Hierarchical) MSE vs MAE vs SC vs DB after PCA',
        subDir)

    # Release every figure. The former trailing plt.clf() is dropped: with
    # no figure left it only created a new blank one.
    plt.close("all")
plt.rcParams['boxplot.medianprops.color'] = my_palette['green']
plt.rcParams['axes.prop_cycle'] = cycler('color', ACTIVE_COLORS)

# Folder that receives the Gradient Boosting recall chart.
graphsDir = './Results/Recalls/Gradient Boosting/'
# exist_ok=True replaces the separate os.path.exists() test, which could race
# with another process creating the folder between the check and the call.
os.makedirs(graphsDir, exist_ok=True)

# Hard-coded recall values, one [train, test] pair per configuration, matching
# the ['Train', 'Test'] categories passed to the chart below.
recalls = {
    'Original': [1, 0.7368],
    ' - No Outliers - Original': [1, 0.7368],
    ' - Scaling - Original': [1, 0.7368],
    ' - Scaling & Feature Selection - Original': [1, 0.7368],
    'UnderSample': [1, 0.7368],
    ' - No Outliers - UnderSample': [1, 0.6842],
    ' - No Outliers & Scaling - UnderSample': [1, 0.7368],
    ' - No Outliers & Feature Selection - UnderSample': [1, 0.7368],
    'OverSample': [1, 0.6842],
    ' - No Outliers - OverSample': [1, 0.6842],
    ' - Scaling - OverSample': [1, 0.7368],
    ' - Scaling & Feature Selection - OverSample': [1, 0.7368],
    'SMOTE': [1, 0.7368],
    ' - No Outliers - SMOTE': [1, 0.7368],
    ' - No Outliers & Scaling - SMOTE': [1, 0.6842],
    ' - No Outliers & Feature Selection - SMOTE': [1, 0.75],
}

plt.figure(figsize=(7, 7))
ds.multiple_bar_chart(['Train', 'Test'], recalls, ylabel='Recall')
plt.suptitle('HFCR Recall Comparison')
plt.savefig(graphsDir + 'HFCR Recall Comparison')