# Imports reconstructed for self-containedness (assumption: helpers such as
# new_logger, get_mnist, get_unique, tud_colors, example_image_idx and
# train_size are defined elsewhere in this project; the ELM/InputToNode
# classes are assumed to come from PyRCN -- adjust to the actual layout).
import csv
import glob
import os
import pickle
import time

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib.colors import Normalize
from scipy.signal import convolve2d
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.model_selection import (GridSearchCV, StratifiedShuffleSplit,
                                     train_test_split)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

from pyrcn.base.blocks import (BatchIntrinsicPlasticity, InputToNode,
                               PredefinedWeightsInputToNode)
from pyrcn.extreme_learning_machine import ELMClassifier
from pyrcn.linear_model import IncrementalRegression


def plot_variance_mean(directory, *args, **kwargs):
    logger = new_logger('plot_variance_mean', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    image_size = (28, 28)

    # remove the mean first, then scale to [0, 1] before fitting the scaler
    scaler = StandardScaler().fit(
        StandardScaler(with_std=False).fit_transform(X) / 255)

    fig, axs = plt.subplots(1, 2, figsize=(8, 4))
    axs[0].imshow(np.resize(scaler.mean_, image_size),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    # bug fix: the variance image was drawn into axs[0] as well, overwriting
    # the mean; it belongs in axs[1]
    axs[1].imshow(np.resize(scaler.var_, image_size),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title(r'$\mu$')
    axs[1].set_title(r'$\sigma^2$')
    fig.tight_layout()
    fig.savefig(
        os.path.join(directory, 'mnist-pixel-variance-and-mean-avgfree.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-variance-and-mean-avgfree.pgf'),
                format='pgf')
    logger.info('np.max(scaler.mean_) = {0}, np.max(scaler.var_) = {1}'.format(
        np.max(scaler.mean_), np.max(scaler.var_)))
    return
def plot_labels(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    # find first occurrence of each digit
    idx = np.ones((10, )) * -1
    cnt = 0
    while np.any(idx == -1):
        if idx[int(y[cnt])] == -1.0:
            idx[int(y[cnt])] = int(cnt)
        cnt += 1

    # display digits
    fig, axs = plt.subplots(2, 5, figsize=(5, 2))
    for i in range(10):
        axs[i // 5][i % 5].imshow(np.resize(X[int(idx[i])], (28, 28)),
                                  cmap=plt.cm.gray_r,
                                  interpolation='none')
        axs[i // 5][i % 5].set_xticks([0, 27])
        axs[i // 5][i % 5].set_xticklabels([0, 27])
        axs[i // 5][i % 5].set_yticks([0, 27])
        axs[i // 5][i % 5].set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-visualize.pgf'), format='pgf')
def plot_img_cluster(directory, *args, **kwargs):
    X, y = get_mnist(directory)
    img = X[example_image_idx, :]
    clusterer = KMeans(n_clusters=4, random_state=42)
    img_clusters = clusterer.fit_predict(img.reshape((784, 1))).reshape(
        (28, 28))
    list_cluster_colors = [
        tud_colors['lightblue'], tud_colors['lightgreen'],
        tud_colors['lightpurple'], tud_colors['gray']
    ]

    img_cluster_colors = np.zeros((28, 28, 4))
    # loop variables renamed from (x, y) to avoid shadowing the label array y
    for row in range(img_cluster_colors.shape[0]):
        for col in range(img_cluster_colors.shape[1]):
            img_cluster_colors[row, col, :] = \
                list_cluster_colors[img_clusters[row, col]]

    # display digit
    fig, axs = plt.subplots(1, 1, figsize=(2, 2))
    axs.imshow(img_cluster_colors, interpolation='none')
    axs.set_xticks([0, 27])
    axs.set_xticklabels([0, 27])
    axs.set_yticks([0, 27])
    axs.set_yticklabels([0, 27])
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'plot-img-clusters.pdf'),
                format='pdf')
    plt.imsave(os.path.join(directory, 'plot-img-clusters.png'),
               img_cluster_colors)
def silhouette_kcluster(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_kcluster', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    X /= 255.
    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)

    # PCA preprocessed
    X_pca = pca.transform(X)

    k = [10, 15, 20, 25, 30, 35, 50, 100, 200]
    dict_results = {
        'n_clusters': [],
        'pca_n_components': [],
        'pca_expl_var': [],
        'pca_expl_var_ratio': [],
        'silhouette_kcosine': [],
        'silhouette_kmeans': [],
        'fittime_kcosine': [],
        'fittime_kmeans': []
    }

    for n_clusters in k:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_expl_var'].append(np.sum(pca.explained_variance_))
        dict_results['pca_expl_var_ratio'].append(
            np.sum(pca.explained_variance_ratio_))

        # kmeans, scored with the euclidean silhouette
        clusterer_euclid = KMeans(n_clusters=n_clusters, random_state=42)
        t = time.time()
        clusterer_euclid.fit(X_pca)
        dict_results['fittime_kmeans'].append(time.time() - t)
        dict_results['silhouette_kmeans'].append(
            silhouette_score(X,
                             clusterer_euclid.predict(X_pca),
                             metric='euclidean',
                             random_state=42))

        # "kcosine": note that scikit-learn's KMeans always minimizes
        # euclidean inertia, so this clusterer is identical to the one
        # above -- only the silhouette metric below differs
        clusterer_cosine = KMeans(n_clusters=n_clusters, random_state=42)
        t = time.time()
        clusterer_cosine.fit(X_pca)
        dict_results['fittime_kcosine'].append(time.time() - t)
        dict_results['silhouette_kcosine'].append(
            silhouette_score(X,
                             clusterer_cosine.predict(X_pca),
                             metric='cosine',
                             random_state=42))

    # save results to csv
    with open(os.path.join(directory, 'silhouette_kcluster.csv'), 'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
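# Sketch (an assumption, not part of the original source): scikit-learn's
# KMeans cannot minimize cosine distance directly, but L2-normalizing the
# samples first is a common approximation, because for unit vectors
# ||a - b||^2 = 2 * (1 - cos(a, b)):
#
#     from sklearn.preprocessing import normalize
#     clusterer_cosine = KMeans(n_clusters=20, random_state=42)
#     labels = clusterer_cosine.fit_predict(normalize(X_pca))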
def plot_var(directory, *args, **kwargs):
    X, y = get_mnist(directory)
    scaler = StandardScaler().fit(X)
    pos = range(0, 28)
    meanX = []
    varX = []

    fig, axs = plt.subplots(1, 2, figsize=(5, 2),
                            gridspec_kw={'width_ratios': [1, 1.4]})

    # example image as inverted RGB
    example = np.zeros((28, 28, 3))
    example[..., 0] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # red
    example[..., 1] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # green
    example[..., 2] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # blue

    # highlight the diagonal and collect its mean and variance
    for idx in pos:
        example[idx, idx, :] = tud_colors['orange'][:-1]
        meanX.append(scaler.mean_[idx * 28 + idx])
        varX.append(scaler.var_[idx * 28 + idx])

    axs[0].imshow(example, interpolation='none')

    line_var, = axs[1].plot(pos, varX, color=tud_colors['orange'])
    ax_mean = axs[1].twinx()
    line_mean, = ax_mean.plot(pos, meanX, color=tud_colors['lightblue'])
    axs[1].legend((line_var, line_mean), (r'$\sigma^2$', r'$\mu$'),
                  bbox_to_anchor=(1.2, .5), loc="center left")
    # fig.suptitle('Feature distribution in MNIST picture')

    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].set_xlim([0, 27])
    axs[1].set_xlabel('position')
    axs[1].set_ylabel(r'$\sigma^2$', labelpad=-15, loc='top', rotation=0)
    y_ticks = [0, 2000, 4000, 6000, 8000, 10000, 12000]
    axs[1].set_yticks(y_ticks)
    axs[1].set_yticklabels(
        ['{0:0.0f}k'.format(y_tick / 1000) for y_tick in y_ticks])
    # axs[1].tick_params(axis='x', labelrotation=90)
    ax_mean.set_ylabel(r'$\mu$', labelpad=-5, loc='top', rotation=0)

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-pixel-variance.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-variance.pgf'),
                format='pgf')
    # plt.show()
    return
def elm_bip(directory):
    self_name = 'elm_bip'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    logger.info('{0} features remaining after preprocessing.'.format(
        X_preprocessed.shape[1]))

    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]

    # setup estimator
    estimator = ELMClassifier(input_to_node=BatchIntrinsicPlasticity(),
                              regressor=Ridge())

    # setup grid search; the single CV split separates the canonical
    # MNIST train and test partitions
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=[(np.arange(0, train_size),
                           np.arange(train_size, 70000))])

    # run!
    # NOTE: the original fits on the raw X; X_preprocessed is computed but
    # never used here, which may be unintended
    cv.fit(X, y_encoded)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']

    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
def plot_image_min_var(directory, *args, **kwargs):
    X, y = get_mnist(directory)
    scaler = StandardScaler().fit(X)
    image_size = (28, 28, 3)

    example = np.zeros(X[example_image_idx, ...].shape + (3, ))
    for rgb_idx in range(3):
        example[..., rgb_idx] = 1. - X[example_image_idx, :] / 255.

    # variance of a binary pixel (values 0 or 255) firing with probability p1
    p1_1 = 1 / 10 * 1 / 10
    p1_2 = 1 / 10
    var_p1_1 = 255**2 * p1_1 * (1 - p1_1)
    var_p1_2 = 255**2 * p1_2 * (1 - p1_2)

    example_min_var_p1_1 = np.copy(example)
    example_min_var_p1_1[scaler.var_ < var_p1_1, ...] = \
        tud_colors['orange'][:-1]
    example_min_var_p1_2 = np.copy(example)
    example_min_var_p1_2[scaler.var_ < var_p1_2, ...] = \
        tud_colors['orange'][:-1]

    fig, axs = plt.subplots(1, 3, figsize=(5, 2))

    axs[0].imshow(np.reshape(example, image_size), interpolation='none')
    axs[0].set_title('$p_1$=0\noriginal\n$n$={0:d}'.format(len(scaler.var_)))
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    # bug fix: the following titles were raw strings, so '\n' was rendered
    # literally instead of breaking the line
    axs[1].imshow(np.reshape(example_min_var_p1_1, image_size),
                  interpolation='none')
    axs[1].set_title(
        '$p_1$={1:0.2f}\n$\\sigma^2$ > {0:0.0f}\n$n$={2:d}'.format(
            var_p1_1, p1_1, np.sum(scaler.var_ > var_p1_1)))
    axs[1].set_xticks([0, 27])
    axs[1].set_xticklabels([0, 27])
    axs[1].set_yticks([0, 27])
    axs[1].set_yticklabels([0, 27])

    axs[2].imshow(np.reshape(example_min_var_p1_2, image_size),
                  interpolation='none')
    axs[2].set_title(
        '$p_1$={1:0.2f}\n$\\sigma^2$ > {0:0.0f}\n$n$={2:d}'.format(
            var_p1_2, p1_2, np.sum(scaler.var_ > var_p1_2)))
    axs[2].set_xticks([0, 27])
    axs[2].set_xticklabels([0, 27])
    axs[2].set_yticks([0, 27])
    axs[2].set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-img-min-var.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-img-min-var.pgf'),
                format='pgf')
    # plt.show()
    return
def plot_poster(directory, *args, **kwargs):
    X, y = get_mnist(directory)
    X /= 255.

    # scale to [0, 1] for imsave
    def scale01(X):
        return (X - np.min(X)) / (np.max(X) - np.min(X))

    # preprocessing
    pca = PCA(n_components=50).fit(X)
    clusterer = KMeans(n_clusters=20).fit(X[:10000, :])

    # save images
    # example
    plt.imsave(os.path.join(os.environ['IMGPATH'], 'example-mnist.png'),
               X[example_image_idx, :].reshape(28, 28),
               cmap=plt.cm.gray_r,
               format='png')

    # pca component and 50-component reconstruction of the example
    pca_component = scale01(pca.components_[2, :]).reshape(28, 28)
    pca_example = scale01(
        np.matmul(X[example_image_idx, :].reshape(1, -1),
                  np.matmul(pca.components_.T,
                            pca.components_))).reshape(28, 28)
    plt.imsave(os.path.join(os.environ['IMGPATH'], 'pca-component3.png'),
               pca_component, cmap=plt.cm.gray_r, format='png')
    plt.imsave(os.path.join(os.environ['IMGPATH'], 'pca50-mnist.png'),
               pca_example, cmap=plt.cm.gray_r, format='png')

    # kmeans centroids
    for idx in [0, 4, 9, 14, 19]:
        kmeans_centroid = scale01(
            clusterer.cluster_centers_[idx, ...]).reshape(28, 28)
        plt.imsave(os.path.join(os.environ['IMGPATH'],
                                'kmeans-centroid{0}.png'.format(idx)),
                   kmeans_centroid, cmap=plt.cm.gray_r, format='png')

    # input weights
    T = np.load(os.path.join(os.environ['DATAPATH'],
                             'pca50+kmeans200_matrix.npy'),
                allow_pickle=True)
    for idx in [0, 49, 99, 149, 199]:
        input_weight = scale01(T[:, idx]).reshape(28, 28)
        plt.imsave(os.path.join(os.environ['IMGPATH'],
                                'input-weight{0}.png'.format(idx)),
                   input_weight, cmap=plt.cm.gray_r, format='png')
def plot_imbalance(directory):
    self_name = 'plot_imbalance'
    logger = new_logger(self_name, directory)
    X, y = get_mnist(directory)
    logger.info('successfully fetched {0} datapoints'.format(X.shape[0]))

    tp_y_unique = np.unique(y.astype(int), return_counts=True)
    y_unique = tp_y_unique[0][np.argsort(tp_y_unique[0])]
    y_counts = tp_y_unique[1][np.argsort(tp_y_unique[0])]
    # y_hist_arr = np.array(y_hist, dtype=float)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6., 2.1))

    # the bar plot itself was evidently lost from the original source (the
    # white percentage labels below are drawn on top of it); the color is
    # an assumption
    ax.bar(y_unique, y_counts, color=tud_colors['lightblue'])

    for idx in range(y_counts.size):
        plt.text(idx * 1.,
                 3500,
                 '{0:.1f}%'.format(y_counts[idx] / np.sum(y_counts) * 100),
                 color=(1., 1., 1., .2),
                 fontsize='small',
                 horizontalalignment='center')
    # w = bar.get_with()
    # plt.text(bar.get_x() - .04, bar.get_y() + .1, '{0:.1f}%'.format())

    ax.set_xlim([-.5, 9.5])
    ax.set_xticks(y_unique)
    ax.set_xticklabels(['{0:.0f}'.format(idx) for idx in y_unique])
    ax.set_xlabel('label')

    ax.set_ylim([0, 8000])
    ax.set_yticks([7000], minor=True)
    ax.grid(which='minor', axis='y', alpha=.7, linestyle='--',
            color=tud_colors['lightgreen'])
    ax.set_ylabel(r'\#occurrences')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # ax.spines['bottom'].set_visible(False)
    ax.tick_params(axis='x', which='both', bottom=False, top=False)
    ax.legend(bbox_to_anchor=(1, .5), loc='center left')

    fig.tight_layout()
    # fig.patch.set_visible(False)
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             '{0}.pgf'.format(self_name)),
                format='pgf')
    fig.savefig(os.path.join(directory, '{0}.pdf'.format(self_name)),
                format='pdf')
    return
def main(out_path=os.path.join(os.getcwd(), 'preprocessing-mnist'),
         function_name='labels'):
    if not os.path.exists(out_path):
        try:
            os.makedirs(out_path)
        except OSError as error:
            print(error)

    # quick and dirty
    # directory = os.path.join(os.getcwd(), 'preprocessing-mnist')
    directory = out_path
    logger = new_logger('main')
    logger.info('{0} called, entering main'.format(__file__))

    runtime = [time.time()]

    # fetch data
    X, y = get_mnist()
    runtime.append(time.time())
    logger.info('fetch: {0} s'.format(np.diff(runtime[-2:])))
    logger.info('X.shape = {0}, y.shape = {1}'.format(X.shape, y.shape))

    function_dict = {
        'labels': plot_labels,
        'plot_pooling': plot_pooling,
        'plot_poster': plot_poster,
        'histogram': plot_histogram,
        'var': plot_var,
        'normalized': plot_normalized,
        'variance_mean': plot_variance_mean,
        'image_min_var': plot_image_min_var,
        'plot_pca': plot_pca,
        'plot_covariance': plot_covariance,
        'plot_imbalance': plot_imbalance,
        'plot_img_cluster': plot_img_cluster,
    }

    if function_name in function_dict:
        function_dict[function_name](directory)
    else:
        logger.warning('no function {0} found'.format(function_name))
    logger.info('{0} finished, return from main'.format(__file__))
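# Minimal sketch of a command-line entry point (an assumption -- the original
# invocation is not shown and the argument names are illustrative). Kept as a
# comment because several functions main() dispatches to are defined further
# down in this module:
#
#     if __name__ == '__main__':
#         import argparse
#         parser = argparse.ArgumentParser(
#             description='MNIST preprocessing plots')
#         parser.add_argument('--out', default=os.path.join(
#             os.getcwd(), 'preprocessing-mnist'))
#         parser.add_argument('--function', default='labels')
#         args = parser.parse_args()
#         main(out_path=args.out, function_name=args.function)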
def plot_normalized(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    # compress the value range to simulate a low-contrast image
    X = X / 4 + 100

    X_picture_normalization = StandardScaler().fit_transform(X.T).T
    X_feature_normalization = StandardScaler().fit_transform(X)

    fig, axs = plt.subplots(1, 3, figsize=(5, 2))

    img_idx = example_image_idx

    axs[0].imshow(np.resize(X[img_idx, :], (28, 28)).astype(int),
                  interpolation='none',
                  cmap=plt.cm.gray_r,
                  norm=Normalize(vmin=0, vmax=255, clip=True))
    axs[0].set_title('low contrast')
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].imshow(np.resize(X_picture_normalization[img_idx, :], (28, 28)),
                  interpolation='none', cmap=plt.cm.gray_r)
    axs[1].set_title('picture\nnormalization')
    axs[1].set_xticks([0, 27])
    axs[1].set_xticklabels([0, 27])
    axs[1].set_yticks([0, 27])
    axs[1].set_yticklabels([0, 27])

    axs[2].imshow(np.resize(X_feature_normalization[img_idx, :], (28, 28)),
                  interpolation='none', cmap=plt.cm.gray_r)
    axs[2].set_title('feature\nnormalization')
    axs[2].set_xticks([0, 27])
    axs[2].set_xticklabels([0, 27])
    axs[2].set_yticks([0, 27])
    axs[2].set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-normalized.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-normalized.pgf'),
                format='pgf')
    # plt.show()
    return
def plot_pooling(directory, *args, **kwargs):
    X, y = get_mnist(directory)
    img = X[example_image_idx, :].reshape((28, 28))
    kernel_size = (2, 1)
    img_pooled = np.zeros((int(np.ceil(img.shape[0] / kernel_size[0])),
                           int(np.ceil(img.shape[1] / kernel_size[1]))))
    # loop variables renamed from (x, y) to avoid shadowing the label array y
    for row in range(img_pooled.shape[0]):
        for col in range(img_pooled.shape[1]):
            x_min = row * kernel_size[0]
            x_max = x_min + kernel_size[0]
            y_min = col * kernel_size[1]
            y_max = y_min + kernel_size[1]
            img_pooled[row, col] = np.max(img[x_min:x_max, y_min:y_max])

    plt.imsave(os.path.join(
        directory,
        'pooled_max_kernel{0}x{1}.png'.format(kernel_size[0],
                                              kernel_size[1])),
               img_pooled,
               cmap=plt.cm.gray_r)
    return
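# Aside (illustrative, not from the original source): for kernel sizes that
# divide the image shape evenly, the same max pooling needs no Python loops;
# e.g. for kernel_size == (2, 1) on a (28, 28) image:
#
#     img_pooled = img.reshape(14, 2, 28, 1).max(axis=(1, 3))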
def plot_covariance(directory, *args, **kwargs):
    X, y = get_mnist(directory)
    cov = np.cov((X - np.mean(X, axis=0)).T)
    cov_w, cov_v = np.linalg.eigh(cov)
    # cov_pca_comp = cov_v.T
    n_components = 784

    # covariance after projecting on the eigenvectors; flipped so the
    # components with the largest eigenvalues come first
    cov_PCA_alternative = np.flip(np.cov(np.matmul(cov_v.T, X.T)),
                                  axis=(0, 1))
    cov_v_ordered = np.flip(cov_v, axis=(0, 1))

    # plt.imsave(os.path.join(directory, 'mnist-cov-pca-alt-db.png'),
    #            20 * np.log10(np.abs(cov_PCA_alternative) + 1.),
    #            cmap=plt.cm.gray_r)
    # pca = PCA().fit(X)
    cov_PCA = cov_PCA_alternative

    fig, axs = plt.subplots(1, 3, figsize=(6, 2.5))
    if isinstance(axs, plt.Axes):
        axs = [axs]

    axs[0].imshow(np.resize(cov, (X.shape[1], X.shape[1])),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[0].set_title('covariance')
    axs[0].set_xticks(np.arange(start=0, stop=785, step=28))
    axs[0].set_xticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))
    axs[0].set_yticks(np.arange(start=0, stop=785, step=28))
    axs[0].set_yticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))

    axs[1].imshow(np.resize(20 * np.log10(np.abs(cov_PCA) + 1.),
                            (n_components, n_components)),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[1].set_title('after PCA ({0})'.format(n_components))
    axs[1].set_xticks(
        np.append(np.arange(start=0, stop=n_components, step=28),
                  n_components))
    axs[1].set_xticklabels(
        np.append([
            '{0:.0f}'.format(x) if x == 0 else ''
            for x in np.arange(start=0, stop=n_components, step=28)
        ], '{0}'.format(n_components)))
    axs[1].set_yticks(
        np.append(np.arange(start=0, stop=n_components, step=28),
                  n_components))
    axs[1].set_yticklabels(
        np.append([
            '{0:.0f}'.format(x) if x == 0 else ''
            for x in np.arange(start=0, stop=n_components, step=28)
        ], '{0}'.format(n_components)))

    axs[2].imshow(20 * np.log10(np.abs(cov_v_ordered.T) + 1.),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[2].set_title('PCA components')
    axs[2].set_xticks(np.arange(start=0, stop=785, step=28))
    axs[2].set_xticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))
    axs[2].set_yticks(np.arange(start=0, stop=785, step=28))
    axs[2].set_yticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))

    def scale(A):
        return (A - np.min(A)) / (np.max(A) - np.min(A))

    for idx in [0, 1, 2, 3, 4, 5, 20, 50, 100, 200, 400, 600, 701, 783]:
        filepath = os.path.join(
            directory, '{0}{1}.png'.format('mnist-covariance-eig', idx))
        plt.imsave(filepath,
                   cov_v_ordered.T[idx, ...].reshape(28, 28),
                   cmap=plt.cm.gray_r)

    fig.tight_layout()
    # fig.show()
    fig.savefig(os.path.join(directory, 'mnist-covariance.pdf'),
                format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-covariance.pgf'),
                format='pgf')
    plt.imsave(os.path.join(directory, 'mnist-covariance.png'),
               np.resize(cov, (X.shape[1], X.shape[1])), cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-covariance-pca.png'),
               np.resize(cov_PCA, (n_components, n_components)),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-pca-components.png'),
               cov_v_ordered.T, cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-covariance-db.png'),
               np.resize(20 * np.log10(np.abs(cov) + 1.),
                         (X.shape[1], X.shape[1])),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-covariance-pca-db.png'),
               np.resize(20 * np.log10(np.abs(cov_PCA) + 1.),
                         (n_components, n_components)),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-pca-components-db.png'),
               20 * np.log10(np.abs(cov_v_ordered.T) + 1.),
               cmap=plt.cm.gray_r, vmax=.5)
    return
def silhouette_features(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_features', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    X /= 255.
    X = X[:10000, ...]
    scaler = StandardScaler().fit(X)
    pca = PCA(whiten=False, random_state=42).fit(X)
    X_pca = pca.transform(X)

    # feature indices sorted by descending variance
    variance_indices = np.argsort(scaler.var_)[::-1]

    # duplicate entry 5 removed from the original list
    n_features_list = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70,
        80, 90, 100, 200, 300, 400, 500, 600, 700, 784
    ]

    rs = np.random.RandomState(42)
    k = 20
    dict_results = {
        'nfeatures': [],
        'fittime_random': [],
        'fittime_maxvar': [],
        'fittime_pca': [],
        'silhouette_random': [],
        'silhouette_maxvar': [],
        'silhouette_pca': [],
        'explainvar_random': [],
        'explainvar_maxvar': [],
        'explainvar_pca': [],
        'explvarrat_random': [],
        'explvarrat_maxvar': [],
        'explvarrat_pca': [],
        'n_clusters': [],
    }

    for n_features in n_features_list:
        clusterer = KMeans(n_clusters=k, random_state=42)
        dict_results['nfeatures'].append(n_features)
        dict_results['n_clusters'].append(clusterer.n_clusters)

        # random feature subset; bug fix: sample without replacement so the
        # subset really contains n_features distinct features
        indices = rs.choice(X.shape[1], size=n_features, replace=False)
        t = time.time()
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_random'].append(time.time() - t)
        dict_results['silhouette_random'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_random'].append(
            np.sum(scaler.var_[indices]))
        dict_results['explvarrat_random'].append(
            np.sum(scaler.var_[indices]) / np.sum(scaler.var_))

        # features with the largest variance
        t = time.time()
        indices = variance_indices[:n_features]
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_maxvar'].append(time.time() - t)
        dict_results['silhouette_maxvar'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_maxvar'].append(
            np.sum(scaler.var_[indices]))
        dict_results['explvarrat_maxvar'].append(
            np.sum(scaler.var_[indices]) / np.sum(scaler.var_))

        # first n_features principal components
        t = time.time()
        pred = clusterer.fit_predict(X_pca[:, :n_features])
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['silhouette_pca'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_pca'].append(
            np.sum(pca.explained_variance_[:n_features]))
        dict_results['explvarrat_pca'].append(
            np.sum(pca.explained_variance_ratio_[:n_features]))

        logger.info('pca silhouette at n_features={1:.0f}: {0}'.format(
            dict_results['silhouette_pca'][-1], n_features))

    # save results to csv
    with open(
            os.path.join(directory,
                         'silhouette_kmeans{0:.0f}_features.csv'.format(k)),
            'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def plot_pca(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    fig, axs = plt.subplots(1, 5,
                            figsize=(6, 1.5),
                            gridspec_kw={
                                'wspace': 0.45,
                                'left': .05,
                                'right': .95,
                                'bottom': .0,
                                'top': .90
                            })

    # automatic pca
    decomposer = PCA(whiten=False).fit(X)

    # original
    axs[0].imshow(np.resize(X[example_image_idx, ...], (28, 28)),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[0].set_title('original')

    # mean
    axs[1].imshow(np.resize(decomposer.mean_, (28, 28)),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[1].set_title('average: {0}'.format(100))

    # pca 50, average free
    X_avgfree = X - np.mean(X, axis=0)
    M_pca = decomposer.components_[:50, :].T
    M = np.dot(M_pca, M_pca.T)  # transformation and inverse combined

    axs[2].imshow(np.resize(np.dot(X_avgfree[example_image_idx, ...], M),
                            (28, 28)),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[2].set_title('n={0}\naverage free'.format(M_pca.shape[1]))

    # pca 50, not average free
    axs[3].imshow(np.resize(np.dot(X[example_image_idx, ...], M), (28, 28)),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[3].set_title('n={0}\nwith average'.format(M_pca.shape[1]))

    # pca 25, not average free
    M_pca = decomposer.components_[:25, :].T
    M = np.dot(M_pca, M_pca.T)  # transformation and inverse combined

    axs[4].imshow(np.resize(np.dot(X[example_image_idx, ...], M), (28, 28)),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[4].set_title('n={0}\nwith average'.format(M_pca.shape[1]))

    for idx in range(5):
        axs[idx].set_xticks([0, 27])
        axs[idx].set_xticklabels([0, 27])
        axs[idx].set_yticks([0, 27])
        axs[idx].set_yticklabels([0, 27])

    # fig.tight_layout()
    # fig.show()
    fig.savefig(os.path.join(directory, 'mnist-pca-effects.pdf'),
                format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-pca-effects.pgf'),
                format='pgf')
    return
def elm_pca(directory):
    self_name = 'elm_pca'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # scale X
    X /= 255.

    # split train test
    X_train, X_test, y_train, y_test = train_test_split(X[:train_size],
                                                        y[:train_size],
                                                        train_size=50000,
                                                        random_state=42)

    # prepare parameter grid
    param_grid_basic = {
        'hidden_layer_size': 2000,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'input_activation': 'relu',
        'alpha': 1e-5,
        'random_state': 42
    }

    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())

    # initialize filepath
    filepath = os.path.join(directory, '{0}_basic.csv'.format(self_name))

    # initialize param dict
    param_dict_job = estimator.get_params().copy()
    param_dict_job.update(param_grid_basic)

    # initialize results dict
    results_dict_job = param_dict_job.copy()
    # add dummy results
    results_dict_job.update({
        'time_fit': 0,
        'time_pred': 0,
        'score': 0,
        'pca_n_components': 0
    })

    # preprocessing pca
    try:
        # write header
        with open(filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for pca_n_components in [10, 20, 50, 100, 200, 500, 784]:
            results_dict_job.update({'pca_n_components': pca_n_components})
            estimator.set_params(**param_dict_job)

            # preprocessing
            pca = PCA(n_components=pca_n_components).fit(X_train)
            X_train_pca, X_test_pca = \
                pca.transform(X_train), pca.transform(X_test)

            # run!
            time_start = time.time()
            estimator.fit(X_train_pca, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test_pca)
            time_pred = time.time()
            # run end!

            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })

            logger.info('pca.n_components_: {0}, score: {1}'.format(
                pca_n_components, results_dict_job['score']))

            with open(filepath, 'a') as f:
                writer = csv.DictWriter(f,
                                        fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
    except Exception as e:
        logger.error('Unexpected exception: {0}'.format(e))
def elm_preprocessed(directory):
    self_name = 'elm_preprocessed'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    logger.info('{0} features remaining after preprocessing.'.format(
        X_preprocessed.shape[1]))

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_preprocessed, y_encoded, train_size=train_size, random_state=42)

    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 2000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'bias_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }, {
        'hidden_layer_size': [2000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'bias_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]

    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())

    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=StratifiedShuffleSplit(n_splits=1,
                                                test_size=1 / 7,
                                                random_state=42))

    # run!
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']

    # save results
    try:
        with open(os.path.join(directory, 'elm_preprocessed.csv'),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
def silhouette_n_clusters(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_n_clusters', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # fit the scaler before rescaling: min_var below refers to the
    # original [0, 255] value range
    scaler = StandardScaler().fit(X)
    X /= 255.
    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)
    min_var = 3088.6875

    # reduce train size
    # X = X[:10000, ...]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y_encoded,
                                                        train_size=10000,
                                                        random_state=42)

    # variance threshold
    X_var_threshold = X_train[..., scaler.var_ > min_var]

    # pca
    X_pca = pca.transform(X_train)

    # n_clusters
    k = [
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 30,
        40, 50, 60, 70, 80, 90, 100, 200, 500, 1000, 2000, 4000
    ]

    # n_init
    n_init = 10

    dict_results = {
        'n_clusters': [],
        'n_init': [],
        'variance_threshold': [],
        'pca_n_components': [],
        'pca_explained_variance': [],
        'pca_explained_variance_ratio': [],
        'silhouette_original': [],
        'silhouette_variance_threshold': [],
        'silhouette_pca': [],
        'fittime_original': [],
        'fittime_variance_threshold': [],
        'fittime_pca': [],
        'inertia_original': [],
        'inertia_variance_threshold': [],
        'inertia_pca': [],
        'n_iter_original': [],
        'n_iter_variance_threshold': [],
        'n_iter_pca': []
    }

    for n_clusters in k:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['n_init'].append(n_init)
        dict_results['variance_threshold'].append(min_var)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_explained_variance'].append(
            np.sum(pca.explained_variance_))
        dict_results['pca_explained_variance_ratio'].append(
            np.sum(pca.explained_variance_ratio_))

        clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                    init='k-means++',
                                    n_init=n_init,
                                    random_state=42)

        # original
        t = time.time()
        clusterer.fit(X_train)
        dict_results['fittime_original'].append(time.time() - t)
        dict_results['inertia_original'].append(clusterer.inertia_)
        dict_results['n_iter_original'].append(clusterer.n_iter_)
        dict_results['silhouette_original'].append(
            silhouette_score(X_train,
                             clusterer.predict(X_train),
                             metric='euclidean',
                             random_state=42))
        np.save('./cluster_critical.npy', clusterer.cluster_centers_)

        # var threshold
        t = time.time()
        clusterer.fit(X_var_threshold)
        dict_results['fittime_variance_threshold'].append(time.time() - t)
        dict_results['inertia_variance_threshold'].append(
            clusterer.inertia_)
        dict_results['n_iter_variance_threshold'].append(clusterer.n_iter_)
        dict_results['silhouette_variance_threshold'].append(
            silhouette_score(X_train,
                             clusterer.predict(X_var_threshold),
                             metric='euclidean',
                             random_state=42))

        # pca
        t = time.time()
        clusterer.fit(X_pca)
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['inertia_pca'].append(clusterer.inertia_)
        dict_results['n_iter_pca'].append(clusterer.n_iter_)
        dict_results['silhouette_pca'].append(
            silhouette_score(X_train,
                             clusterer.predict(X_pca),
                             metric='euclidean',
                             random_state=42))

        logger.info('n_clusters = {0}, pca kmeans score: {1}'.format(
            n_clusters, dict_results['silhouette_pca'][-1]))
        logger.info('n_clusters = {0}'.format(n_clusters))

    # save results to csv
    with open(os.path.join(directory, 'silhouette_n_clusters.csv'),
              'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def plot_histogram(directory, *args, **kwargs):
    # renamed from plot_historgram (typo); the dispatch dict in main() above
    # references the corrected name
    logger = new_logger('plot_histogram', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    fig, axs = plt.subplots(1, 2, figsize=(5, 2),
                            gridspec_kw={'width_ratios': [1, 1.7]})

    # example image as inverted RGB
    example = np.zeros((28, 28, 3))
    example[..., 0] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # red
    example[..., 1] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # green
    example[..., 2] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # blue

    # highlight one fringe and one center pixel
    idx_fringe = (25, 17)
    idx_center = (13, 12)
    example[idx_center[0], idx_center[1], :] = tud_colors['lightblue'][:-1]
    example[idx_fringe[0], idx_fringe[1], :] = tud_colors['orange'][:-1]

    bins = np.array(range(0, 287, 32)).astype(int)
    hist_fringe, bin_edges = np.histogram(
        X[:, idx_fringe[0] * 28 + idx_fringe[1]], bins=bins)
    hist_center, bin_edges = np.histogram(
        X[:, idx_center[0] * 28 + idx_center[1]], bins=bins)

    logger.info(
        'validation sum hist_fringe: {0}, sum hist_center: {1}'.format(
            np.sum(hist_fringe / 1000), np.sum(hist_center / 1000)))

    axs[0].imshow(example, interpolation='none')
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].bar(bins[1:] - 32, height=hist_fringe / 1000, width=16,
               color=tud_colors['orange'], label='fringe', align='edge')
    axs[1].bar(bins[1:] - 16, height=hist_center / 1000, width=16,
               color=tud_colors['lightblue'], label='center', align='edge')
    axs[1].tick_params(axis='x', labelrotation=90)
    # axs[1].hist([], bins=range(0, 255, 32), color=[tud_colors['orange'],
    #             tud_colors['lightblue']],
    #             align='left')
    axs[1].set_xticks(bins)
    # axs[1].legend(bbox_to_anchor=(0, 1, 1, 0), loc="lower left",
    #               mode="expand", ncol=2)
    axs[1].legend(bbox_to_anchor=(1.0, .5), loc="center left")
    # fig.suptitle('Feature distribution in MNIST picture')

    axs[1].set_xlabel('value bins')
    axs[1].set_ylabel('probability')

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-pixel-histogram.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-histogram.pgf'),
                format='pgf')
    # plt.show()
    return
def elm_hidden_layer_size(directory):
    self_name = 'elm_hidden_layer_size'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # encode y
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X
    X /= 255.

    # split train test
    X_train, X_test, y_train, y_test = (X[:train_size, :],
                                        X[train_size:, :],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])

    # fan-out from paper
    fan_out = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]

    # prepare parameter grids
    param_grid_basic = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }
    param_grid_pca = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }

    # setup estimator
    estimator = ELMClassifier()

    # basic
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory,
                                    '{0}_basic.csv'.format(self_name))

        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_basic)

        # initialize results dict
        results_dict_job = param_dict_job.copy()
        # add dummy results
        results_dict_job.update({'time_fit': 0, 'time_pred': 0, 'score': 0})

        # write header
        with open(csv_filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for hls in 784 * np.array(fan_out):
            param_dict_job.update({'hidden_layer_size': hls})
            estimator.set_params(**param_dict_job)

            # run!
            time_start = time.time()
            estimator.fit(X_train, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test)
            time_pred = time.time()
            # run end!

            results_dict_job.update(estimator.get_params())
            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })

            logger.info('hidden_layer_size: {0}, score: {1}'.format(
                hls, results_dict_job['score']))

            with open(csv_filepath, 'a') as f:
                writer = csv.DictWriter(f,
                                        fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)

            # free the hidden layer state before pickling the estimator
            del estimator.input_to_node._hidden_layer_state

            with open(
                    os.path.join(directory,
                                 'elmc_hls{0}_basic.pickle'.format(hls)),
                    'wb') as f:
                pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))

    # preprocessing pca
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory,
                                    '{0}_pca.csv'.format(self_name))

        # preprocessing
        pca50 = PCA(n_components=50).fit(X_train)
        X_train_pca50, X_test_pca50 = (pca50.transform(X_train),
                                       pca50.transform(X_test))
        pca100 = PCA(n_components=100).fit(X_train)
        X_train_pca100, X_test_pca100 = (pca100.transform(X_train),
                                         pca100.transform(X_test))
        list_dict_pca = [{
            'n_components': 50,
            'X_train': X_train_pca50,
            'X_test': X_test_pca50
        }, {
            'n_components': 100,
            'X_train': X_train_pca100,
            'X_test': X_test_pca100
        }]
        logger.info('Preprocessing successful!')

        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_pca)

        # initialize results dict
        results_dict_job = param_dict_job.copy()
        # add dummy results
        results_dict_job.update({
            'time_fit': 0,
            'time_pred': 0,
            'score': 0,
            'pca_n_components': 0
        })

        # write header
        with open(csv_filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for dict_pca in list_dict_pca:
            results_dict_job.update(
                {'pca_n_components': dict_pca['n_components']})
            for hls in np.concatenate(
                    (100 * np.array(fan_out), 784 * np.array(fan_out)),
                    axis=0):
                param_dict_job.update({'hidden_layer_size': hls})
                estimator.set_params(**param_dict_job)

                # run!
                time_start = time.time()
                estimator.fit(dict_pca['X_train'], y_train)
                time_fit = time.time()
                y_pred = estimator.predict(dict_pca['X_test'])
                time_pred = time.time()
                # run end!

                results_dict_job.update(estimator.get_params())
                results_dict_job.update({
                    'time_fit': time_fit - time_start,
                    'time_pred': time_pred - time_fit,
                    'score': accuracy_score(y_test, y_pred)
                })

                logger.info(
                    'n_components: {2}, hidden_layer_size: {0}, score:'
                    ' {1}'.format(hls, results_dict_job['score'],
                                  results_dict_job['pca_n_components']))

                with open(csv_filepath, 'a') as f:
                    writer = csv.DictWriter(
                        f, fieldnames=results_dict_job.keys())
                    writer.writerow(results_dict_job)

                with open(
                        os.path.join(
                            directory, 'elmc_hls{0}_pca{1}.pickle'.format(
                                hls,
                                results_dict_job['pca_n_components'])),
                        'wb') as f:
                    pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
def silhouette_subset(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_subset', directory)
    logger.info('entering')
    X, y = get_mnist(directory)
    X /= 255.
    pca = PCA(n_components=50, whiten=False, random_state=42)

    # preprocessing
    X_pca = pca.fit_transform(X)

    # define subset sizes
    subset_sizes = [250, 500, 1000, 2000, 4000, 8000, 16000, 32000, 60000]

    # number of centroids
    k_list = [20]

    dict_results = {
        'subset_size': [],
        'k': [],
        'n_init': [],
        'silhouette_raninit': [],
        'silhouette_preinit': [],
        'fittime_raninit': [],
        'fittime_preinit': [],
        'scoretime_raninit': [],
        'scoretime_preinit': []
    }

    for k in k_list:
        # preinit: centroids trained on the smallest subset
        X_train, X_test, y_train, y_test = train_test_split(
            X_pca,
            y,
            random_state=42,
            train_size=subset_sizes[0],
            shuffle=True,
            stratify=y)
        clusterer_init = KMeans(n_clusters=k,
                                random_state=42,
                                init='k-means++',
                                n_init=10).fit(X_train)

        # random inits
        clusterer = KMeans(n_clusters=k, n_init=10, random_state=42)

        for subset_size in subset_sizes:
            # split on subset size
            dict_results['subset_size'].append(subset_size)
            X_train, X_test, y_train, y_test = train_test_split(
                X_pca,
                y,
                random_state=42,
                train_size=subset_size,
                shuffle=True,
                stratify=y)

            # train preinit
            t = time.time()
            clusterer_init = KMeans(n_clusters=k,
                                    random_state=42,
                                    n_init=1,
                                    init=clusterer_init.cluster_centers_)
            clusterer_init.fit_predict(X_train)
            dict_results['fittime_preinit'].append(time.time() - t)

            # score preinit
            t = time.time()
            dict_results['silhouette_preinit'].append(
                silhouette_score(X_train,
                                 clusterer_init.predict(X_train),
                                 metric='euclidean',
                                 random_state=42))
            dict_results['scoretime_preinit'].append(time.time() - t)

            # train raninit
            t = time.time()
            clusterer.fit(X_train)
            dict_results['fittime_raninit'].append(time.time() - t)

            # score raninit
            t = time.time()
            dict_results['silhouette_raninit'].append(
                silhouette_score(X_train,
                                 clusterer.predict(X_train),
                                 metric='euclidean',
                                 random_state=42))
            dict_results['scoretime_raninit'].append(time.time() - t)

            # store results
            dict_results['k'].append(k)
            dict_results['n_init'].append(clusterer.n_init)
            logger.info(
                'silhouette (preinit) at subset size {1}: {0}'.format(
                    dict_results['silhouette_preinit'][-1],
                    dict_results['subset_size'][-1]))

    # save results to csv
    with open(
            os.path.join(directory, 'silhouette_kmeans_subset_size.csv'),
            'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
def elm_coates(directory):
    self_name = 'elm_coates'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)
    filepath_label_encoder = os.path.join(
        directory, 'label_encoder_{0}.pickle'.format(self_name))

    # save label_encoder
    try:
        with open(filepath_label_encoder, 'wb') as f:
            pickle.dump(label_encoder, f)
    except Exception as e:
        logger.error('Unexpected error: {0}'.format(e))
        exit(1)

    # scale X so X in [0, 1]
    X /= 255.
    X_train, X_test, y_train, y_test = (X[:train_size, ...],
                                        X[train_size:],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])

    csv_filepath = os.path.join(directory, '{0}.csv'.format(self_name))

    # read input matrices from files
    list_filepaths = []
    for filepath in glob.glob(
            os.path.join(directory, '*pca*+kmeans*_matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        filename = os.path.splitext(os.path.basename(filepath))[0]
        est_filepath = os.path.join(
            directory, 'est_coates-{0}.pickle'.format(filename))
        pred_filepath = os.path.join(
            directory, 'est_coates-{0}-predicted.npz'.format(filename))

        # only if files do not exist yet
        if (not os.path.isfile(csv_filepath)
                or not os.path.isfile(est_filepath)
                or not os.path.isfile(pred_filepath)):
            # setup estimator
            estimator = ELMClassifier(
                input_to_node=PredefinedWeightsInputToNode(
                    predefined_input_weights=np.load(filepath),
                    input_scaling=1.0,
                    bias_scaling=0.0,
                    input_activation='relu',
                    random_state=42),
                chunk_size=1000)
            logger.info('Estimator params: {0}'.format(
                estimator.get_params().keys()))

            # !run
            time_start = time.time()
            estimator.fit(X_train, y_train)
            time_fitted = time.time()
            y_pred = estimator.predict(X_test)
            time_predicted = time.time()
            # !run

            # results
            dict_results = estimator.get_params()
            dict_results.update({
                'filename': filename,
                'fit_time': time_fitted - time_start,
                'score_time': time_predicted - time_fitted,
                'score': accuracy_score(y_test, y_pred)
            })

            # drop data; pop with a default avoids a KeyError on library
            # versions where these parameters are named differently
            dict_results.pop('input_to_nodes__predefined_input_weights',
                             None)
            dict_results.pop('input_to_nodes', None)
            dict_results.pop('regressor', None)

            logger.info('fitted time {1}, score on test set: {0}'.format(
                dict_results['score'], dict_results['fit_time']))

            # save estimator
            try:
                with open(est_filepath, 'wb') as f:
                    pickle.dump(estimator, f)
            except Exception as e:
                logger.error('Unexpected error: {0}'.format(e))
                exit(1)

            # save results
            try:
                if not os.path.isfile(csv_filepath):
                    with open(csv_filepath, 'a') as f:
                        f.write(','.join(dict_results.keys()))
                        f.write('\n')
                        f.write(','.join(
                            [str(item)
                             for item in dict_results.values()]))
                        f.write('\n')
                else:
                    with open(csv_filepath, 'a') as f:
                        f.write(','.join(
                            [str(item)
                             for item in dict_results.values()]))
                        f.write('\n')
            except PermissionError as e:
                print('Missing privileges: {0}'.format(e))

            # save prediction
            np.savez_compressed(
                pred_filepath,
                X_test=X_test,
                y_test=label_encoder.inverse_transform(y_test),
                y_pred=label_encoder.inverse_transform(y_pred))

    if not list_filepaths:
        logger.warning('no input weights matrices found')
    return
def test_get_mnist() -> None:
    X, y = get_mnist(os.getcwd())
    assert X.shape[0] == 70000
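# Usage note (assuming the project is tested with pytest):
#     pytest -k test_get_mnist
# The first run may be slow, since get_mnist fetches all 70000 samples.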
def elm_hyperparameters(directory):
    self_name = 'elm_hyperparameters'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    X = X / 255.
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y_encoded, train_size=train_size, random_state=42,
    #     shuffle=True)
    X_train, _, y_train, _ = (X[:train_size, :], X[train_size:, :],
                              y_encoded[:train_size],
                              y_encoded[train_size:])

    # stage 1: input and bias scaling
    param_grid = {
        'hidden_layer_size': [2000],
        'input_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
        'bias_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
        'input_activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    estimator = ELMClassifier(regressor=Ridge())
    cv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    cv_results = cv.cv_results_
    del cv_results['params']
    with open(
            os.path.join(directory, '{0}_scaling.csv'.format(self_name)),
            'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')

    # stage 2: hidden layer size and activation function
    param_grid = {
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'input_scaling': [cv.best_params_['input_scaling']],
        'bias_scaling': [cv.best_params_['bias_scaling']],
        'input_activation':
        ['tanh', 'relu', 'bounded_relu', 'logistic', 'identity'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    cv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_size.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')

    # stage 3: regularization
    param_grid = {
        'hidden_layer_size': [cv.best_params_['hidden_layer_size']],
        'input_scaling': [cv.best_params_['input_scaling']],
        'bias_scaling': [cv.best_params_['bias_scaling']],
        'input_activation': [cv.best_params_['input_activation']],
        'alpha': [.00001, .001, .1],
        'random_state': [42]
    }

    cv = GridSearchCV(estimator, param_grid, cv=5, n_jobs=1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_alpha.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
def train_kmeans(directory):
    self_name = 'train_kmeans'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # scale X, so $X \in [0, 1]$
    X /= 255.

    list_n_components = [50]  # [50, 100]
    # [20, 50, 100, 200, 500, 1000, 2000, 4000, 8000, 16000]
    list_n_clusters = [200]

    for n_components in list_n_components:
        pca = PCA(n_components=n_components, random_state=42).fit(X)
        X_pca = pca.transform(X)
        logger.info('pca{0}: explained variance ratio = {1}'.format(
            n_components, np.sum(pca.explained_variance_ratio_)))
        for n_clusters in list_n_clusters:
            # minibatch kmeans
            kmeans_basename = 'minibatch-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)

            # only if file does not exist yet
            if not os.path.isfile(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename))):
                clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                            init='k-means++',
                                            random_state=42,
                                            batch_size=5000,
                                            n_init=5).fit(X_pca)
                np.save(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename)),
                    np.dot(pca.components_.T,
                           clusterer.cluster_centers_.T))

                # assemble pipeline
                p = make_pipeline(pca, clusterer)
                with open(
                        os.path.join(
                            directory,
                            '{0}_pipeline.pickle'.format(kmeans_basename)),
                        'wb') as f:
                    pickle.dump(p, f)
                logger.info('successfully trained MiniBatchKMeans '
                            'and saved to npy/pickle {0}'.format(
                                kmeans_basename))

            # original kmeans
            kmeans_basename = 'original-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)

            if n_clusters < 2000 and not os.path.isfile(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename))):
                clusterer = KMeans(n_clusters=n_clusters,
                                   init='k-means++',
                                   random_state=42,
                                   n_init=5).fit(X_pca)
                np.save(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename)),
                    np.dot(pca.components_.T,
                           clusterer.cluster_centers_.T))

                # assemble pipeline
                p = make_pipeline(pca, clusterer)
                with open(
                        os.path.join(
                            directory,
                            '{0}_pipeline.pickle'.format(kmeans_basename)),
                        'wb') as f:
                    pickle.dump(p, f)
                logger.info('successfully trained KMeans and saved to '
                            'npy/pickle {0}'.format(kmeans_basename))
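# Illustrative usage (an assumption, not from the original source): the
# artifacts written above can be reloaded for the Coates-style ELM runs,
# e.g. for n_components=50 and n_clusters=200:
#
#     with open('original-pca50+kmeans200_pipeline.pickle', 'rb') as f:
#         pipeline = pickle.load(f)  # PCA + KMeans pipeline
#     W_in = np.load('original-pca50+kmeans200_matrix.npy')  # (784, 200)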
def main():
    # entry point of the standalone kmeans/cosine-similarity plot script
    runtime = [time.time()]
    X, y = get_mnist()
    runtime.append(time.time())
    print('fetch: {0} s'.format(np.diff(runtime[-2:])))

    scaler = StandardScaler()
    whitener = PCA(50, random_state=42)
    X /= 255.
    # X, y = X[:1000, :], y[:1000]
    X_preprocessed = whitener.fit_transform(X)
    runtime.append(time.time())
    print('preprocessing: {0} s'.format(np.diff(runtime[-2:])))

    cls = KMeans(n_clusters=10, random_state=42).fit(X_preprocessed)
    runtime.append(time.time())
    print('clustering: {0} s'.format(np.diff(runtime[-2:])))

    samples, values = get_unique(X_preprocessed, y)

    # define norm
    def p2norm(x):
        return np.linalg.norm(x, axis=1, ord=2)

    # reconstruct cluster centers
    cluster_centers = whitener.inverse_transform(cls.cluster_centers_)
    cluster_center_norm = p2norm(cluster_centers)
    # normed_samples = (samples.T / np.linalg.norm(samples, axis=1,
    #                   ord=2)).T

    # calculate cosine similarity; bug fix: the denominator needs the outer
    # product of the two norm vectors, not their elementwise product
    cos_similarity = np.divide(
        np.dot(samples, cls.cluster_centers_.T),
        np.outer(p2norm(samples), p2norm(cls.cluster_centers_)))
    runtime.append(time.time())
    print('calculations: {0} s'.format(np.diff(runtime[-2:])))

    # display digits
    fig = plt.figure(figsize=(6, 3))
    gs_cyphers = gridspec.GridSpec(2, 5, figure=fig, wspace=.4, hspace=.3,
                                   top=.97, bottom=.1, left=.07, right=.95)

    for i in range(10):
        gs_cypher = gridspec.GridSpecFromSubplotSpec(
            2, 1, subplot_spec=gs_cyphers[i], height_ratios=[1., .6],
            hspace=.05)

        ax_centroid = fig.add_subplot(gs_cypher[0, 0])
        ax_barchart = fig.add_subplot(gs_cypher[1, 0])

        ax_centroid.imshow(cluster_centers[i, :].reshape(28, 28),
                           interpolation='none')
        ax_centroid.tick_params(left=False, bottom=False, labelleft=False,
                                labelbottom=False)

        ax_barchart.bar(list(map(int, values)), cos_similarity[:, i],
                        tick_label=values, color=tud_colors['lightblue'])
        # ax_barchart.set_xlim([0, 9])
        ax_barchart.grid(which='both', axis='y')
        ax_barchart.set_yticks([-1., 0., 1.], minor=False)
        ax_barchart.set_yticks([-.5, .5], minor=True)
        ax_barchart.set_ylim([-1., 1.])

    # plt.tight_layout()
    plt.savefig('mnist-kmeans-centroids-cos-similarity-pca50.pdf')
    # plt.show()
    # plt.savefig(os.path.join(os.environ['PGFPATH'],
    #             'mnist-pca50-kmeans-centroids-cos-similarity.pgf'),
    #             format='pgf')
    runtime.append(time.time())
    print('plotting: {0} s'.format(np.diff(runtime[-2:])))
def elm_coates_stacked(directory):
    self_name = 'elm_coates_stacked'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X so X in [0, 1]
    X /= 255.

    # setup parameter grid
    param_grid = {
        'chunk_size': [10000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=3),
        'bias_scaling': [0.],
        # 'bias_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    # read input matrices from files and stack them column-wise
    list_filepaths = []
    predefined_input_weights = np.empty((784, 0))
    for filepath in glob.glob(
            os.path.join(directory, '*kmeans1*matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        predefined_input_weights = np.append(predefined_input_weights,
                                             np.load(filepath), axis=1)

    # setup estimator
    estimator = ELMClassifier(
        PredefinedWeightsInputToNode(
            predefined_input_weights=predefined_input_weights),
        IncrementalRegression())
    logger.info('Estimator params: {0}'.format(
        estimator.get_params().keys()))
    # return

    # setup grid search; the single CV split separates the canonical
    # MNIST train and test partitions
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=1,
                      cv=[(np.arange(0, train_size),
                           np.arange(train_size, 70000))])

    # run!
    cv.fit(X, y_encoded)
    cv_best_params = cv.best_params_
    # pop with a default avoids a KeyError, since this key is only present
    # when the weights are part of the searched grid
    cv_best_params.pop('input_to_nodes__predefined_input_weights', None)

    # refine best params
    logger.info('best parameters: {0} (score: {1})'.format(
        cv_best_params, cv.best_score_))

    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']
    cv_results.pop('param_input_to_nodes__predefined_input_weights', None)

    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))

    if not list_filepaths:
        logger.warning('no input weights matrices found')
    return
def input2node_distribution(directory):
    X, y = get_mnist(directory)

    X /= 255.
    pca = PCA(n_components=784).fit(X)
    X_pca = np.matmul(X, pca.components_.T)

    list_activation = ['tanh', 'relu', 'bounded_relu']
    list_train = [X, X_pca]

    fig, axs = plt.subplots(nrows=2, ncols=3)

    for idx_activation in range(len(list_activation)):
        activation = list_activation[idx_activation]
        for idx_train in range(len(list_train)):
            ax = axs[idx_train, idx_activation]
            train = list_train[idx_train]

            if activation in ['tanh', '']:
                i2n = InputToNode(hidden_layer_size=1,
                                  random_state=82,
                                  input_scaling=50 / 784,
                                  bias_scaling=0.,
                                  activation=activation)
            elif activation in ['relu', 'bounded_relu']:
                i2n = InputToNode(hidden_layer_size=1,
                                  random_state=82,
                                  input_scaling=1.,
                                  bias_scaling=0.,
                                  activation=activation)

            node_out = i2n.fit_transform(train, y)
            # note: the original also called np.delete on hist/bin_edges,
            # but np.delete returns a new array, so those calls were no-ops
            # and are dropped here; ax.hist below recomputes the histogram
            hist, bin_edges = np.histogram(node_out, bins=20, density=True)

            if activation == 'bounded_relu':
                ax.hist(node_out, label=activation, density=True,
                        bins=[.0, .1, .9, 1.],
                        color=tud_colors['lightblue'])
            else:
                ax.hist(node_out, label=activation, density=True, bins=20,
                        color=tud_colors['lightblue'])

            ax.grid(axis='y')
            ax.set_yscale('log')

            x_ticks = np.min(node_out), np.max(node_out)
            ax.set_xlim(x_ticks)
            if activation == 'tanh':
                x_ticks += (0.0, )
            ax.set_xticks(x_ticks)
            ax.set_xticklabels(
                ['{0:.1f}'.format(x_tick) for x_tick in x_ticks])

    axs[0, 0].set_title('tanh, orig.')
    axs[0, 1].set_title('relu, orig.')
    axs[0, 2].set_title('b. relu, orig.')
    axs[1, 0].set_title('tanh, pca')
    axs[1, 1].set_title('relu, pca')
    axs[1, 2].set_title('b. relu, pca')

    # plt.tight_layout()
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'node-out.pdf'), format='pdf')
    fig.savefig(os.path.join(directory, 'node-out.eps'), format='eps')
    plt.rc('pgf', texsystem='pdflatex')
def picture_gradient(directory):
    self_name = 'picture_gradient'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X so X in [0, 1]
    X /= 255.

    # reshape X
    X_images = X.reshape((X.shape[0], 28, 28))

    list_kernels = [{
        'name': 'laplace',
        'kernel': np.array([[-1., -1., -1.], [-1., 8, -1.],
                            [-1., -1., -1.]])
    }, {
        'name': 'mexicanhat',
        'kernel': np.array([[0., 0., -1., 0., 0.], [0., -1., -2., -1., 0.],
                            [-1., -2., 16, -2., -1.],
                            [0., -1., -2., -1., 0.],
                            [0., 0., -1., 0., 0.]])
    }, {
        'name': 'v_prewitt',
        'kernel': np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]])
    }, {
        'name': 'h_prewitt',
        'kernel': np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]]).T
    }, {
        'name': 'v_sobel',
        'kernel': np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]])
    }, {
        'name': 'h_sobel',
        'kernel': np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]).T
    }]

    example_image_idx = 5

    fig, axs = plt.subplots(1, 4, figsize=(6, 2))
    axs[0].imshow(X_images[example_image_idx], cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title('no filter')
    axs[1].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[0]['kernel'], mode='same'),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[1].set_title('laplace')
    axs[2].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[2]['kernel'], mode='same'),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[2].set_title('vertical\nprewitt')
    axs[3].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[5]['kernel'], mode='same'),
                  cmap=plt.cm.gray_r, interpolation='none')
    axs[3].set_title('horizontal\nsobel')

    for ax in axs:
        ax.set_xticks([0, 27])
        ax.set_xticklabels([0, 27])
        ax.set_yticks([0, 27])
        ax.set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-image-filters.pdf'),
                format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-image-filters.pgf'),
                format='pgf')