Example #1
def plot_variance_mean(directory, *args, **kwargs):
    logger = new_logger('plot_variance_mean', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    image_size = (28, 28)

    scaler = StandardScaler().fit(
        StandardScaler(with_std=False).fit_transform(X) / 255)

    fig, axs = plt.subplots(1, 2, figsize=(8, 4))

    axs[0].imshow(np.resize(scaler.mean_, image_size),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[1].imshow(np.resize(scaler.var_, image_size),
                  cmap=plt.cm.gray_r,
                  interpolation='none')

    axs[0].set_title(r'$\mu$')
    axs[1].set_title(r'$\sigma^2$')

    fig.tight_layout()
    fig.savefig(
        os.path.join(directory, 'mnist-pixel-variance-and-mean-avgfree.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-variance-and-mean-avgfree.pgf'),
                format='pgf')
    logger.info('np.max(scaler.mean_) = {0}, np.max(scaler.var_) = {1}'.format(
        np.max(scaler.mean_), np.max(scaler.var_)))
    return
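All of the examples in this listing rely on module-level imports and helpers (get_mnist, new_logger, tud_colors, example_image_idx, train_size, plus the usual numpy/matplotlib/sklearn imports) that are not shown. A minimal sketch of what that shared header is assumed to look like; the bodies below are illustrative assumptions, not the original module:

# Assumed shared header for the examples below (illustrative sketch only)
import logging
import os
import time

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

example_image_idx = 5   # assumed: index of the digit shown in the plots
train_size = 60000      # assumed: MNIST train/test split point used below


def new_logger(name, directory=os.getcwd()):
    # assumed helper: a simple file logger writing into the given directory
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    logger.addHandler(
        logging.FileHandler(os.path.join(directory, '{0}.log'.format(name))))
    return logger


def get_mnist(directory=os.getcwd()):
    # assumed helper: fetch MNIST (70000 x 784) as float features, string labels
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True,
                        as_frame=False, data_home=directory)
    return X.astype(float), y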
Example #2
def plot_labels(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    # find first digit occurrences
    idx = np.ones((10, )) * -1
    cnt = int(0)
    while np.any(idx == -1):
        if idx[int(y[cnt])] == -1.0:
            idx[int(y[cnt])] = int(cnt)
        cnt += 1

    # display digits
    fig, axs = plt.subplots(2, 5, figsize=(5, 2))

    for i in range(10):
        axs[i // 5][i % 5].imshow(np.resize(X[int(idx[i])], (28, 28)),
                                  cmap=plt.cm.gray_r,
                                  interpolation='none')
        axs[i // 5][i % 5].set_xticks([0, 27])
        axs[i // 5][i % 5].set_xticklabels([0, 27])
        axs[i // 5][i % 5].set_yticks([0, 27])
        axs[i // 5][i % 5].set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-visualize.pgf'), format='pgf')
Example #3
def plot_img_cluster(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    img = X[example_image_idx, :]

    clusterer = KMeans(n_clusters=4, random_state=42)
    img_clusters = clusterer.fit_predict(img.reshape((784, 1))).reshape(
        (28, 28))
    list_cluster_colors = [
        tud_colors['lightblue'], tud_colors['lightgreen'],
        tud_colors['lightpurple'], tud_colors['gray']
    ]

    img_cluster_colors = np.zeros((28, 28, 4))

    for x in range(img_cluster_colors.shape[0]):
        for y in range(img_cluster_colors.shape[1]):
            img_cluster_colors[x, y, :] = list_cluster_colors[img_clusters[x,
                                                                           y]]

    # display digits
    fig, axs = plt.subplots(1, 1, figsize=(2, 2))

    axs.imshow(img_cluster_colors, interpolation='none')
    axs.set_xticks([0, 27])
    axs.set_xticklabels([0, 27])
    axs.set_yticks([0, 27])
    axs.set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'plot-img-clusters.pdf'), format='pdf')
    plt.imsave(os.path.join(directory, 'plot-img-clusters.png'),
               img_cluster_colors)
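The tud_colors lookup used above is assumed to map color names to RGBA tuples, which is why the alpha channel is dropped with [:-1] before writing RGB pixels; a hypothetical sketch of its structure (the numeric values are placeholders, not the actual TU Dresden palette):

# Assumed structure of tud_colors; the numeric values are placeholders
tud_colors = {
    'lightblue':   (0.0, 0.75, 1.0, 1.0),
    'lightgreen':  (0.6, 0.85, 0.4, 1.0),
    'lightpurple': (0.8, 0.6, 0.9, 1.0),
    'orange':      (1.0, 0.6, 0.0, 1.0),
    'gray':        (0.5, 0.5, 0.5, 1.0),
}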
Example #4
def silhouette_kcluster(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_kcluster', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    X /= 255.

    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)

    # PCA preprocessed
    X_pca = pca.transform(X)

    k = [10, 15, 20, 25, 30, 35, 50, 100, 200]

    dict_results = {
        'n_clusters': [],
        'pca_n_components': [],
        'pca_expl_var': [],
        'pca_expl_var_ratio': [],
        'silhouette_kcosine': [],
        'silhouette_kmeans': [],
        'fittime_kcosine': [],
        'fittime_kmeans': []
    }

    for n_clusters in k:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_expl_var'].append(np.sum(pca.explained_variance_))
        dict_results['pca_expl_var_ratio'].append(
            np.sum(pca.explained_variance_ratio_))

        # kmeans
        clusterer_euclid = KMeans(n_clusters=n_clusters, random_state=42)
        t = time.time()
        clusterer_euclid.fit(X_pca)
        dict_results['fittime_kmeans'].append(time.time() - t)
        dict_results['silhouette_kmeans'].append(
            silhouette_score(X,
                             clusterer_euclid.predict(X_pca),
                             metric='euclidean',
                             random_state=42))

        # kcosine
        clusterer_cosine = KMeans(n_clusters=n_clusters, random_state=42)
        t = time.time()
        clusterer_cosine.fit(X_pca)
        dict_results['fittime_kcosine'].append(time.time() - t)
        dict_results['silhouette_kcosine'].append(
            silhouette_score(X,
                             clusterer_cosine.predict(X_pca),
                             metric='cosine',
                             random_state=42))

    # save results to csv
    with open(os.path.join(directory, 'silhouette_kcluster.csv'), 'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
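The CSV-writing loop at the end (the same idiom recurs in several examples below) transposes the per-column lists of dict_results into rows before writing them; a tiny illustration of that transposition, using made-up columns:

# Illustration of the column-to-row transposition used when writing the CSVs
cols = {'a': [1, 2], 'b': [3, 4]}
rows = list(map(list, zip(*cols.values())))  # [[1, 3], [2, 4]]
# written file: header 'a,b', then rows '1,3' and '2,4'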
Example #5
def plot_var(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    scaler = StandardScaler().fit(X)
    pos = range(0, 28)
    meanX = []
    varX = []

    fig, axs = plt.subplots(1,
                            2,
                            figsize=(5, 2),
                            gridspec_kw={'width_ratios': [1, 1.4]})

    example = np.zeros((28, 28, 3))
    example[..., 0] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # red
    example[..., 1] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # green
    example[..., 2] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # blue

    for idx in pos:
        example[idx, idx, :] = tud_colors['orange'][:-1]
        meanX.append(scaler.mean_[idx * 28 + idx])
        varX.append(scaler.var_[idx * 28 + idx])

    axs[0].imshow(example, interpolation='none')

    line_var, = axs[1].plot(pos, varX, color=tud_colors['orange'])
    ax_mean = axs[1].twinx()
    line_mean, = ax_mean.plot(pos, meanX, color=tud_colors['lightblue'])

    axs[1].legend((line_var, line_mean), (r'$\sigma^2$', r'$\mu$'),
                  bbox_to_anchor=(1.2, .5),
                  loc="center left")

    # fig.suptitle('Feature distribution in MNIST picture')
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].set_xlim([0, 27])
    axs[1].set_xlabel('position')
    axs[1].set_ylabel(r'$\sigma^2$', labelpad=-15, loc='top', rotation=0)
    y_ticks = [0, 2000, 4000, 6000, 8000, 10000, 12000]
    axs[1].set_yticks(y_ticks)
    axs[1].set_yticklabels(
        ['{0:0.0f}k'.format(y_tick / 1000) for y_tick in y_ticks])
    # axs[1].tick_params(axis='x', labelrotation=90)
    ax_mean.set_ylabel(r'$\mu$', labelpad=-5, loc='top', rotation=0)
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-pixel-variance.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-variance.pgf'),
                format='pgf')
    # plt.show()
    return
Example #6
def elm_bip(directory):
    self_name = 'elm_bip'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    logger.info('{0} features remaining after preprocessing.'.format(
        X_preprocessed.shape[1]))

    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]

    # setup estimator
    estimator = ELMClassifier(input_to_node=BatchIntrinsicPlasticity(),
                              regressor=Ridge())

    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=[(np.arange(0,
                                     train_size), np.arange(train_size,
                                                            70000))])

    # run!
    cv.fit(X, y_encoded)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']

    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
Example #7
def plot_image_min_var(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    scaler = StandardScaler().fit(X)

    image_size = (28, 28, 3)
    example = np.zeros(X[example_image_idx, ...].shape + (3, ))
    for rgb_idx in range(3):
        example[..., rgb_idx] = 1. - X[example_image_idx, :] / 255.

    p1_1 = 1 / 10 * 1 / 10
    p1_2 = 1 / 10
    var_p1_1 = 255**2 * p1_1 * (1 - p1_1)
    var_p1_2 = 255**2 * p1_2 * (1 - p1_2)

    example_min_var_p1_1 = np.copy(example)
    example_min_var_p1_1[scaler.var_ < var_p1_1,
                         ...] = tud_colors['orange'][:-1]

    example_min_var_p1_2 = np.copy(example)
    example_min_var_p1_2[scaler.var_ < var_p1_2,
                         ...] = tud_colors['orange'][:-1]

    fig, axs = plt.subplots(1, 3, figsize=(5, 2))

    axs[0].imshow(np.reshape(example, image_size), interpolation='none')
    axs[0].set_title('$p_1$=0\noriginal\n$n$={0:d}'.format(len(scaler.var_)))
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].imshow(np.reshape(example_min_var_p1_1, image_size),
                  interpolation='none')
    axs[1].set_title(
        '$p_1$={1:0.2f}\n$\\sigma^2$ > {0:0.0f}\n$n$={2:d}'.format(
            var_p1_1, p1_1, np.sum(scaler.var_ > var_p1_1)))
    axs[1].set_xticks([0, 27])
    axs[1].set_xticklabels([0, 27])
    axs[1].set_yticks([0, 27])
    axs[1].set_yticklabels([0, 27])

    axs[2].imshow(np.reshape(example_min_var_p1_2, image_size),
                  interpolation='none')
    axs[2].set_title(
        '$p_1$={1:0.2f}\n$\\sigma^2$ > {0:0.0f}\n$n$={2:d}'.format(
            var_p1_2, p1_2, np.sum(scaler.var_ > var_p1_2)))
    axs[2].set_xticks([0, 27])
    axs[2].set_xticklabels([0, 27])
    axs[2].set_yticks([0, 27])
    axs[2].set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-img-min-var.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-img-min-var.pgf'),
                format='pgf')
    # plt.show()
    return
Example #8
def plot_poster(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    X /= 255.

    # scale for imsave
    def scale01(X):
        return (X - np.min(X)) / (np.max(X) - np.min(X))

    # preprocessing
    pca = PCA(n_components=50).fit(X)
    clusterer = KMeans(n_clusters=20).fit(X[:10000, :])

    # save images
    # example
    plt.imsave(os.path.join(os.environ['IMGPATH'], 'example-mnist.png'),
               X[example_image_idx, :].reshape(28, 28),
               cmap=plt.cm.gray_r,
               format='png')

    # pca component
    pca_component = scale01(pca.components_[2, :]).reshape(28, 28)
    pca_example = scale01(
        np.matmul(X[example_image_idx, :].reshape(1, -1),
                  np.matmul(pca.components_.T,
                            pca.components_))).reshape(28, 28)

    plt.imsave(os.path.join(os.environ['IMGPATH'], 'pca-component3.png'),
               pca_component,
               cmap=plt.cm.gray_r,
               format='png')
    plt.imsave(os.path.join(os.environ['IMGPATH'], 'pca50-mnist.png'),
               pca_example,
               cmap=plt.cm.gray_r,
               format='png')

    # kmeans centroids
    for idx in [0, 4, 9, 14, 19]:
        kmeans_centroid = scale01(clusterer.cluster_centers_[idx,
                                                             ...]).reshape(
                                                                 28, 28)
        plt.imsave(os.path.join(os.environ['IMGPATH'],
                                'kmeans-centroid{0}.png'.format(idx)),
                   kmeans_centroid,
                   cmap=plt.cm.gray_r,
                   format='png')

    # input weights
    T = np.load(os.path.join(os.environ['DATAPATH'],
                             'pca50+kmeans200_matrix.npy'),
                allow_pickle=True)
    for idx in [0, 49, 99, 149, 199]:
        input_weight = scale01(T[:, idx]).reshape(28, 28)
        plt.imsave(os.path.join(os.environ['IMGPATH'],
                                'input-weight{0}.png'.format(idx)),
                   input_weight,
                   cmap=plt.cm.gray_r,
                   format='png')
Example #9
def plot_imbalance(directory):
    self_name = 'plot_imbalance'
    logger = new_logger(self_name, directory)
    X, y = get_mnist(directory)
    logger.info('successfully fetched {0} datapoints'.format(X.shape[0]))

    tp_y_unique = np.unique(y.astype(int), return_counts=True)
    y_unique = tp_y_unique[0][np.argsort(tp_y_unique[0])]
    y_counts = tp_y_unique[1][np.argsort(tp_y_unique[0])]

    # y_hist_arr = np.array(y_hist, dtype=float)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6., 2.1))

    # bar chart of the label counts; the text loop below annotates the bars
    ax.bar(y_unique, y_counts, color=tud_colors['lightblue'],
           label='occurrences')

    for idx in range(y_counts.size):
        plt.text(idx * 1.,
                 3500,
                 '{0:.1f}%'.format(y_counts[idx] / np.sum(y_counts) * 100),
                 color=(1., 1., 1., .2),
                 fontsize='small',
                 horizontalalignment='center')
        # w = bar.get_with()
        # plt.text(bar.get_x() - .04, bar.get_y() + .1, '{0:.1f}%'.format())

    ax.set_xlim([-.5, 9.5])
    ax.set_xticks(y_unique)
    ax.set_xticklabels(['{0:.0f}'.format(idx) for idx in y_unique])
    ax.set_xlabel('label')

    ax.set_ylim([0, 8000])
    ax.set_yticks([7000], minor=True)
    ax.grid(which='minor',
            axis='y',
            alpha=.7,
            linestyle='--',
            color=tud_colors['lightgreen'])
    ax.set_ylabel(r'\#occurrences')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # ax.spines['bottom'].set_visible(False)
    ax.tick_params(axis='x', which='both', bottom=False, top=False)

    ax.legend(bbox_to_anchor=(1, .5), loc='center left')
    fig.tight_layout()
    # fig.patch.set_visible(False)

    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             '{0}.pgf'.format(self_name)),
                format='pgf')
    fig.savefig(os.path.join(directory, '{0}.pdf'.format(self_name)),
                format='pdf')
    return
Example #10
def main(out_path=os.path.join(os.getcwd(), 'preprocessing-mnist'),
         function_name='labels'):
    if not os.path.exists(out_path):
        try:
            os.makedirs(out_path)
        except OSError as error:
            print(error)

    # quick and dirty
    # directory = os.path.join(os.getcwd(), 'preprocessing-mnist')
    directory = out_path

    logger = new_logger('main')
    logger.info('{0} called, entering main'.format(__file__))

    runtime = [time.time()]

    # fetch data
    X, y = get_mnist(directory)

    runtime.append(time.time())
    logger.info('fetch: {0} s'.format(np.diff(runtime[-2:])))
    logger.info('X.shape = {0}, y.shape = {1}'.format(X.shape, y.shape))

    function_dict = {
        'labels': plot_labels,
        'plot_pooling': plot_pooling,
        'plot_poster': plot_poster,
        'histogram': plot_historgram,
        'var': plot_var,
        'normalized': plot_normalized,
        'variance_mean': plot_variance_mean,
        'image_min_var': plot_image_min_var,
        'plot_pca': plot_pca,
        'plot_covariance': plot_covariance,
        'plot_imbalance': plot_imbalance,
        'plot_img_cluster': plot_img_cluster,
    }

    if function_name in function_dict:
        function_dict[function_name](directory)
    else:
        logger.warning('no function {0} found'.format(function_name))

    logger.info('{0} finished, return from main'.format(__file__))
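main expects an output directory and the name of one of the registered plotting functions; a possible command-line entry point (an assumption, not part of the original script) could look like this:

# Hypothetical entry point; the argument handling here is an assumption
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 2:
        main(out_path=sys.argv[1], function_name=sys.argv[2])
    else:
        main()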
Example #11
def plot_normalized(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    X = X / 4 + 100
    X_picture_normalization = StandardScaler().fit_transform(X.T).T
    X_feature_normalization = StandardScaler().fit_transform(X)

    fig, axs = plt.subplots(1, 3, figsize=(5, 2))

    img_idx = example_image_idx

    axs[0].imshow(np.resize(X[img_idx, :], (28, 28)).astype(int),
                  interpolation='none',
                  cmap=plt.cm.gray_r,
                  norm=Normalize(vmin=0, vmax=255, clip=True))
    axs[0].set_title('low contrast')
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].imshow(np.resize(X_picture_normalization[img_idx, :], (28, 28)),
                  interpolation='none',
                  cmap=plt.cm.gray_r)
    axs[1].set_title('picture\nnormalization')
    axs[1].set_xticks([0, 27])
    axs[1].set_xticklabels([0, 27])
    axs[1].set_yticks([0, 27])
    axs[1].set_yticklabels([0, 27])

    axs[2].imshow(np.resize(X_feature_normalization[img_idx, :], (28, 28)),
                  interpolation='none',
                  cmap=plt.cm.gray_r)
    axs[2].set_title('feature\nnormalization')
    axs[2].set_xticks([0, 27])
    axs[2].set_xticklabels([0, 27])
    axs[2].set_yticks([0, 27])
    axs[2].set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-normalized.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-normalized.pgf'),
                format='pgf')
    # plt.show()
    return
Example #12
def plot_pooling(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    img = X[example_image_idx, :].reshape((28, 28))
    kernel_size = (2, 1)
    img_pooled = np.zeros((int(np.ceil(img.shape[0] / kernel_size[0])),
                           int(np.ceil(img.shape[1] / kernel_size[1]))))

    for x in range(img_pooled.shape[0]):
        for y in range(img_pooled.shape[1]):
            x_min = x * kernel_size[0]
            x_max = x_min + kernel_size[0]
            y_min = y * kernel_size[1]
            y_max = y_min + kernel_size[1]
            img_pooled[x, y] = np.max(img[x_min:x_max, y_min:y_max])

    plt.imsave(os.path.join(
        directory,
        'pooled_max_kernel{0}x{1}.png'.format(kernel_size[0], kernel_size[1])),
               img_pooled,
               cmap=plt.cm.gray_r)
    return
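Since the 28x28 image divides evenly by the (2, 1) kernel, the nested loops above can also be written as a reshape followed by a reduction; a sketch of the equivalent vectorized max pooling:

# Equivalent vectorized max pooling for a (2, 1) kernel on a 28x28 image (sketch)
img = X[example_image_idx, :].reshape((28, 28))
img_pooled = img.reshape(14, 2, 28, 1).max(axis=(1, 3))  # shape (14, 28)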
Example #13
def plot_covariance(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    cov = np.cov((X - np.mean(X, axis=0)).T)
    cov_w, cov_v = np.linalg.eigh(cov)
    # cov_pca_comp = cov_v.T

    n_components = 784

    cov_PCA_alternative = np.flip(np.cov(np.matmul(cov_v.T, X.T)), axis=(0, 1))
    cov_v_ordered = np.flip(cov_v, axis=(0, 1))
    # plt.imsave(os.path.join(directory, 'mnist-cov-pca-alt-db.png'),
    #            20 * np.log10(np.abs(cov_PCA_alternative) + 1.),
    #            cmap=plt.cm.gray_r)

    # pca = PCA().fit(X)
    cov_PCA = cov_PCA_alternative

    fig, axs = plt.subplots(1, 3, figsize=(6, 2.5))

    if isinstance(axs, plt.Axes):
        axs = [axs]

    axs[0].imshow(np.resize(cov, (X.shape[1], X.shape[1])),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title('covariance')
    axs[0].set_xticks(np.arange(start=0, stop=785, step=28))
    axs[0].set_xticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))
    axs[0].set_yticks(np.arange(start=0, stop=785, step=28))
    axs[0].set_yticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))

    axs[1].imshow(np.resize(20 * np.log10(np.abs(cov_PCA) + 1.),
                            (n_components, n_components)),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[1].set_title('after PCA ({0})'.format(n_components))
    axs[1].set_xticks(
        np.append(np.arange(start=0, stop=n_components, step=28),
                  n_components))
    axs[1].set_xticklabels(
        np.append([
            '{0:.0f}'.format(x) if x == 0 else ''
            for x in np.arange(start=0, stop=n_components, step=28)
        ], '{0}'.format(n_components)))
    axs[1].set_yticks(
        np.append(np.arange(start=0, stop=n_components, step=28),
                  n_components))
    axs[1].set_yticklabels(
        np.append([
            '{0:.0f}'.format(x) if x == 0 else ''
            for x in np.arange(start=0, stop=n_components, step=28)
        ], '{0}'.format(n_components)))

    axs[2].imshow(20 * np.log10(np.abs(cov_v_ordered.T) + 1.),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[2].set_title('PCA components')
    axs[2].set_xticks(np.arange(start=0, stop=785, step=28))
    axs[2].set_xticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))
    axs[2].set_yticks(np.arange(start=0, stop=785, step=28))
    axs[2].set_yticklabels('{0:.0f}'.format(x) if x in [0, 784] else ''
                           for x in np.arange(start=0, stop=785, step=28))

    def scale(A):
        return (A - np.min(A)) / (np.max(A) - np.min(A))

    for idx in [0, 1, 2, 3, 4, 5, 20, 50, 100, 200, 400, 600, 701, 783]:
        filepath = os.path.join(
            directory, '{0}{1}.png'.format('mnist-covariance-eig', idx))
        plt.imsave(filepath,
                   cov_v_ordered.T[idx, ...].reshape(28, 28),
                   cmap=plt.cm.gray_r)

    fig.tight_layout()
    # fig.show()
    fig.savefig(os.path.join(directory, 'mnist-covariance.pdf'), format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-covariance.pgf'),
                format='pgf')
    plt.imsave(os.path.join(directory, 'mnist-covariance.png'),
               np.resize(cov, (X.shape[1], X.shape[1])),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-covariance-pca.png'),
               np.resize(cov_PCA, (n_components, n_components)),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-pca-components.png'),
               cov_v_ordered.T,
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-covariance-db.png'),
               np.resize(20 * np.log10(np.abs(cov) + 1.),
                         (X.shape[1], X.shape[1])),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-covariance-pca-db.png'),
               np.resize(20 * np.log10(np.abs(cov_PCA) + 1.),
                         (n_components, n_components)),
               cmap=plt.cm.gray_r)
    plt.imsave(os.path.join(directory, 'mnist-pca-components-db.png'),
               20 * np.log10(np.abs(cov_v_ordered.T) + 1.),
               cmap=plt.cm.gray_r,
               vmax=.5)
    return
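np.linalg.eigh returns eigenvalues in ascending order, which is why the eigenvector matrix is flipped before plotting; up to sign, the leading eigenvector of the covariance should agree with the first PCA component. A quick sanity check, assuming X and cov_v from above:

# Sanity check (sketch): leading covariance eigenvector vs. first PCA component
from sklearn.decomposition import PCA
pca = PCA(n_components=1).fit(X)
v_lead = cov_v[:, -1]  # eigh: last column belongs to the largest eigenvalue
agreement = np.abs(np.dot(v_lead, pca.components_[0]))  # ~1.0 up to sign and solver precision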
Example #14
def silhouette_features(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_features', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    X /= 255.

    X = X[:10000, ...]

    scaler = StandardScaler().fit(X)
    pca = PCA(whiten=False, random_state=42).fit(X)

    X_pca = pca.transform(X)

    # sort scaler variances
    variance_indices = np.argsort(scaler.var_)[::-1]

    n_features_list = [
        1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70,
        80, 90, 100, 200, 300, 400, 500, 600, 700, 784
    ]

    rs = np.random.RandomState(42)

    k = 20

    dict_results = {
        'nfeatures': [],
        'fittime_random': [],
        'fittime_maxvar': [],
        'fittime_pca': [],
        'silhouette_random': [],
        'silhouette_maxvar': [],
        'silhouette_pca': [],
        'explainvar_random': [],
        'explainvar_maxvar': [],
        'explainvar_pca': [],
        'explvarrat_random': [],
        'explvarrat_maxvar': [],
        'explvarrat_pca': [],
        'n_clusters': [],
    }

    for n_features in n_features_list:
        clusterer = KMeans(n_clusters=k, random_state=42)
        dict_results['nfeatures'].append(n_features)
        dict_results['n_clusters'].append(clusterer.n_clusters)

        indices = rs.choice(X.shape[1], size=n_features)
        t = time.time()
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_random'].append(time.time() - t)
        dict_results['silhouette_random'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_random'].append(np.sum(scaler.var_[indices]))
        dict_results['explvarrat_random'].append(
            np.sum(scaler.var_[indices]) / np.sum(scaler.var_))

        t = time.time()
        indices = variance_indices[:n_features]
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_maxvar'].append(time.time() - t)
        dict_results['silhouette_maxvar'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_maxvar'].append(np.sum(scaler.var_[indices]))
        dict_results['explvarrat_maxvar'].append(
            np.sum(scaler.var_[indices]) / np.sum(scaler.var_))

        t = time.time()
        pred = clusterer.fit_predict(X_pca[:, :n_features])
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['silhouette_pca'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_pca'].append(
            np.sum(pca.explained_variance_[:n_features]))
        dict_results['explvarrat_pca'].append(
            np.sum(pca.explained_variance_ratio_[:n_features]))

        logger.info('pca silhouette at n_features={1:.0f}: {0}'.format(
            dict_results['silhouette_pca'][-1], n_features))

    # save results to csv
    with open(
            os.path.join(directory,
                         'silhouette_kmeans{0:.0f}_features.csv'.format(k)),
            'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
Example #15
def plot_pca(directory, *args, **kwargs):
    X, y = get_mnist(directory)

    fig, axs = plt.subplots(1,
                            5,
                            figsize=(6, 1.5),
                            gridspec_kw={
                                'wspace': 0.45,
                                'left': .05,
                                'right': .95,
                                'bottom': .0,
                                'top': .90
                            })

    # automatic pca
    decomposer = PCA(whiten=False).fit(X)

    # original
    axs[0].imshow(np.resize(X[example_image_idx, ...], (28, 28)),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title('original')

    # mean
    axs[1].imshow(np.resize(decomposer.mean_, (28, 28)),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[1].set_title('average')

    # pca 50, average free
    X_avgfree = X - np.mean(X, axis=0)
    M_pca = decomposer.components_[:50, :].T
    M = np.dot(M_pca, M_pca.T)  # transformation and inverse combined

    axs[2].imshow(np.resize(np.dot(X_avgfree[example_image_idx, ...], M),
                            (28, 28)),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[2].set_title('n={0}\naverage free'.format(M_pca.shape[1]))

    # pca 50, not average free
    axs[3].imshow(np.resize(np.dot(X[example_image_idx, ...], M), (28, 28)),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[3].set_title('n={0}\nwith average'.format(M_pca.shape[1]))

    # pca 25, not average free
    M_pca = decomposer.components_[:25, :].T
    M = np.dot(M_pca, M_pca.T)  # transformation and inverse combined

    axs[4].imshow(np.resize(np.dot(X[example_image_idx, ...], M), (28, 28)),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[4].set_title('n={0}\nwith average'.format(M_pca.shape[1]))

    for idx in range(5):
        axs[idx].set_xticks([0, 27])
        axs[idx].set_xticklabels([0, 27])
        axs[idx].set_yticks([0, 27])
        axs[idx].set_yticklabels([0, 27])

    # fig.tight_layout()
    # fig.show()
    fig.savefig(os.path.join(directory, 'mnist-pca-effects.pdf'), format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-pca-effects.pgf'),
                format='pgf')
    return
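The matrix M = np.dot(M_pca, M_pca.T) combines the projection onto the leading components and the back-projection to pixel space in a single step; for average-free data this is the same as transforming and reconstructing explicitly. A short check, assuming decomposer and X_avgfree from above:

# Sketch: the combined matrix equals transform followed by back-projection
C = decomposer.components_[:50, :]               # (50, 784)
M50 = np.dot(C.T, C)                             # same construction as M above
x_centered = X_avgfree[example_image_idx, ...]
recon_a = np.dot(x_centered, M50)                # route used in the plot
recon_b = np.dot(np.dot(x_centered, C.T), C)     # explicit project + reconstruct
# recon_a and recon_b agree up to floating-point error, since M50 = C.T @ C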
Example #16
def elm_pca(directory):
    self_name = 'elm_pca'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # scale X
    X /= 255.

    # split train test
    X_train, X_test, y_train, y_test = train_test_split(X[:train_size],
                                                        y[:train_size],
                                                        train_size=50000,
                                                        random_state=42)

    # prepare parameter grids
    param_grid_basic = {
        'hidden_layer_size': 2000,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'input_activation': 'relu',
        'alpha': 1e-5,
        'random_state': 42
    }

    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())

    # initialize filepath
    filepath = os.path.join(directory, '{0}_basic.csv'.format(self_name))

    # initialize param dict
    param_dict_job = estimator.get_params().copy()
    param_dict_job.update(param_grid_basic)

    # initialize results dict
    results_dict_job = param_dict_job.copy()
    # add dummy results
    results_dict_job.update({
        'time_fit': 0,
        'time_pred': 0,
        'score': 0,
        'pca_n_components': 0
    })

    # preprocessing pca
    try:
        # write header
        with open(filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for pca_n_components in [10, 20, 50, 100, 200, 500, 784]:
            results_dict_job.update({'pca_n_components': pca_n_components})
            estimator.set_params(**param_dict_job)

            # preprocessing
            pca = PCA(n_components=pca_n_components).fit(X_train)
            X_train_pca, X_test_pca = \
                pca.transform(X_train), pca.transform(X_test)

            # run!
            time_start = time.time()
            estimator.fit(X_train_pca, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test_pca)
            time_pred = time.time()
            # run end!

            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })

            logger.info('pca.n_components_: {0}, score: {1}'.format(
                pca_n_components, results_dict_job['score']))

            with open(filepath, 'a') as f:
                writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
    except Exception as e:
        logger.error('Unexpected exception: {0}'.format(e))
Example #17
def elm_preprocessed(directory):
    self_name = 'elm_preprocessed'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    logger.info('{0} features remaining after preprocessing.'.format(
        X_preprocessed.shape[1]))

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed,
                                                        y_encoded,
                                                        train_size=train_size,
                                                        random_state=42)

    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 2000],
        'input_scaling':
        np.logspace(start=-3, stop=1, base=10, num=6),
        'bias_scaling':
        np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }, {
        'hidden_layer_size': [2000],
        'input_scaling':
        np.logspace(start=-3, stop=1, base=10, num=6),
        'bias_scaling':
        np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]

    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())

    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=StratifiedShuffleSplit(n_splits=1,
                                                test_size=1 / 7,
                                                random_state=42))

    # run!
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']

    # save results
    try:
        with open(os.path.join(directory, 'elm_preprocessed.csv'), 'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
Example #18
def silhouette_n_clusters(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_n_clusters', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    scaler = StandardScaler().fit(X)
    X /= 255.

    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)
    min_var = 3088.6875

    # reduce train size
    # X = X[:10000, ...]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y_encoded,
                                                        train_size=10000,
                                                        random_state=42)

    # variance threshold
    X_var_threshold = X_train[..., scaler.var_ > min_var]

    # pca
    X_pca = pca.transform(X_train)

    # n_clusters
    k = [
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 25, 30, 40,
        50, 60, 70, 80, 90, 100, 200, 500, 1000, 2000, 4000
    ]

    # n_init
    n_init = 10

    dict_results = {
        'n_clusters': [],
        'n_init': [],
        'variance_threshold': [],
        'pca_n_components': [],
        'pca_explained_variance': [],
        'pca_explained_variance_ratio': [],
        'silhouette_original': [],
        'silhouette_variance_threshold': [],
        'silhouette_pca': [],
        'fittime_original': [],
        'fittime_variance_threshold': [],
        'fittime_pca': [],
        'inertia_original': [],
        'inertia_variance_threshold': [],
        'inertia_pca': [],
        'n_iter_original': [],
        'n_iter_variance_threshold': [],
        'n_iter_pca': []
    }

    for n_clusters in k:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['n_init'].append(n_init)
        dict_results['variance_threshold'].append(min_var)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_explained_variance'].append(
            np.sum(pca.explained_variance_))
        dict_results['pca_explained_variance_ratio'].append(
            np.sum(pca.explained_variance_ratio_))

        clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                    init='k-means++',
                                    n_init=n_init,
                                    random_state=42)

        # original
        t = time.time()
        clusterer.fit(X_train)
        dict_results['fittime_original'].append(time.time() - t)
        dict_results['inertia_original'].append(clusterer.inertia_)
        dict_results['n_iter_original'].append(clusterer.n_iter_)
        dict_results['silhouette_original'].append(
            silhouette_score(X_train,
                             clusterer.predict(X_train),
                             metric='euclidean',
                             random_state=42))

        np.save('./cluster_critical.npy', clusterer.cluster_centers_)

        # var threshold
        t = time.time()
        clusterer.fit(X_var_threshold)
        dict_results['fittime_variance_threshold'].append(time.time() - t)
        dict_results['inertia_variance_threshold'].append(clusterer.inertia_)
        dict_results['n_iter_variance_threshold'].append(clusterer.n_iter_)
        dict_results['silhouette_variance_threshold'].append(
            silhouette_score(X_train,
                             clusterer.predict(X_var_threshold),
                             metric='euclidean',
                             random_state=42))

        # pca
        t = time.time()
        clusterer.fit(X_pca)
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['inertia_pca'].append(clusterer.inertia_)
        dict_results['n_iter_pca'].append(clusterer.n_iter_)
        dict_results['silhouette_pca'].append(
            silhouette_score(X_train,
                             clusterer.predict(X_pca),
                             metric='euclidean',
                             random_state=42))

        logger.info('n_clusters = {0}, pca kmeans score: {1}'.format(
            n_clusters, dict_results['silhouette_pca'][-1]))
        logger.info('n_clusters = {0}'.format(n_clusters))

    # save results to csv
    with open(os.path.join(directory, 'silhouette_n_clusters.csv'), 'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
Example #19
def plot_historgram(directory, *args, **kwargs):
    logger = new_logger('plot_historgram', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    fig, axs = plt.subplots(1,
                            2,
                            figsize=(5, 2),
                            gridspec_kw={'width_ratios': [1, 1.7]})

    example = np.zeros((28, 28, 3))
    example[..., 0] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # red
    example[..., 1] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # green
    example[..., 2] = 1. - np.resize(X[example_image_idx, :],
                                     (28, 28)) / 255.  # blue

    idx_fringe = (25, 17)
    idx_center = (13, 12)

    example[idx_center[0], idx_center[1], :] = tud_colors['lightblue'][:-1]
    example[idx_fringe[0], idx_fringe[1], :] = tud_colors['orange'][:-1]

    bins = np.array(range(0, 287, 32)).astype(int)

    hist_fringe, bin_edges = np.histogram(X[:, idx_fringe[0] * 28 +
                                            idx_fringe[1]],
                                          bins=bins)
    hist_center, bin_edges = np.histogram(X[:, idx_center[0] * 28 +
                                            idx_center[1]],
                                          bins=bins)

    logger.info('validation sum hist_fringe: {0}, sum hist_center: {1}'.format(
        np.sum(hist_fringe / 1000), np.sum(hist_center / 1000)))

    axs[0].imshow(example, interpolation='none')
    axs[0].set_xticks([0, 27])
    axs[0].set_xticklabels([0, 27])
    axs[0].set_yticks([0, 27])
    axs[0].set_yticklabels([0, 27])

    axs[1].bar(bins[1:] - 32,
               height=hist_fringe / 1000,
               width=16,
               color=tud_colors['orange'],
               label='fringe',
               align='edge')
    axs[1].bar(bins[1:] - 16,
               height=hist_center / 1000,
               width=16,
               color=tud_colors['lightblue'],
               label='center',
               align='edge')
    axs[1].tick_params(axis='x', labelrotation=90)
    # axs[1].hist([], bins=range(0, 255, 32), color=[tud_colors['orange'],
    #                                                tud_colors['lightblue']],
    #             align='left')

    axs[1].set_xticks(bins)
    # axs[1].legend(bbox_to_anchor=(0, 1, 1, 0), loc="lower left",
    # mode="expand", ncol=2)
    axs[1].legend(bbox_to_anchor=(1.0, .5), loc="center left")

    # fig.suptitle('Feature distribution in MNIST picture')
    axs[1].set_xlabel('value bins')
    axs[1].set_ylabel('probability')
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-pixel-histogram.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-histogram.pgf'),
                format='pgf')
    # plt.show()
    return
Example #20
def elm_hidden_layer_size(directory):
    self_name = 'elm_hidden_layer_size'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # encode y
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X
    X /= 255.

    # split train test
    X_train, X_test, y_train, y_test = (X[:train_size, :], X[train_size:, :],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])

    # fan-out from paper
    fan_out = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]

    # prepare parameter grids
    param_grid_basic = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }

    param_grid_pca = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }

    # setup estimator
    estimator = ELMClassifier()

    # basic
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory,
                                    '{0}_basic.csv'.format(self_name))

        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_basic)

        # initialize results dict
        results_dict_job = param_dict_job.copy()
        # add dummy results
        results_dict_job.update({'time_fit': 0, 'time_pred': 0, 'score': 0})

        # write header
        with open(csv_filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for hls in 784 * np.array(fan_out):
            param_dict_job.update({'hidden_layer_size': hls})
            estimator.set_params(**param_dict_job)

            # run!
            time_start = time.time()
            estimator.fit(X_train, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test)
            time_pred = time.time()
            # run end!

            results_dict_job.update(estimator.get_params())

            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })

            logger.info('hidden_layer_size: {0}, score: {1}'.format(
                hls, results_dict_job['score']))

            with open(csv_filepath, 'a') as f:
                writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)

            del estimator.input_to_node._hidden_layer_state

            with open(
                    os.path.join(directory,
                                 'elmc_hls{0}_basic.pickle'.format(hls)),
                    'wb') as f:
                pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
        pass
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
        pass

    # preprocessing pca
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory, '{0}_pca.csv'.format(self_name))

        # preprocessing
        pca50 = PCA(n_components=50).fit(X_train)
        X_train_pca50, X_test_pca50 = (pca50.transform(X_train),
                                       pca50.transform(X_test))

        pca100 = PCA(n_components=100).fit(X_train)
        X_train_pca100, X_test_pca100 = (pca100.transform(X_train),
                                         pca100.transform(X_test))

        list_dict_pca = [{
            'n_components': 50,
            'X_train': X_train_pca50,
            'X_test': X_test_pca50
        }, {
            'n_components': 100,
            'X_train': X_train_pca100,
            'X_test': X_test_pca100
        }]
        logger.info('Preprocessing successful!')

        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_pca)

        # initialize results dict
        results_dict_job = param_dict_job.copy()
        # add dummy results
        results_dict_job.update({
            'time_fit': 0,
            'time_pred': 0,
            'score': 0,
            'pca_n_components': 0
        })

        # write header
        with open(csv_filepath, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for dict_pca in list_dict_pca:
            results_dict_job.update(
                {'pca_n_components': dict_pca['n_components']})
            for hls in np.concatenate(
                (100 * np.array(fan_out), 784 * np.array(fan_out)), axis=0):
                param_dict_job.update({'hidden_layer_size': hls})
                estimator.set_params(**param_dict_job)

                # run!
                time_start = time.time()
                estimator.fit(dict_pca['X_train'], y_train)
                time_fit = time.time()
                y_pred = estimator.predict(dict_pca['X_test'])
                time_pred = time.time()
                # run end!

                results_dict_job.update(estimator.get_params())

                results_dict_job.update({
                    'time_fit': time_fit - time_start,
                    'time_pred': time_pred - time_fit,
                    'score': accuracy_score(y_test, y_pred)
                })

                logger.info(
                    'n_components: {2}, hidden_layer_size: {0}, score:'
                    ' {1}'.format(hls, results_dict_job['score'],
                                  results_dict_job['pca_n_components']))

                with open(csv_filepath, 'a') as f:
                    writer = csv.DictWriter(f,
                                            fieldnames=results_dict_job.keys())
                    writer.writerow(results_dict_job)

                with open(
                        os.path.join(
                            directory, 'elmc_hls{0}_pca{1}.pickle'.format(
                                hls, results_dict_job['pca_n_components'])),
                        'wb') as f:
                    pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
        pass
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
        pass
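The pickled ELMClassifier instances written above can be loaded again for later evaluation; a short, assumed reload snippet (the file name is one produced by the basic run, and X_test is the split from above):

# Sketch: reloading one of the pickled estimators written by the basic run
import pickle
with open(os.path.join(directory, 'elmc_hls784_basic.pickle'), 'rb') as f:
    estimator_reloaded = pickle.load(f)
y_pred = estimator_reloaded.predict(X_test)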
Example #21
def silhouette_subset(directory, *args, **kwargs):
    logger = new_logger('plot_silhouette_subset', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    X /= 255.

    pca = PCA(n_components=50, whiten=False, random_state=42)

    # preprocessing
    X_pca = pca.fit_transform(X)

    # define subset sizes
    subset_sizes = [250, 500, 1000, 2000, 4000, 8000, 16000, 32000, 60000]

    # number of centroids
    k_list = [20]

    dict_results = {
        'subset_size': [],
        'k': [],
        'n_init': [],
        'silhouette_raninit': [],
        'silhouette_preinit': [],
        'fittime_raninit': [],
        'fittime_preinit': [],
        'scoretime_raninit': [],
        'scoretime_preinit': []
    }

    for k in k_list:
        # preinit
        # initial training set
        X_train, X_test, y_train, y_test = train_test_split(
            X_pca,
            y,
            random_state=42,
            train_size=subset_sizes[0],
            shuffle=True,
            stratify=y)
        clusterer_init = KMeans(n_clusters=k,
                                random_state=42,
                                init='k-means++',
                                n_init=10).fit(X_train)

        # random inits
        clusterer = KMeans(n_clusters=k, n_init=10, random_state=42)

        for subset_size in subset_sizes:
            # split on subset size
            dict_results['subset_size'].append(subset_size)
            X_train, X_test, y_train, y_test = train_test_split(
                X_pca,
                y,
                random_state=42,
                train_size=subset_size,
                shuffle=True,
                stratify=y)

            # train preinit
            t = time.time()
            clusterer_init = KMeans(n_clusters=k,
                                    random_state=42,
                                    n_init=1,
                                    init=clusterer_init.cluster_centers_)
            clusterer_init.fit_predict(X_train)
            dict_results['fittime_preinit'].append(time.time() - t)

            # score preinit
            t = time.time()
            dict_results['silhouette_preinit'].append(
                silhouette_score(X_train,
                                 clusterer_init.predict(X_train),
                                 metric='euclidean',
                                 random_state=42))
            dict_results['scoretime_preinit'].append(time.time() - t)

            # train randinit
            t = time.time()
            clusterer.fit(X_train)
            dict_results['fittime_raninit'].append(time.time() - t)

            # score raninit
            t = time.time()
            dict_results['silhouette_raninit'].append(
                silhouette_score(X_train,
                                 clusterer.predict(X_train),
                                 metric='euclidean',
                                 random_state=42))
            dict_results['scoretime_raninit'].append(time.time() - t)

            # store results
            dict_results['k'].append(k)
            dict_results['n_init'].append(clusterer.n_init)

            logger.info('silhouette (preinit) at subset size {1}: {0}'.format(
                dict_results['silhouette_preinit'][-1],
                dict_results['subset_size'][-1]))

    # save results to csv
    with open(os.path.join(directory, 'silhouette_kmeans_subset_size.csv'),
              'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in list(map(list, zip(*dict_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
    return
Example #22
def elm_coates(directory):
    self_name = 'elm_coates'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    filepath_label_encoder = os.path.join(
        directory, 'label_encoder_{0}.pickle'.format(self_name))

    # save label_encoder
    try:
        with open(filepath_label_encoder, 'wb') as f:
            pickle.dump(label_encoder, f)
    except Exception as e:
        logger.error('Unexpected error: {0}'.format(e))
        exit(1)

    # scale X so X in [0, 1]
    X /= 255.

    X_train, X_test, y_train, y_test = (X[:train_size, ...], X[train_size:],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])

    csv_filepath = os.path.join(directory, '{0}.csv'.format(self_name))

    # read input matrices from files
    list_filepaths = []
    for filepath in glob.glob(
            os.path.join(directory, '*pca*+kmeans*_matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        filename = os.path.splitext(os.path.basename(filepath))[0]

        est_filepath = os.path.join(directory,
                                    'est_coates-{0}.pickle'.format(filename))
        pred_filpath = os.path.join(
            directory, 'est_coates-{0}-predicted.npz'.format(filename))

        # only if files do not exist yet
        if (not os.path.isfile(csv_filepath)
                or not os.path.isfile(est_filepath)
                or not os.path.isfile(pred_filpath)):
            # setup estimator
            estimator = ELMClassifier(
                input_to_node=PredefinedWeightsInputToNode(
                    predefined_input_weights=np.load(filepath),
                    input_scaling=1.0,
                    bias_scaling=0.0,
                    input_activation='relu',
                    random_state=42),
                chunk_size=1000)
            logger.info('Estimator params: {0}'.format(
                estimator.get_params().keys()))

            # !run
            time_start = time.time()
            estimator.fit(X_train, y_train)
            time_fitted = time.time()
            y_pred = estimator.predict(X_test)
            time_predicted = time.time()
            # !run

            # results
            dict_results = estimator.get_params()
            dict_results.update({
                'filename': filename,
                'fit_time': time_fitted - time_start,
                'score_time': time_predicted - time_fitted,
                'score': accuracy_score(y_test, y_pred)
            })

            # drop data
            dict_results.pop('input_to_nodes__predefined_input_weights')
            dict_results.pop('input_to_nodes')
            dict_results.pop('regressor')

            logger.info('fitted time {1}, score on test set: {0}'.format(
                dict_results['score'], dict_results['fit_time']))

            # save estimator
            try:
                with open(est_filepath, 'wb') as f:
                    pickle.dump(estimator, f)
            except Exception as e:
                logger.error('Unexpected error: {0}'.format(e))
                exit(1)

            # save results
            try:
                if not os.path.isfile(csv_filepath):
                    with open(csv_filepath, 'a') as f:
                        f.write(','.join(dict_results.keys()))
                        f.write('\n')
                        f.write(','.join(
                            [str(item) for item in dict_results.values()]))
                        f.write('\n')
                else:
                    with open(csv_filepath, 'a') as f:
                        f.write(','.join(
                            [str(item) for item in dict_results.values()]))
                        f.write('\n')
            except PermissionError as e:
                logger.error('Missing privileges: {0}'.format(e))

            # save prediction
            np.savez_compressed(pred_filepath,
                                X_test=X_test,
                                y_test=label_encoder.inverse_transform(y_test),
                                y_pred=label_encoder.inverse_transform(y_pred))

    if not list_filepaths:
        logger.warning('no input weights matrices found')
        return
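# For later analysis, the pickled estimator and the compressed prediction
# archive written above can be reloaded; a minimal sketch (the helper name is
# hypothetical, the filenames simply mirror the ones built in the loop above).
import os
import pickle

import numpy as np
from sklearn.metrics import accuracy_score


def reload_and_score(directory, filename):
    est_filepath = os.path.join(directory,
                                'est_coates-{0}.pickle'.format(filename))
    pred_filepath = os.path.join(
        directory, 'est_coates-{0}-predicted.npz'.format(filename))

    with open(est_filepath, 'rb') as f:
        estimator = pickle.load(f)

    archive = np.load(pred_filepath)
    # recompute the test accuracy from the stored labels and predictions
    return estimator, accuracy_score(archive['y_test'], archive['y_pred'])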
Exemple #23
0
def test_get_mnist() -> None:
    X, y = get_mnist(os.getcwd())
    assert X.shape[0] == 70000
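# `get_mnist` itself is not shown in these snippets; a minimal sketch of what it
# plausibly wraps (an assumption, not the original helper): fetch MNIST via
# scikit-learn's fetch_openml and cache the download in `directory`.
import numpy as np
from sklearn.datasets import fetch_openml


def get_mnist_sketch(directory=None):
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True,
                        as_frame=False, data_home=directory)
    return X.astype(np.float64), y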
Exemple #24
0
def elm_hyperparameters(directory):
    self_name = 'elm_hyperparameters'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    X = X / 255.

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y_encoded, train_size=train_size, random_state=42, shuffle=True)
    X_train, _, y_train, _ = (X[:train_size, :], X[train_size:, :],
                              y_encoded[:train_size], y_encoded[train_size:])

    param_grid = {
        'hidden_layer_size': [2000],
        'input_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
        'bias_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
        'input_activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    estimator = ELMClassifier(regressor=Ridge())
    cv = GridSearchCV(estimator,
                      param_grid,
                      cv=5,
                      n_jobs=-1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_scaling.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')

    param_grid = {
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'input_scaling': [cv.best_params_['input_scaling']],
        'bias_scaling': [cv.best_params_['bias_scaling']],
        'input_activation':
        ['tanh', 'relu', 'bounded_relu', 'logistic', 'identity'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    cv = GridSearchCV(estimator,
                      param_grid,
                      cv=5,
                      n_jobs=-1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_size.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')

    param_grid = {
        'hidden_layer_size': [cv.best_params_['hidden_layer_size']],
        'input_scaling': [cv.best_params_['input_scaling']],
        'bias_scaling': [cv.best_params_['bias_scaling']],
        'input_activation': [cv.best_params_['input_activation']],
        'alpha': [.00001, .001, .1],
        'random_state': [42]
    }

    cv = GridSearchCV(estimator,
                      param_grid,
                      cv=5,
                      n_jobs=1,
                      scoring='accuracy')
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    cv_results = cv.cv_results_
    del cv_results['params']
    with open(os.path.join(directory, '{0}_alpha.csv'.format(self_name)),
              'w') as f:
        f.write(','.join(cv_results.keys()) + '\n')
        for row in list(map(list, zip(*cv_results.values()))):
            f.write(','.join(map(str, row)) + '\n')
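# The three CSV files written by this coarse-to-fine search can be inspected
# afterwards; a minimal sketch using pandas (hypothetical helper; column names
# such as 'mean_test_score' come from sklearn's cv.cv_results_).
import os

import pandas as pd


def print_best_rows(directory, self_name='elm_hyperparameters'):
    for suffix in ('scaling', 'size', 'alpha'):
        df = pd.read_csv(
            os.path.join(directory, '{0}_{1}.csv'.format(self_name, suffix)))
        # row with the highest mean cross-validated accuracy
        print(suffix, df.loc[df['mean_test_score'].idxmax()].to_dict())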
Exemple #25
0
def train_kmeans(directory):
    self_name = 'train_kmeans'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # scale X, so $X \in [0, 1]$
    X /= 255.

    list_n_components = [50]  # [50, 100]
    # [20, 50, 100, 200, 500, 1000, 2000, 4000, 8000, 16000]
    list_n_clusters = [200]

    for n_components in list_n_components:
        pca = PCA(n_components=n_components, random_state=42).fit(X)
        X_pca = pca.transform(X)
        logger.info('pca{0}: explained variance ratio = {1}'.format(
            n_components, np.sum(pca.explained_variance_ratio_)))

        for n_clusters in list_n_clusters:
            # minibatch kmeans
            kmeans_basename = 'minibatch-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)

            # only if file does not exist
            if not os.path.isfile(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename))):
                clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                            init='k-means++',
                                            random_state=42,
                                            batch_size=5000,
                                            n_init=5).fit(X_pca)
                np.save(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename)),
                    np.dot(pca.components_.T, clusterer.cluster_centers_.T))

                # assemble pipeline
                p = make_pipeline(pca, clusterer)
                with open(
                        os.path.join(
                            directory,
                            '{0}_pipeline.pickle'.format(kmeans_basename)),
                        'wb') as f:
                    pickle.dump(p, f)

                logger.info(
                    'successfully trained MiniBatchKMeans '
                    'and saved to npy/pickle {0}'.format(kmeans_basename))

            # original kmeans
            kmeans_basename = 'original-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)

            if n_clusters < 2000 and not os.path.isfile(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename))):
                clusterer = KMeans(n_clusters=n_clusters,
                                   init='k-means++',
                                   random_state=42,
                                   n_init=5).fit(X_pca)
                np.save(
                    os.path.join(directory,
                                 '{0}_matrix.npy'.format(kmeans_basename)),
                    np.dot(pca.components_.T, clusterer.cluster_centers_.T))

                # assemble pipeline
                p = make_pipeline(pca, clusterer)
                with open(
                        os.path.join(
                            directory,
                            '{0}_pipeline.pickle'.format(kmeans_basename)),
                        'wb') as f:
                    pickle.dump(p, f)

                logger.info('successfully trained KMeans and saved to '
                            'npy/pickle {0}'.format(kmeans_basename))
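# The '*_matrix.npy' files written above hold the cluster centres projected
# back to pixel space, np.dot(pca.components_.T, cluster_centers_.T), i.e. one
# 784-dimensional column per cluster. That is the layout the Coates-style ELM
# snippets in this file expect as predefined input weights; a minimal sketch of
# loading one of them (basename assumed from the loop above; ELMClassifier /
# PredefinedWeightsInputToNode as used elsewhere in this file).
import os

import numpy as np


def load_coates_weights(directory, basename='minibatch-pca50+kmeans200'):
    weights = np.load(
        os.path.join(directory, '{0}_matrix.npy'.format(basename)))
    # expected shape: (784, n_clusters)
    return weights


# estimator = ELMClassifier(
#     input_to_node=PredefinedWeightsInputToNode(
#         predefined_input_weights=load_coates_weights(directory),
#         input_activation='relu',
#         random_state=42),
#     chunk_size=1000)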
Exemple #26
0
def main():
    runtime = [time.time()]
    X, y = get_mnist()
    runtime.append(time.time())
    print('fetch: {0} s'.format(np.diff(runtime[-2:])))

    scaler = StandardScaler()
    whitener = PCA(50, random_state=42)

    X /= 255.
    # X, y = X[:1000, :], y[:1000]

    X_preprocessed = whitener.fit_transform(X)
    runtime.append(time.time())
    print('preprocessing: {0} s'.format(np.diff(runtime[-2:])))

    cls = KMeans(n_clusters=10, random_state=42).fit(X_preprocessed)
    runtime.append(time.time())
    print('clustering: {0} s'.format(np.diff(runtime[-2:])))

    samples, values = get_unique(X_preprocessed, y)

    # define row-wise Euclidean norm
    def p2norm(x):
        return np.linalg.norm(x, axis=1, ord=2)

    # reconstruct cluster centers
    cluster_centers = whitener.inverse_transform(cls.cluster_centers_)
    cluster_center_norm = p2norm(cluster_centers)

    # normed_samples = (samples.T / np.linalg.norm(samples, axis=1, ord=2)).T
    # cosine similarity between each unique digit and each cluster centre
    cos_similarity = np.divide(
        np.dot(samples, cls.cluster_centers_.T),
        np.outer(p2norm(samples), p2norm(cls.cluster_centers_)))

    runtime.append(time.time())
    print('calculations: {0} s'.format(np.diff(runtime[-2:])))

    # display digits
    fig = plt.figure(figsize=(6, 3))
    gs_cyphers = gridspec.GridSpec(2,
                                   5,
                                   figure=fig,
                                   wspace=.4,
                                   hspace=.3,
                                   top=.97,
                                   bottom=.1,
                                   left=.07,
                                   right=.95)

    for i in range(10):
        gs_cypher = gridspec.GridSpecFromSubplotSpec(
            2,
            1,
            subplot_spec=gs_cyphers[i],
            height_ratios=[1., .6],
            hspace=.05)

        ax_centroid = fig.add_subplot(gs_cypher[0,
                                                0])  # axs[(i // 5) * 2, i % 5]
        ax_barchart = fig.add_subplot(
            gs_cypher[1, 0])  # axs[(i // 5) * 2 + 1, i % 5]

        ax_centroid.imshow(cluster_centers[i, :].reshape(28, 28),
                           interpolation='none')
        ax_centroid.tick_params(left=False,
                                bottom=False,
                                labelleft=False,
                                labelbottom=False)

        ax_barchart.bar(list(map(int, values)),
                        cos_similarity[:, i],
                        tick_label=values,
                        color=tud_colors['lightblue'])
        # ax_barchart.set_xlim([0, 9])
        ax_barchart.grid(which='both', axis='y')
        ax_barchart.set_yticks([-1., 0., 1.], minor=False)
        ax_barchart.set_yticks([-.5, .5], minor=True)
        ax_barchart.set_ylim([-1., 1.])

    # plt.tight_layout()
    plt.savefig(
        'mnist-kmeans-centroids-cos-similarity-pca50.pdf')  # plt.show()
    # plt.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-pca50-kmeans-centroids-cos-similarity.pgf'), format='pgf')
    runtime.append(time.time())
    print('plotting: {0} s'.format(np.diff(runtime[-2:])))
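# `get_unique` is not shown in these snippets; a minimal sketch of what it
# plausibly does, judging from its use above (an assumption): return one sample
# per distinct label, together with the sorted label values.
import numpy as np


def get_unique_sketch(X, y):
    values = np.unique(y)
    # first occurrence of each label
    idx = [np.flatnonzero(y == value)[0] for value in values]
    return X[idx, :], values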
Exemple #27
0
def elm_coates_stacked(directory):
    self_name = 'elm_coates_stacked'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X so X in [0, 1]
    X /= 255.

    # setup parameter grid
    param_grid = {
        'chunk_size': [10000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=3),
        'bias_scaling': [0.],  # np.logspace(start=-3, stop=1, base=10, num=6),
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    # read input matrices from files
    list_filepaths = []
    predefined_input_weights = np.empty((784, 0))
    for filepath in glob.glob(os.path.join(directory, '*kmeans1*matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        predefined_input_weights = np.append(predefined_input_weights,
                                             np.load(filepath),
                                             axis=1)

    # setup estimator
    estimator = ELMClassifier(
        PredefinedWeightsInputToNode(
            predefined_input_weights=predefined_input_weights),
        IncrementalRegression())
    logger.info('Estimator params: {0}'.format(estimator.get_params().keys()))
    # return

    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=1,
                      cv=[(np.arange(0,
                                     train_size), np.arange(train_size,
                                                            70000))])

    # run!
    cv.fit(X, y_encoded)
    cv_best_params = cv.best_params_
    del cv_best_params['input_to_nodes__predefined_input_weights']

    # refine best params
    logger.info('best parameters: {0} (score: {1})'.format(
        cv_best_params, cv.best_score_))

    # refine results
    cv_results = cv.cv_results_
    del cv_results['params']
    del cv_results['param_input_to_nodes__predefined_input_weights']

    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in list(map(list, zip(*cv_results.values()))):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))

    if not list_filepaths:
        logger.warning('no input weights matrices found')
        return
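# The cv=[(train_indices, test_indices)] argument above replaces k-fold CV with
# a single predefined split: the first `train_size` records are always used for
# fitting, the remainder for scoring. A minimal, equivalent sketch with
# sklearn's PredefinedSplit (train_size is the module-level constant used
# throughout these snippets).
import numpy as np
from sklearn.model_selection import PredefinedSplit

test_fold = np.full(70000, -1, dtype=int)  # -1: always in the training fold
test_fold[train_size:] = 0                 # 0: member of the single test fold
single_split = PredefinedSplit(test_fold)
# single_split can be passed to GridSearchCV via its `cv` parameter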
Exemple #28
0
def input2node_distribution(directory):
    X, y = get_mnist(directory)

    X /= 255.

    pca = PCA(n_components=784).fit(X)
    X_pca = np.matmul(X, pca.components_.T)

    list_activation = ['tanh', 'relu', 'bounded_relu']
    list_train = [X, X_pca]

    fig, axs = plt.subplots(nrows=2, ncols=3)

    for idx_activation in range(len(list_activation)):
        activation = list_activation[idx_activation]

        for idx_train in range(len(list_train)):
            ax = axs[idx_train, idx_activation]
            train = list_train[idx_train]

            if activation == 'tanh':
                i2n = InputToNode(hidden_layer_size=1,
                                  random_state=82,
                                  input_scaling=50 / 784,
                                  bias_scaling=0.,
                                  activation=activation)
            elif activation in ['relu', 'bounded_relu']:
                i2n = InputToNode(hidden_layer_size=1,
                                  random_state=82,
                                  input_scaling=1.,
                                  bias_scaling=0.,
                                  activation=activation)

            node_out = i2n.fit_transform(train, y)
            hist, bin_edges = np.histogram(node_out, bins=20, density=True)

            # drop (near-)empty bins; np.delete returns a copy, so assign it
            bin_edges = np.delete(bin_edges[:-1], hist <= 1e-3)
            hist = np.delete(hist, hist <= 1e-3)

            if activation == 'bounded_relu':
                ax.hist(node_out,
                        label=activation,
                        density=True,
                        bins=[.0, .1, .9, 1.],
                        color=tud_colors['lightblue'])
            else:
                ax.hist(node_out,
                        label=activation,
                        density=True,
                        bins=20,
                        color=tud_colors['lightblue'])

            ax.grid(axis='y')
            ax.set_yscale('log')

            x_ticks = np.min(node_out), np.max(node_out)
            ax.set_xlim(x_ticks)

            if activation == 'tanh':
                x_ticks += (0.0, )
            ax.set_xticks(x_ticks)
            ax.set_xticklabels(
                ['{0:.1f}'.format(x_tick) for x_tick in x_ticks])

    axs[0, 0].set_title('tanh, orig.')
    axs[0, 1].set_title('relu, orig.')
    axs[0, 2].set_title('b. relu, orig.')
    axs[1, 0].set_title('tanh, pca')
    axs[1, 1].set_title('relu, pca')
    axs[1, 2].set_title('b. relu, pca')

    # plt.tight_layout()
    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'node-out.pdf'), format='pdf')
    fig.savefig(os.path.join(directory, 'node-out.eps'), format='eps')
    plt.rc('pgf', texsystem='pdflatex')
Exemple #29
0
def picture_gradient(directory):
    self_name = 'picture_gradient'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X so X in [0, 1]
    X /= 255.

    # reshape X
    X_images = X.reshape((X.shape[0], 28, 28))

    list_kernels = [{
        'name':
        'laplace',
        'kernel':
        np.array([[-1., -1., -1.], [-1., 8, -1.], [-1., -1., -1.]])
    }, {
        'name':
        'mexicanhat',
        'kernel':
        np.array([[0., 0., -1., 0., 0.], [0., -1., -2., -1., 0.],
                  [-1., -2., 16, -2., -1.], [0., -1., -2., -1., 0.],
                  [0., 0., -1., 0., 0.]])
    }, {
        'name':
        'v_prewitt',
        'kernel':
        np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]])
    }, {
        'name':
        'h_prewitt',
        'kernel':
        np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]]).T
    }, {
        'name':
        'v_sobel',
        'kernel':
        np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]])
    }, {
        'name':
        'h_sobel',
        'kernel':
        np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]]).T
    }]

    example_image_idx = 5

    fig, axs = plt.subplots(1, 4, figsize=(6, 2))
    axs[0].imshow(X_images[example_image_idx],
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[0].set_title('no filter')
    axs[1].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[0]['kernel'],
                             mode='same'),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[1].set_title('laplace')
    axs[2].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[2]['kernel'],
                             mode='same'),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[2].set_title('vertical\nprewitt')
    axs[3].imshow(convolve2d(X_images[example_image_idx],
                             list_kernels[5]['kernel'],
                             mode='same'),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    axs[3].set_title('horizontal\nsobel')

    for ax in axs:
        ax.set_xticks([0, 27])
        ax.set_xticklabels([0, 27])
        ax.set_yticks([0, 27])
        ax.set_yticklabels([0, 27])

    fig.tight_layout()
    fig.savefig(os.path.join(directory, 'mnist-image-filters.pdf'),
                format='pdf')
    fig.savefig(os.path.join(os.environ['PGFPATH'], 'mnist-image-filters.pgf'),
                format='pgf')
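# Several snippets above write a PGF copy into os.environ['PGFPATH'], which
# raises a KeyError when that variable is not set; a small guard, assuming
# `fig` is the figure created above:
import os

pgf_path = os.environ.get('PGFPATH')
if pgf_path is not None:
    fig.savefig(os.path.join(pgf_path, 'mnist-image-filters.pgf'),
                format='pgf')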