コード例 #1
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def silhouette_kcluster(directory, *args, **kwargs):
    """Record silhouette scores and fit times for several cluster counts.

    The data returned by ``get_mnist(directory)`` is divided by 255 in
    place and projected onto 50 principal components. For every cluster
    count a ``KMeans`` model is fitted twice on the projected data, and
    its labelling is scored once with the euclidean and once with the
    cosine silhouette metric. All measurements are written to
    ``silhouette_kcluster.csv`` inside ``directory``.

    Extra positional and keyword arguments are accepted and ignored.
    """
    logger = new_logger('plot_silhouette_kcluster', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    X /= 255.

    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)

    # PCA preprocessed
    X_pca = pca.transform(X)

    # The column order defines the CSV layout.
    column_names = (
        'n_clusters',
        'pca_n_components',
        'pca_expl_var',
        'pca_expl_var_ratio',
        'silhouette_kcosine',
        'silhouette_kmeans',
        'fittime_kcosine',
        'fittime_kmeans',
    )
    dict_results = {name: [] for name in column_names}

    # (column suffix, silhouette metric), evaluated in this order.
    # NOTE(review): both variants fit an identical KMeans model and are
    # scored on X rather than X_pca - confirm that this is intended.
    variants = (('kmeans', 'euclidean'), ('kcosine', 'cosine'))

    for n_clusters in [10, 15, 20, 25, 30, 35, 50, 100, 200]:
        dict_results['n_clusters'].append(n_clusters)
        dict_results['pca_n_components'].append(pca.n_components_)
        dict_results['pca_expl_var'].append(np.sum(pca.explained_variance_))
        dict_results['pca_expl_var_ratio'].append(
            np.sum(pca.explained_variance_ratio_))

        for suffix, metric in variants:
            clusterer = KMeans(n_clusters=n_clusters, random_state=42)
            started = time.time()
            clusterer.fit(X_pca)
            dict_results['fittime_' + suffix].append(time.time() - started)

            score = silhouette_score(X,
                                     clusterer.predict(X_pca),
                                     metric=metric,
                                     random_state=42)
            dict_results['silhouette_' + suffix].append(score)

    # save results to csv: one header line, then one line per cluster count
    csv_path = os.path.join(directory, 'silhouette_kcluster.csv')
    with open(csv_path, 'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for record in zip(*dict_results.values()):
            f.write(','.join(str(value) for value in record) + '\n')
コード例 #2
0
def plot_variance_mean(directory, *args, **kwargs):
    """Plot per-pixel mean and variance images of the centred MNIST data.

    The data returned by ``get_mnist(directory)`` is mean-centred per
    feature and divided by 255. A second ``StandardScaler`` is fitted on
    that result, and its ``mean_`` and ``var_`` are rendered side by side
    as 28x28 images. The figure is saved as a PDF in ``directory`` and as
    a PGF file in the directory named by the ``PGFPATH`` environment
    variable.

    Extra positional and keyword arguments are accepted and ignored.

    Raises:
        KeyError: if the ``PGFPATH`` environment variable is not set.
    """
    logger = new_logger('plot_variance_mean', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    image_size = (28, 28)

    # Centre every feature first, rescale, then collect the statistics of
    # the rescaled data with a second scaler.
    scaler = StandardScaler().fit(
        StandardScaler(with_std=False).fit_transform(X) / 255)

    fig, axs = plt.subplots(1, 2, figsize=(8, 4))

    axs[0].imshow(np.resize(scaler.mean_, image_size),
                  cmap=plt.cm.gray_r,
                  interpolation='none')
    # The variance goes on the second panel (titled sigma^2 below).
    # Drawing it on axs[0] would hide the mean image and leave axs[1] empty.
    axs[1].imshow(np.resize(scaler.var_, image_size),
                  cmap=plt.cm.gray_r,
                  interpolation='none')

    axs[0].set_title(r'$\mu$')
    axs[1].set_title(r'$\sigma^2$')

    fig.tight_layout()
    fig.savefig(
        os.path.join(directory, 'mnist-pixel-variance-and-mean-avgfree.pdf'))
    fig.savefig(os.path.join(os.environ['PGFPATH'],
                             'mnist-pixel-variance-and-mean-avgfree.pgf'),
                format='pgf')
    logger.info('np.max(scaler.mean_) = {0}, np.max(scaler.var_) = {1}'.format(
        np.max(scaler.mean_), np.max(scaler.var_)))
    return
コード例 #3
0
def main(directory, params=()):
    """Run the requested dataset-comparison experiments.

    ``directory`` is created when it does not exist yet; a
    ``PermissionError`` while doing so is printed and the process exits
    with status 1. Results go to the ``compare_datasets`` subfolder, which
    is also passed to every experiment. Each entry of ``params`` is looked
    up in the experiment registry; unknown entries only produce a warning.
    """
    # workdir
    if not os.path.isdir(directory):
        try:
            os.mkdir(directory)
        except PermissionError as e:
            print('mkdir failed due to missing privileges: {0}'.format(e))
            exit(1)

    # subfolder for results
    results_dir = os.path.join(directory, 'compare_datasets')
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    logger = new_logger('main', directory=results_dir)
    start_message = 'Started main with directory={0} and params={1}'.format(
        directory, params)
    logger.info(start_message)

    # registry: parameter name -> experiment callable
    experiment_names = {'dataset_imbalance': dataset_imbalance}

    # run specified programs
    for requested in params:
        experiment = experiment_names.get(requested)
        if experiment is None:
            logger.warning(
                'Parameter {0} invalid/not found.'.format(requested))
            continue
        experiment(results_dir)
コード例 #4
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_bip(directory):
    """Grid-search an ELM classifier with batch intrinsic plasticity.

    The labels from ``get_mnist(directory)`` are integer-encoded and the
    features are divided by 255 in place. A single predefined split (the
    first ``train_size`` rows for training, the rows up to index 70000 for
    validation) is used to score every parameter combination. The search
    results, without the ``params`` column, are written to ``elm_bip.csv``
    in ``directory``; a ``PermissionError`` while writing is only printed.
    """
    self_name = 'elm_bip'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    n_records = X.shape[0]
    logger.info(
        'Loaded MNIST successfully with {0} records'.format(n_records))

    y_encoded = LabelEncoder().fit(y).transform(y)

    # preprocessing
    X /= 255.
    pca = PCA(n_components=50).fit(X)
    X_preprocessed = pca.transform(X)
    n_features = X_preprocessed.shape[1]
    logger.info(
        '{0} features remaining after preprocessing.'.format(n_features))
    # NOTE(review): X_preprocessed is only used for the log line above; the
    # grid search below is fitted on X - confirm that this is intended.

    # prepare parameter grid
    param_grid = [{
        'hidden_layer_size': [500, 1000, 2000, 4000],
        'activation': ['tanh'],
        'alpha': [1e-5],
        'random_state': [42]
    }]

    # setup estimator
    estimator = ELMClassifier(input_to_node=BatchIntrinsicPlasticity(),
                              regressor=Ridge())

    # one fixed train/validation split instead of k-fold cross validation
    train_indices = np.arange(0, train_size)
    validation_indices = np.arange(train_size, 70000)

    # setup grid search
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=[(train_indices, validation_indices)])

    # run!
    cv.fit(X, y_encoded)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    # refine results: drop the column holding the parameter dicts
    cv_results = cv.cv_results_
    del cv_results['params']

    # save results
    csv_path = os.path.join(directory, '{0}.csv'.format(self_name))
    try:
        with open(csv_path, 'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for record in zip(*cv_results.values()):
                f.write(','.join(str(value) for value in record) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
コード例 #5
0
def plot_imbalance(directory):
    """Annotate the relative frequency of every label and save the figure.

    The labels from ``get_mnist(directory)`` are cast to ``int`` and
    counted. For each label a percentage text is placed at height 3500,
    and the axes are configured (limits, ticks, one dashed minor grid line
    at 7000, hidden top/right spines). The figure is saved as a PGF file in
    the directory named by the ``PGFPATH`` environment variable and as a
    PDF in ``directory``.

    Raises:
        KeyError: if the ``PGFPATH`` environment variable is not set.
    """
    self_name = 'plot_imbalance'
    logger = new_logger(self_name, directory)
    X, y = get_mnist(directory)
    logger.info('successfully fetched {0} datapoints'.format(X.shape[0]))

    label_values, label_counts = np.unique(y.astype(int), return_counts=True)
    order = np.argsort(label_values)
    y_unique = label_values[order]
    y_counts = label_counts[order]

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6., 2.1))

    # NOTE(review): no bar artists are created in this function, so the
    # legend requested further down has no labelled entries - confirm
    # whether a bar plot call is missing here.
    for position, count in enumerate(y_counts):
        share = count / np.sum(y_counts) * 100
        plt.text(position * 1.,
                 3500,
                 '{0:.1f}%'.format(share),
                 color=(1., 1., 1., .2),
                 fontsize='small',
                 horizontalalignment='center')

    # x axis: one tick per label
    ax.set_xlim([-.5, 9.5])
    ax.set_xticks(y_unique)
    ax.set_xticklabels(['{0:.0f}'.format(label) for label in y_unique])
    ax.set_xlabel('label')
    ax.tick_params(axis='x', which='both', bottom=False, top=False)

    # y axis: a single dashed reference line at 7000
    ax.set_ylim([0, 8000])
    ax.set_yticks([7000], minor=True)
    ax.grid(which='minor',
            axis='y',
            alpha=.7,
            linestyle='--',
            color=tud_colors['lightgreen'])
    ax.set_ylabel(r'\#occurrences')

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    ax.legend(bbox_to_anchor=(1, .5), loc='center left')
    fig.tight_layout()

    pgf_path = os.path.join(os.environ['PGFPATH'],
                            '{0}.pgf'.format(self_name))
    fig.savefig(pgf_path, format='pgf')
    pdf_path = os.path.join(directory, '{0}.pdf'.format(self_name))
    fig.savefig(pdf_path, format='pdf')
コード例 #6
0
def main(out_directory=os.getcwd(), param_list=None):
    """Check that the required modules are loaded and run one ELM test.

    A logger writing to ``out_directory`` is created, then each of
    ``scipy``, ``numpy``, ``sklearn`` and ``pyrcn`` is looked up in
    ``sys.modules`` and reported as loaded (info) or missing (error).
    Finally the iris ensemble regression test from PyRCN's ELM test
    module is executed.

    Args:
        out_directory: directory handed to ``new_logger``. The default is
            evaluated once, when the function is defined, so it is the
            working directory at import time and not at call time.
        param_list: accepted but not used by this function.
    """
    logger = new_logger('main', directory=out_directory)
    logger.info('Created logger successfully')

    for module_name in ['scipy', 'numpy', 'sklearn', 'pyrcn']:
        if module_name not in sys.modules:
            logger.error('Module {0} was not loaded'.format(module_name))
        else:
            logger.info('Module {0} loaded'.format(module_name))

    # Imported here so the module checks above are logged before the test
    # module is loaded.
    from pyrcn.extreme_learning_machine.tests import test_elm
    test_elm.test_iris_ensemble_iterative_regression()

    # Log message typo fixed ('fished' -> 'finished').
    logger.info('Test run finished')
    return
コード例 #7
0
def main(out_path=os.path.join(os.getcwd(), 'preprocessing-mnist'),
         function_name='labels'):
    """Fetch MNIST and run one named preprocessing plot function.

    ``out_path`` is created when missing (an ``OSError`` is only printed).
    The time needed by ``get_mnist()`` and the shapes of the returned
    arrays are logged. ``function_name`` selects the plot function, which
    receives ``out_path`` as its only argument; an unknown name only
    produces a warning.
    """
    if not os.path.exists(out_path):
        try:
            os.makedirs(out_path)
        except OSError as error:
            print(error)

    logger = new_logger('main')
    logger.info('{0} called, entering main'.format(__file__))

    # fetch data and measure how long it takes
    fetch_started = time.time()
    X, y = get_mnist()
    fetch_finished = time.time()

    elapsed = np.diff([fetch_started, fetch_finished])
    logger.info('fetch: {0} s'.format(elapsed))
    logger.info('X.shape = {0}, y.shape = {1}'.format(X.shape, y.shape))

    # name -> plot function
    function_dict = {
        'labels': plot_labels,
        'plot_pooling': plot_pooling,
        'plot_poster': plot_poster,
        'histogram': plot_historgram,
        'var': plot_var,
        'normalized': plot_normalized,
        'variance_mean': plot_variance_mean,
        'image_min_var': plot_image_min_var,
        'plot_pca': plot_pca,
        'plot_covariance': plot_covariance,
        'plot_imbalance': plot_imbalance,
        'plot_img_cluster': plot_img_cluster,
    }

    if function_name not in function_dict:
        logger.warning('no function {0} found'.format(function_name))
    else:
        selected = function_dict[function_name]
        selected(out_path)

    logger.info('{0} finished, return from main'.format(__file__))
コード例 #8
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def main(directory, params):
    """Run the requested MNIST ELM experiments.

    ``directory`` is created when it does not exist yet; a
    ``PermissionError`` while doing so is printed and the process exits
    with status 1. Results go to the ``mnist-elm`` subfolder, which is
    also passed to every experiment. Each entry of ``params`` is looked up
    in the experiment registry; unknown entries only produce a warning.
    """
    # workdir
    if not os.path.isdir(directory):
        try:
            os.mkdir(directory)
        except PermissionError as e:
            print('mkdir failed due to missing privileges: {0}'.format(e))
            exit(1)

    # subfolder for results
    results_dir = os.path.join(directory, 'mnist-elm')
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    logger = new_logger('main', directory=results_dir)
    start_message = 'Started main with directory={0} and params={1}'.format(
        directory, params)
    logger.info(start_message)

    # registry: parameter name -> experiment callable
    experiment_names = {
        'train_kmeans': train_kmeans,
        'elm_hyperparameters': elm_hyperparameters,
        'elm_basic': elm_basic,
        'elm_pca': elm_pca,
        'elm_preprocessed': elm_preprocessed,
        'elm_random_state': elm_random_state,
        'elm_hidden_layer_size': elm_hidden_layer_size,
        'elm_coates': elm_coates,
        'elm_coates_stacked': elm_coates_stacked,
        'silhouette_n_clusters': silhouette_n_clusters,
        'silhouette_subset': silhouette_subset,
        'silhouette_kcluster': silhouette_kcluster,
        'silhouette_features': silhouette_features
    }

    # run specified programs
    for requested in params:
        experiment = experiment_names.get(requested)
        if experiment is None:
            logger.warning(
                'Parameter {0} invalid/not found.'.format(requested))
            continue
        experiment(results_dir)
コード例 #9
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_coates(directory):
    """Fit and evaluate one ELM per precomputed input-weight matrix.

    The labels from ``get_mnist(directory)`` are integer-encoded (the
    fitted encoder is pickled to ``directory``) and the features are
    divided by 255 in place. The first ``train_size`` rows are used for
    training, the rest for testing.

    For every file in ``directory`` matching ``*pca*+kmeans*_matrix.npy``
    an ``ELMClassifier`` with those predefined input weights is fitted,
    unless the results CSV, the pickled estimator and the prediction
    archive for that matrix all exist already. Per matrix the function
    writes:

    * ``est_coates-<name>.pickle`` - the fitted estimator,
    * one row in ``elm_coates.csv`` - estimator parameters, timings and
      test accuracy (the header is written when the file is created),
    * ``est_coates-<name>-predicted.npz`` - test data, true labels and
      predicted labels.

    Failing to pickle the label encoder or an estimator is logged and
    terminates the process with status 1. A ``PermissionError`` while
    writing the CSV is only printed. If no matrix file is found a warning
    is logged.
    """
    self_name = 'elm_coates'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    filepath_label_encoder = os.path.join(
        directory, 'label_encoder_{0}.pickle'.format(self_name))

    # save label_encoder
    try:
        with open(filepath_label_encoder, 'wb') as f:
            pickle.dump(label_encoder, f)
    except Exception as e:
        logger.error('Unexpected error: {0}'.format(e))
        exit(1)

    # scale X so X in [0, 1]
    X /= 255.

    X_train, X_test, y_train, y_test = (X[:train_size, ...], X[train_size:],
                                        y_encoded[:train_size],
                                        y_encoded[train_size:])

    csv_filepath = os.path.join(directory, '{0}.csv'.format(self_name))

    # Parameter entries that hold objects or the raw weight matrix and must
    # not end up in the CSV. The estimator is constructed with the keyword
    # ``input_to_node``, so that spelling is listed next to the
    # ``input_to_nodes`` spelling. Missing keys are skipped instead of
    # raising KeyError.
    dropped_params = (
        'input_to_node__predefined_input_weights',
        'input_to_nodes__predefined_input_weights',
        'input_to_node',
        'input_to_nodes',
        'regressor',
    )

    # read input matrices from files
    list_filepaths = []
    for filepath in glob.glob(
            os.path.join(directory, '*pca*+kmeans*_matrix.npy')):
        logger.info('matrix file found: {0}'.format(filepath))
        list_filepaths.append(filepath)
        filename = os.path.splitext(os.path.basename(filepath))[0]

        est_filepath = os.path.join(directory,
                                    'est_coates-{0}.pickle'.format(filename))
        pred_filepath = os.path.join(
            directory, 'est_coates-{0}-predicted.npz'.format(filename))

        # nothing to do when every output for this matrix exists already
        if (os.path.isfile(csv_filepath) and os.path.isfile(est_filepath)
                and os.path.isfile(pred_filepath)):
            continue

        # setup estimator
        estimator = ELMClassifier(
            input_to_node=PredefinedWeightsInputToNode(
                predefined_input_weights=np.load(filepath),
                input_scaling=1.0,
                bias_scaling=0.0,
                input_activation='relu',
                random_state=42),
            chunk_size=1000)
        logger.info('Estimator params: {0}'.format(
            estimator.get_params().keys()))

        # !run
        time_start = time.time()
        estimator.fit(X_train, y_train)
        time_fitted = time.time()
        y_pred = estimator.predict(X_test)
        time_predicted = time.time()
        # !run

        # results
        dict_results = estimator.get_params()
        dict_results.update({
            'filename': filename,
            'fit_time': time_fitted - time_start,
            'score_time': time_predicted - time_fitted,
            'score': accuracy_score(y_test, y_pred)
        })

        # drop data
        for key in dropped_params:
            dict_results.pop(key, None)

        logger.info('fitted time {1}, score on test set: {0}'.format(
            dict_results['score'], dict_results['fit_time']))

        # save estimator
        try:
            with open(est_filepath, 'wb') as f:
                pickle.dump(estimator, f)
        except Exception as e:
            logger.error('Unexpected error: {0}'.format(e))
            exit(1)

        # save results; the header is only written when the file is created
        try:
            write_header = not os.path.isfile(csv_filepath)
            with open(csv_filepath, 'a') as f:
                if write_header:
                    f.write(','.join(dict_results.keys()))
                    f.write('\n')
                f.write(','.join(
                    [str(item) for item in dict_results.values()]))
                f.write('\n')
        except PermissionError as e:
            print('Missing privileges: {0}'.format(e))

        # save prediction
        np.savez_compressed(pred_filepath,
                            X_test=X_test,
                            y_test=label_encoder.inverse_transform(y_test),
                            y_pred=label_encoder.inverse_transform(y_pred))

    if not list_filepaths:
        logger.warning('no input weights matrices found')
        return
コード例 #10
0
def dataset_imbalance(directory, *args, **kwargs):
    """Compute class-imbalance statistics for a fixed set of datasets.

    Each dataset is loaded from ``<name>.npz`` in ``directory`` when that
    file exists; otherwise it is fetched with ``fetch_openml`` by id and
    cached to that file. A dataset that cannot be fetched or saved is
    logged as a warning and skipped. For every loaded dataset the label
    frequencies, the ratio of the rarest to the most frequent label, the
    label entropy (base 2) and its ratio to the maximum possible entropy
    are stored. One row per dataset is written to
    ``dataset_imbalance.csv`` in ``directory``.

    Extra positional and keyword arguments are accepted and ignored.
    """
    self_name = 'dataset_imbalance'
    logger = new_logger(self_name, directory)
    logger.info('Entering {0}'.format(self_name))

    list_dict_datasets = [{
        'name': 'abalone19',
        'id': 41357
    }, {
        'name': 'abalone',
        'id': 1557
    }, {
        'name': 'mnist_784',
        'id': 554
    }, {
        'name': 'iris',
        'id': 61
    }]

    for dict_dataset in list_dict_datasets:
        filepath = os.path.join(directory,
                                '{0}.npz'.format(dict_dataset['name']))
        if os.path.isfile(filepath):
            logger.info('Loading {0}'.format(filepath))
            npzfile = np.load(filepath, allow_pickle=True)
            X, y = npzfile['X'], npzfile['y']
        else:
            logger.info('Fetching {0}'.format(dict_dataset['name']))
            try:
                frame = fetch_openml(data_id=dict_dataset['id'], as_frame=True)
                X, y = frame['data'], frame['target']
                np.savez(filepath, X=X, y=y)
            except Exception as e:
                logger.warning(
                    'Failed to load and save {0}, due to error {1}'.format(
                        dict_dataset['name'], e))
                # this dataset keeps only its 'name' and 'id' entries
                continue

        label_encoder = LabelEncoder().fit(y)
        labels, label_frequency = np.unique(label_encoder.transform(y),
                                            return_counts=True)
        ir = np.min(label_frequency) / np.max(label_frequency)
        entropy = scipy.stats.entropy(label_frequency, base=2)
        # entropy of a uniform distribution over the same number of labels
        max_possible_entropy = scipy.stats.entropy(
            np.ones(label_frequency.shape), base=2)

        dict_dataset.update({
            'filepath': filepath,
            'labels': label_encoder.classes_,
            'labels_nbr': labels,
            'label_frequency': label_frequency,
            'imbalance_ratio': ir,
            'entropy': entropy,
            'max_possible_entropy': max_possible_entropy,
            'entropy_ratio': entropy / max_possible_entropy,
            # NOTE(review): this stores the number of distinct labels, not
            # the number of input features - confirm the column name.
            'features': labels.size,
        })

    # Build the header from every row, not only the first one. If the first
    # dataset was skipped, its dict holds only 'name' and 'id', and
    # DictWriter would raise ValueError on the extra keys of later rows.
    # Rows of skipped datasets get empty cells.
    fieldnames = list(
        dict.fromkeys(key for dict_dataset in list_dict_datasets
                      for key in dict_dataset))

    filepath = os.path.join(directory, '{0}.csv'.format(self_name))
    with open(filepath, 'w', newline='') as f:
        dict_writer = csv.DictWriter(f, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(list_dict_datasets)
    return
コード例 #11
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def silhouette_subset(directory, *args, **kwargs):
    """Compare warm-started and freshly initialised KMeans on subsets.

    The data returned by ``get_mnist(directory)`` is divided by 255 in
    place and projected onto 50 principal components. For growing
    stratified subsets two models are fitted and scored with the euclidean
    silhouette coefficient: one initialised with the cluster centres of
    the previously fitted warm-start model ("preinit") and one using its
    own initialisation with ``n_init=10`` ("raninit"). Fit and scoring
    times are recorded as well, and everything is written to
    ``silhouette_kmeans_subset_size.csv`` in ``directory``.

    Extra positional and keyword arguments are accepted and ignored.
    """
    logger = new_logger('plot_silhouette_subset', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    X /= 255.

    # preprocessing
    X_pca = PCA(n_components=50, whiten=False,
                random_state=42).fit_transform(X)

    # define subset sizes
    subset_sizes = [250, 500, 1000, 2000, 4000, 8000, 16000, 32000, 60000]

    # number of centroids
    k_list = [20]

    # The column order defines the CSV layout.
    result_columns = (
        'subset_size',
        'k',
        'n_init',
        'silhouette_raninit',
        'silhouette_preinit',
        'fittime_raninit',
        'fittime_preinit',
        'scoretime_raninit',
        'scoretime_preinit',
    )
    dict_results = {column: [] for column in result_columns}

    def draw_subset(n_samples):
        # stratified, shuffled sample; only the training part is needed
        return train_test_split(X_pca,
                                y,
                                random_state=42,
                                train_size=n_samples,
                                shuffle=True,
                                stratify=y)[0]

    for k in k_list:
        # preinit: seed the warm-start chain on the smallest subset
        warm_model = KMeans(n_clusters=k,
                            random_state=42,
                            init='k-means++',
                            n_init=10).fit(draw_subset(subset_sizes[0]))

        # random inits
        cold_model = KMeans(n_clusters=k, n_init=10, random_state=42)

        for subset_size in subset_sizes:
            dict_results['subset_size'].append(subset_size)
            X_train = draw_subset(subset_size)

            # train preinit, starting from the previous centres
            tic = time.time()
            warm_model = KMeans(n_clusters=k,
                                random_state=42,
                                n_init=1,
                                init=warm_model.cluster_centers_)
            warm_model.fit_predict(X_train)
            dict_results['fittime_preinit'].append(time.time() - tic)

            # score preinit
            tic = time.time()
            warm_score = silhouette_score(X_train,
                                          warm_model.predict(X_train),
                                          metric='euclidean',
                                          random_state=42)
            dict_results['silhouette_preinit'].append(warm_score)
            dict_results['scoretime_preinit'].append(time.time() - tic)

            # train randinit
            tic = time.time()
            cold_model.fit(X_train)
            dict_results['fittime_raninit'].append(time.time() - tic)

            # score raninit
            tic = time.time()
            cold_score = silhouette_score(X_train,
                                          cold_model.predict(X_train),
                                          metric='euclidean',
                                          random_state=42)
            dict_results['silhouette_raninit'].append(cold_score)
            dict_results['scoretime_raninit'].append(time.time() - tic)

            # store results
            dict_results['k'].append(k)
            dict_results['n_init'].append(cold_model.n_init)

            logger.info('silhouette (preinit) at subset size {1}: {0}'.format(
                dict_results['silhouette_preinit'][-1],
                dict_results['subset_size'][-1]))

    # save results to csv: one header line, then one line per subset size
    csv_path = os.path.join(directory, 'silhouette_kmeans_subset_size.csv')
    with open(csv_path, 'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for record in zip(*dict_results.values()):
            f.write(','.join(str(value) for value in record) + '\n')
コード例 #12
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_hyperparameters(directory):
    """Tune an ELM classifier in three consecutive grid searches.

    The features from ``get_mnist(directory)`` are divided by 255 and the
    labels are integer-encoded; only the first ``train_size`` rows are
    used. The stages are:

    1. input and bias scaling at a fixed hidden layer size,
    2. hidden layer size and input activation with the best scalings,
    3. the regularisation ``alpha`` with the best values of stage 2.

    Every stage uses 5-fold cross validation with accuracy scoring, logs
    its best parameters and writes its results (without the ``params``
    column) to its own CSV file in ``directory``.
    """
    self_name = 'elm_hyperparameters'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    X = X / 255.

    y_encoded = LabelEncoder().fit(y).transform(y)

    # the leading train_size rows form the training set
    X_train = X[:train_size, :]
    y_train = y_encoded[:train_size]

    estimator = ELMClassifier(regressor=Ridge())

    def run_stage(param_grid, n_jobs, filename_template):
        # one grid search: fit, log the optimum, dump the result table
        search = GridSearchCV(estimator,
                              param_grid,
                              cv=5,
                              n_jobs=n_jobs,
                              scoring='accuracy')
        search.fit(X_train, y_train)
        logger.info('best parameters: {0} (score: {1})'.format(
            search.best_params_, search.best_score_))

        table = search.cv_results_
        del table['params']
        csv_path = os.path.join(directory,
                                filename_template.format(self_name))
        with open(csv_path, 'w') as f:
            f.write(','.join(table.keys()) + '\n')
            for record in zip(*table.values()):
                f.write(','.join(str(value) for value in record) + '\n')
        return search.best_params_

    # stage 1: input and bias scaling
    best_scaling = run_stage(
        {
            'hidden_layer_size': [2000],
            'input_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
            'bias_scaling': np.logspace(start=-2, stop=2, base=10, num=7),
            'input_activation': ['tanh'],
            'alpha': [1e-5],
            'random_state': [42]
        },
        -1,
        '{0}_scaling.csv')

    # stage 2: hidden layer size and activation
    best_size = run_stage(
        {
            'hidden_layer_size': [500, 1000, 2000, 4000],
            'input_scaling': [best_scaling['input_scaling']],
            'bias_scaling': [best_scaling['bias_scaling']],
            'input_activation':
            ['tanh', 'relu', 'bounded_relu', 'logistic', 'identity'],
            'alpha': [1e-5],
            'random_state': [42]
        },
        -1,
        '{0}_size.csv')

    # stage 3: regularisation, run in a single job
    run_stage(
        {
            'hidden_layer_size': [best_size['hidden_layer_size']],
            'input_scaling': [best_size['input_scaling']],
            'bias_scaling': [best_size['bias_scaling']],
            'input_activation': [best_size['input_activation']],
            'alpha': [.00001, .001, .1],
            'random_state': [42]
        },
        1,
        '{0}_alpha.csv')
コード例 #13
0
def plot_historgram(directory, *args, **kwargs):
    """Plot an example image next to value histograms of two pixels.

    The left panel shows the example image (row ``example_image_idx`` of
    the data from ``get_mnist(directory)``) with one fringe pixel and one
    centre pixel highlighted in colour. The right panel shows, for both
    pixels, how their values across all rows fall into bins of width 32,
    with the counts divided by 1000. The figure is saved as a PDF in
    ``directory`` and as a PGF file in the directory named by the
    ``PGFPATH`` environment variable.

    Extra positional and keyword arguments are accepted and ignored.

    Raises:
        KeyError: if the ``PGFPATH`` environment variable is not set.
    """
    logger = new_logger('plot_historgram', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    fig, axs = plt.subplots(1,
                            2,
                            figsize=(5, 2),
                            gridspec_kw={'width_ratios': [1, 1.7]})
    image_ax, hist_ax = axs[0], axs[1]

    # inverted grey-scale image, copied into the red, green and blue channel
    inverted = 1. - np.resize(X[example_image_idx, :], (28, 28)) / 255.
    example = np.zeros((28, 28, 3))
    for channel in range(3):
        example[..., channel] = inverted

    # (row, column) of the two highlighted pixels
    idx_fringe = (25, 17)
    idx_center = (13, 12)

    # the colour tuples are used without their last component
    example[idx_center[0], idx_center[1], :] = tud_colors['lightblue'][:-1]
    example[idx_fringe[0], idx_fringe[1], :] = tud_colors['orange'][:-1]

    bins = np.arange(0, 287, 32).astype(int)

    # position of a (row, column) pixel in the flattened 28-wide image
    column_fringe = idx_fringe[0] * 28 + idx_fringe[1]
    column_center = idx_center[0] * 28 + idx_center[1]
    hist_fringe, _ = np.histogram(X[:, column_fringe], bins=bins)
    hist_center, _ = np.histogram(X[:, column_center], bins=bins)

    logger.info('validation sum hist_fringe: {0}, sum hist_center: {1}'.format(
        np.sum(hist_fringe / 1000), np.sum(hist_center / 1000)))

    image_ax.imshow(example, interpolation='none')
    image_ax.set_xticks([0, 27])
    image_ax.set_xticklabels([0, 27])
    image_ax.set_yticks([0, 27])
    image_ax.set_yticklabels([0, 27])

    # two half-width bars per bin: fringe on the left, centre on the right
    left_edges = bins[1:]
    hist_ax.bar(left_edges - 32,
                height=hist_fringe / 1000,
                width=16,
                color=tud_colors['orange'],
                label='fringe',
                align='edge')
    hist_ax.bar(left_edges - 16,
                height=hist_center / 1000,
                width=16,
                color=tud_colors['lightblue'],
                label='center',
                align='edge')
    hist_ax.tick_params(axis='x', labelrotation=90)

    hist_ax.set_xticks(bins)
    hist_ax.legend(bbox_to_anchor=(1.0, .5), loc="center left")

    hist_ax.set_xlabel('value bins')
    hist_ax.set_ylabel('probability')
    fig.tight_layout()

    pdf_path = os.path.join(directory, 'mnist-pixel-histogram.pdf')
    fig.savefig(pdf_path)
    pgf_path = os.path.join(os.environ['PGFPATH'],
                            'mnist-pixel-histogram.pgf')
    fig.savefig(pgf_path, format='pgf')
コード例 #14
0
def picture_gradient(directory):
    """Show one example image unfiltered and after three 2-D filters.

    The features from ``get_mnist(directory)`` are divided by 255 in
    place and reshaped to 28x28 images. Image number 5 is displayed
    unfiltered and convolved (``mode='same'``) with the laplace, the
    vertical prewitt and the horizontal sobel kernel. The figure is saved
    as a PDF in ``directory`` and as a PGF file in the directory named by
    the ``PGFPATH`` environment variable.

    Raises:
        KeyError: if the ``PGFPATH`` environment variable is not set.
    """
    self_name = 'picture_gradient'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # the encoded labels are computed but not used by the plot
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X so X in [0, 1]
    X /= 255.

    # reshape X
    X_images = X.reshape((X.shape[0], 28, 28))

    # vertical edge kernels; the horizontal variants are their transposes
    prewitt = np.array([[-1., -1., -1.], [0., 0., 0.], [1., 1., 1.]])
    sobel = np.array([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]])

    kernels = {
        'laplace':
        np.array([[-1., -1., -1.], [-1., 8, -1.], [-1., -1., -1.]]),
        'mexicanhat':
        np.array([[0., 0., -1., 0., 0.], [0., -1., -2., -1., 0.],
                  [-1., -2., 16, -2., -1.], [0., -1., -2., -1., 0.],
                  [0., 0., -1., 0., 0.]]),
        'v_prewitt': prewitt,
        'h_prewitt': prewitt.T,
        'v_sobel': sobel,
        'h_sobel': sobel.T,
    }

    example_image_idx = 5
    example_image = X_images[example_image_idx]

    # (panel title, kernel); None leaves the image unfiltered
    panels = (
        ('no filter', None),
        ('laplace', kernels['laplace']),
        ('vertical\nprewitt', kernels['v_prewitt']),
        ('horizontal\nsobel', kernels['h_sobel']),
    )

    fig, axs = plt.subplots(1, 4, figsize=(6, 2))
    for ax, (title, kernel) in zip(axs, panels):
        if kernel is None:
            shown = example_image
        else:
            shown = convolve2d(example_image, kernel, mode='same')
        ax.imshow(shown, cmap=plt.cm.gray_r, interpolation='none')
        ax.set_title(title)

        # label only the first and last pixel index
        ax.set_xticks([0, 27])
        ax.set_xticklabels([0, 27])
        ax.set_yticks([0, 27])
        ax.set_yticklabels([0, 27])

    fig.tight_layout()
    pdf_path = os.path.join(directory, 'mnist-image-filters.pdf')
    fig.savefig(pdf_path, format='pdf')
    pgf_path = os.path.join(os.environ['PGFPATH'], 'mnist-image-filters.pgf')
    fig.savefig(pgf_path, format='pgf')
コード例 #15
0
ファイル: test_util.py プロジェクト: TUD-STKS/PyRCN
def test_new_logger() -> None:
    """Check that logging through ``new_logger`` creates the log file.

    A logger named ``test_logger`` is created for the current working
    directory and one message is emitted; afterwards ``test_logger.log``
    must exist in that directory.
    """
    working_dir = os.getcwd()
    expected_log = os.path.join(working_dir, 'test_logger.log')

    new_logger(name='test_logger', directory=working_dir).info('Test')

    assert os.path.isfile(expected_log)
コード例 #16
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def silhouette_n_clusters(directory, *args, **kwargs):
    """Record MiniBatchKMeans silhouette scores on MNIST per cluster count.

    For every cluster count in a fixed list, one ``MiniBatchKMeans`` is
    fitted in turn on three views of a 10000-sample training subset: all
    features, the features whose variance exceeds ``min_var``, and the 50
    PCA components.  Fit time, inertia, iteration count and silhouette
    score are collected per view; the silhouette score is always measured
    against the full-feature training subset.  The collected values are
    written to ``silhouette_n_clusters.csv`` inside ``directory``.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; the result CSV is
        written here as well.
    *args, **kwargs
        Accepted but not used.
    """
    logger = new_logger('plot_silhouette_n_clusters', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    y_encoded = LabelEncoder().fit(y).transform(y)

    # The scaler is fitted before X is divided by 255, so ``scaler.var_``
    # (and the ``min_var`` threshold compared against it) refers to the
    # data as returned by ``get_mnist``.
    scaler = StandardScaler().fit(X)
    X /= 255.

    pca = PCA(n_components=50, whiten=False, random_state=42).fit(X)
    min_var = 3088.6875

    # reduce train size; only the training features are used afterwards
    X_train = train_test_split(X,
                               y_encoded,
                               train_size=10000,
                               random_state=42)[0]

    # The three feature views every clusterer is fitted on, in order.
    views = (
        ('original', X_train),
        ('variance_threshold', X_train[..., scaler.var_ > min_var]),
        ('pca', pca.transform(X_train)),
    )

    cluster_counts = list(range(5, 21)) + [
        25, 30, 40, 50, 60, 70, 80, 90, 100, 200, 500, 1000, 2000, 4000
    ]
    n_init = 10

    # Column order of the CSV: shared columns first, then one group of
    # three columns (one per view) for every metric.
    columns = [
        'n_clusters', 'n_init', 'variance_threshold', 'pca_n_components',
        'pca_explained_variance', 'pca_explained_variance_ratio'
    ]
    columns += [
        '{0}_{1}'.format(metric, view)
        for metric in ('silhouette', 'fittime', 'inertia', 'n_iter')
        for view, _ in views
    ]
    results = {column: [] for column in columns}

    explained_variance = np.sum(pca.explained_variance_)
    explained_variance_ratio = np.sum(pca.explained_variance_ratio_)

    for n_clusters in cluster_counts:
        results['n_clusters'].append(n_clusters)
        results['n_init'].append(n_init)
        results['variance_threshold'].append(min_var)
        results['pca_n_components'].append(pca.n_components_)
        results['pca_explained_variance'].append(explained_variance)
        results['pca_explained_variance_ratio'].append(
            explained_variance_ratio)

        # One clusterer object is re-fitted on each view.
        clusterer = MiniBatchKMeans(n_clusters=n_clusters,
                                    init='k-means++',
                                    n_init=n_init,
                                    random_state=42)

        for view, features in views:
            tic = time.time()
            clusterer.fit(features)
            results['fittime_' + view].append(time.time() - tic)
            results['inertia_' + view].append(clusterer.inertia_)
            results['n_iter_' + view].append(clusterer.n_iter_)
            results['silhouette_' + view].append(
                silhouette_score(X_train,
                                 clusterer.predict(features),
                                 metric='euclidean',
                                 random_state=42))

            if view == 'original':
                # Overwritten on every cluster count; written relative to
                # the current working directory, not ``directory``.
                np.save('./cluster_critical.npy', clusterer.cluster_centers_)

        logger.info('n_clusters = {0}, pca kmeans score: {1}'.format(
            n_clusters, results['silhouette_pca'][-1]))
        logger.info('n_clusters = {0}'.format(n_clusters))

    # save results to csv: header line, then one line per cluster count
    csv_path = os.path.join(directory, 'silhouette_n_clusters.csv')
    with open(csv_path, 'w') as f:
        f.write(','.join(results) + '\n')
        f.writelines(','.join(str(value) for value in row) + '\n'
                     for row in zip(*results.values()))
コード例 #17
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_coates_stacked(directory):
    """Grid-search an ELM whose input weights are stacked from saved files.

    Every ``*kmeans1*matrix.npy`` file found in ``directory`` is loaded and
    the matrices are concatenated column-wise into one input weight matrix
    with 784 rows.  An ``ELMClassifier`` built on these predefined weights
    is then evaluated with ``GridSearchCV`` on a single split: the first
    ``train_size`` records are used for training and the remaining records
    for validation.  The grid-search results are written to
    ``elm_coates_stacked.csv`` inside ``directory``.

    If no weight matrix file is found, a warning is logged and the function
    returns before any estimator is built.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; also searched for the
        weight matrix files and used as destination of the result CSV.

    Returns
    -------
    None
    """
    self_name = 'elm_coates_stacked'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X so X in [0, 1]
    X /= 255.

    # setup parameter grid
    param_grid = {
        'chunk_size': [10000],
        'input_scaling': np.logspace(start=-3, stop=1, base=10, num=3),
        'bias_scaling': [0.],
        'input_activation': ['relu'],
        'alpha': [1e-5],
        'random_state': [42]
    }

    # read input matrices from files; sorted so that the column order of
    # the stacked matrix does not depend on the file system's listing order
    list_filepaths = sorted(
        glob.glob(os.path.join(directory, '*kmeans1*matrix.npy')))
    matrices = []
    for filepath in list_filepaths:
        logger.info('matrix file found: {0}'.format(filepath))
        matrices.append(np.load(filepath))

    # Bail out before fitting anything: without files the weight matrix
    # would have zero columns.
    if not list_filepaths:
        logger.warning('no input weights matrices found')
        return

    # The leading empty (784, 0) float array keeps the result's row count
    # and dtype promotion identical to appending the matrices one by one.
    predefined_input_weights = np.concatenate(
        [np.empty((784, 0))] + matrices, axis=1)

    # setup estimator
    estimator = ELMClassifier(
        PredefinedWeightsInputToNode(
            predefined_input_weights=predefined_input_weights),
        IncrementalRegression())
    logger.info('Estimator params: {0}'.format(estimator.get_params().keys()))

    # setup grid search with one fixed split
    # (``train_size`` is not defined in this function; presumably a
    # module-level constant -- confirm against the rest of the file)
    cv = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=1,
                      cv=[(np.arange(0, train_size),
                           np.arange(train_size, X.shape[0]))])

    # run!
    cv.fit(X, y_encoded)

    # Work on copies and tolerate a missing key: the weight matrix is not
    # part of ``param_grid``, so it need not appear in the results at all.
    cv_best_params = dict(cv.best_params_)
    cv_best_params.pop('input_to_nodes__predefined_input_weights', None)

    logger.info('best parameters: {0} (score: {1})'.format(
        cv_best_params, cv.best_score_))

    # refine results: drop columns that do not serialize to a CSV cell
    cv_results = dict(cv.cv_results_)
    cv_results.pop('params', None)
    cv_results.pop('param_input_to_nodes__predefined_input_weights', None)

    # save results
    try:
        with open(os.path.join(directory, '{0}.csv'.format(self_name)),
                  'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in zip(*cv_results.values()):
                f.write(','.join(map(str, row)) + '\n')
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
コード例 #18
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def silhouette_features(directory, *args, **kwargs):
    """Record K-Means silhouette scores on MNIST per number of features.

    The first 10000 MNIST records (scaled by 1/255) are clustered with
    ``KMeans`` (20 clusters) on three kinds of feature subsets of growing
    size: randomly chosen features, the features with the highest
    variance, and the leading PCA components.  For every subset the fit
    time, the silhouette score (measured on all original features) and the
    covered variance are collected and finally written to
    ``silhouette_kmeans20_features.csv`` inside ``directory``.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; the result CSV is
        written here as well.
    *args, **kwargs
        Accepted but not used.
    """
    logger = new_logger('plot_silhouette_features', directory)
    logger.info('entering')
    X, y = get_mnist(directory)

    X /= 255.

    X = X[:10000, ...]

    scaler = StandardScaler().fit(X)
    pca = PCA(whiten=False, random_state=42).fit(X)

    X_pca = pca.transform(X)

    # feature indices ordered by descending variance
    variance_indices = np.argsort(scaler.var_)[::-1]
    total_variance = np.sum(scaler.var_)

    # Each size appears once, so the CSV holds one row per feature count.
    # The largest entry assumes X has at least 784 features -- confirm
    # against ``get_mnist``.
    n_features_list = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70,
        80, 90, 100, 200, 300, 400, 500, 600, 700, 784
    ]

    rs = np.random.RandomState(42)

    k = 20

    dict_results = {
        'nfeatures': [],
        'fittime_random': [],
        'fittime_maxvar': [],
        'fittime_pca': [],
        'silhouette_random': [],
        'silhouette_maxvar': [],
        'silhouette_pca': [],
        'explainvar_random': [],
        'explainvar_maxvar': [],
        'explainvar_pca': [],
        'explvarrat_random': [],
        'explvarrat_maxvar': [],
        'explvarrat_pca': [],
        'n_clusters': [],
    }

    for n_features in n_features_list:
        clusterer = KMeans(n_clusters=k, random_state=42)
        dict_results['nfeatures'].append(n_features)
        dict_results['n_clusters'].append(clusterer.n_clusters)

        # random features: drawn WITHOUT replacement so that exactly
        # n_features distinct columns are used and no variance is counted
        # twice
        indices = rs.choice(X.shape[1], size=n_features, replace=False)
        t = time.time()
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_random'].append(time.time() - t)
        dict_results['silhouette_random'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        selected_variance = np.sum(scaler.var_[indices])
        dict_results['explainvar_random'].append(selected_variance)
        dict_results['explvarrat_random'].append(
            selected_variance / total_variance)

        # features with the highest variance
        indices = variance_indices[:n_features]
        t = time.time()
        pred = clusterer.fit_predict(X[:, indices])
        dict_results['fittime_maxvar'].append(time.time() - t)
        dict_results['silhouette_maxvar'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        selected_variance = np.sum(scaler.var_[indices])
        dict_results['explainvar_maxvar'].append(selected_variance)
        dict_results['explvarrat_maxvar'].append(
            selected_variance / total_variance)

        # leading PCA components
        t = time.time()
        pred = clusterer.fit_predict(X_pca[:, :n_features])
        dict_results['fittime_pca'].append(time.time() - t)
        dict_results['silhouette_pca'].append(
            silhouette_score(X, pred, metric='euclidean', random_state=42))
        dict_results['explainvar_pca'].append(
            np.sum(pca.explained_variance_[:n_features]))
        dict_results['explvarrat_pca'].append(
            np.sum(pca.explained_variance_ratio_[:n_features]))

        logger.info('pca silhouette at n_features={1:.0f}: {0}'.format(
            dict_results['silhouette_pca'][-1], n_features))

    # save results to csv
    with open(
            os.path.join(directory,
                         'silhouette_kmeans{0:.0f}_features.csv'.format(k)),
            'w') as f:
        f.write(','.join(dict_results.keys()) + '\n')
        for row in zip(*dict_results.values()):
            f.write(','.join(map(str, row)) + '\n')
コード例 #19
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_hidden_layer_size(directory):
    """Measure ELM accuracy and run time on MNIST for several layer sizes.

    Two experiments are run with one ``ELMClassifier`` and fixed
    hyper-parameters:

    * *basic*: hidden layer sizes of ``784 * fan_out`` on the scaled pixel
      data; results go to ``elm_hidden_layer_size_basic.csv``.
    * *pca*: hidden layer sizes of ``100 * fan_out`` and ``784 * fan_out``
      on data reduced to 50 and to 100 PCA components; results go to
      ``elm_hidden_layer_size_pca.csv``.

    For every run the fit time, prediction time and test accuracy are
    appended to the CSV and the fitted estimator is pickled into
    ``directory``.  A ``MemoryError`` or ``PermissionError`` aborts the
    current experiment and is logged; the function then carries on with
    the next experiment.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; all CSV and pickle
        files are written here.
    """
    self_name = 'elm_hidden_layer_size'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # encode y
    label_encoder = LabelEncoder().fit(y)
    y_encoded = label_encoder.transform(y)

    # scale X
    X /= 255.

    # split train test
    # (``train_size`` is not defined in this function; presumably a
    # module-level constant -- confirm against the rest of the file)
    X_train, X_test = X[:train_size, :], X[train_size:, :]
    y_train, y_test = y_encoded[:train_size], y_encoded[train_size:]

    # fan-out from paper
    fan_out = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20]

    # prepare parameter grids; 'hidden_layer_size' is a placeholder that
    # is overwritten for every run
    param_grid_basic = {
        'hidden_layer_size': 0,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'activation': 'relu',
        'chunk_size': 1000,
        'alpha': 1e-5,
        'random_state': 42
    }
    # the PCA experiment uses the same fixed hyper-parameters
    param_grid_pca = dict(param_grid_basic)

    # setup estimator
    estimator = ELMClassifier()

    def _write_header(csv_path, row):
        """Create ``csv_path`` with a header made of the keys of ``row``."""
        # newline='' is what the csv module expects from its file objects
        with open(csv_path, 'w', newline='') as f:
            csv.DictWriter(f, fieldnames=row.keys()).writeheader()

    def _append_row(csv_path, row):
        """Append ``row`` to ``csv_path`` using its current keys."""
        with open(csv_path, 'a', newline='') as f:
            csv.DictWriter(f, fieldnames=row.keys()).writerow(row)

    def _fit_and_score(X_fit, X_eval):
        """Fit on ``X_fit``, predict ``X_eval``; return times and score."""
        time_start = time.time()
        estimator.fit(X_fit, y_train)
        time_fit = time.time()
        y_pred = estimator.predict(X_eval)
        time_pred = time.time()
        return {
            'time_fit': time_fit - time_start,
            'time_pred': time_pred - time_fit,
            'score': accuracy_score(y_test, y_pred)
        }

    # basic
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory,
                                    '{0}_basic.csv'.format(self_name))

        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_basic)

        # initialize results dict with dummy results so the header is
        # complete
        results_dict_job = param_dict_job.copy()
        results_dict_job.update({'time_fit': 0, 'time_pred': 0, 'score': 0})

        _write_header(csv_filepath, results_dict_job)

        for hls in 784 * np.array(fan_out):
            param_dict_job.update({'hidden_layer_size': hls})
            estimator.set_params(**param_dict_job)

            measurements = _fit_and_score(X_train, X_test)

            results_dict_job.update(estimator.get_params())
            results_dict_job.update(measurements)

            logger.info('hidden_layer_size: {0}, score: {1}'.format(
                hls, results_dict_job['score']))

            _append_row(csv_filepath, results_dict_job)

            # NOTE(review): presumably drops cached hidden layer state so
            # the pickle below stays small -- confirm; the PCA experiment
            # does not do this.
            del estimator.input_to_node._hidden_layer_state

            with open(
                    os.path.join(directory,
                                 'elmc_hls{0}_basic.pickle'.format(hls)),
                    'wb') as f:
                pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))

    # preprocessing pca
    try:
        # initialize filepath
        csv_filepath = os.path.join(directory, '{0}_pca.csv'.format(self_name))

        # preprocessing: both PCAs are fitted on the training data only
        list_dict_pca = []
        for n_components in (50, 100):
            pca = PCA(n_components=n_components).fit(X_train)
            list_dict_pca.append({
                'n_components': n_components,
                'X_train': pca.transform(X_train),
                'X_test': pca.transform(X_test)
            })
        logger.info('Preprocessing successful!')

        # initialize param dict
        param_dict_job = estimator.get_params().copy()
        param_dict_job.update(param_grid_pca)

        # initialize results dict with dummy results so the header is
        # complete
        results_dict_job = param_dict_job.copy()
        results_dict_job.update({
            'time_fit': 0,
            'time_pred': 0,
            'score': 0,
            'pca_n_components': 0
        })

        _write_header(csv_filepath, results_dict_job)

        hidden_layer_sizes = np.concatenate(
            (100 * np.array(fan_out), 784 * np.array(fan_out)), axis=0)

        for dict_pca in list_dict_pca:
            results_dict_job.update(
                {'pca_n_components': dict_pca['n_components']})
            for hls in hidden_layer_sizes:
                param_dict_job.update({'hidden_layer_size': hls})
                estimator.set_params(**param_dict_job)

                measurements = _fit_and_score(dict_pca['X_train'],
                                              dict_pca['X_test'])

                results_dict_job.update(estimator.get_params())
                results_dict_job.update(measurements)

                logger.info(
                    'n_components: {2}, hidden_layer_size: {0}, score:'
                    ' {1}'.format(hls, results_dict_job['score'],
                                  results_dict_job['pca_n_components']))

                _append_row(csv_filepath, results_dict_job)

                with open(
                        os.path.join(
                            directory, 'elmc_hls{0}_pca{1}.pickle'.format(
                                hls, results_dict_job['pca_n_components'])),
                        'wb') as f:
                    pickle.dump(estimator, f)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
コード例 #20
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_preprocessed(directory):
    """Grid-search ELM scaling parameters on PCA-reduced MNIST.

    MNIST is scaled by 1/255, reduced to 50 PCA components and split with
    ``train_test_split``.  On the training part a ``GridSearchCV`` (one
    stratified shuffle split, accuracy scoring, no refit) explores input
    and bias scaling for ``relu`` (500 and 2000 hidden nodes) and ``tanh``
    (2000 hidden nodes).  The grid-search results, minus the ``params``
    column, are written to ``elm_preprocessed.csv`` inside ``directory``.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; the result CSV is
        written here as well.
    """
    self_name = 'elm_preprocessed'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    n_records = X.shape[0]
    logger.info(
        'Loaded MNIST successfully with {0} records'.format(n_records))

    y_encoded = LabelEncoder().fit(y).transform(y)

    # preprocessing: scale, then project onto 50 principal components
    X /= 255.
    X_preprocessed = PCA(n_components=50).fit(X).transform(X)
    n_remaining = X_preprocessed.shape[1]
    logger.info(
        '{0} features remaining after preprocessing.'.format(n_remaining))

    # train test split; only the training part is used by the search
    X_train, _, y_train, _ = train_test_split(X_preprocessed,
                                              y_encoded,
                                              train_size=train_size,
                                              random_state=42)

    def _grid(hidden_layer_sizes, activation):
        """Return one sub-grid; only size and activation vary."""
        return {
            'hidden_layer_size': hidden_layer_sizes,
            'input_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
            'bias_scaling': np.logspace(start=-3, stop=1, base=10, num=6),
            'input_activation': [activation],
            'alpha': [1e-5],
            'random_state': [42]
        }

    # prepare parameter grid
    param_grid = [_grid([500, 2000], 'relu'), _grid([2000], 'tanh')]

    # setup grid search on a single stratified hold-out split
    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=1 / 7,
                                      random_state=42)
    cv = GridSearchCV(estimator=ELMClassifier(regressor=Ridge()),
                      param_grid=param_grid,
                      scoring='accuracy',
                      n_jobs=1,
                      verbose=2,
                      refit=False,
                      cv=splitter)

    # run!
    cv.fit(X_train, y_train)
    logger.info('best parameters: {0} (score: {1})'.format(
        cv.best_params_, cv.best_score_))

    # refine results: the per-candidate dicts are not written to the CSV
    cv_results = cv.cv_results_
    cv_results.pop('params')

    # save results
    csv_path = os.path.join(directory, 'elm_preprocessed.csv')
    try:
        with open(csv_path, 'w') as f:
            f.write(','.join(cv_results.keys()) + '\n')
            for row in zip(*cv_results.values()):
                f.write(','.join(str(cell) for cell in row) + '\n')
    except PermissionError as e:
        print('Missing privileges: {0}'.format(e))
コード例 #21
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def train_kmeans(directory):
    """Train K-Means models on PCA-reduced MNIST and store them on disk.

    For every configured number of PCA components and clusters, a
    ``MiniBatchKMeans`` and (for fewer than 2000 clusters) a ``KMeans``
    model are fitted on the PCA-transformed data.  For each model two
    files are written into ``directory``:

    * ``<basename>_matrix.npy``: the product of the transposed PCA
      components and the transposed cluster centers.
    * ``<basename>_pipeline.pickle``: a pipeline of the PCA and the
      clusterer.

    A model is only trained if its ``_matrix.npy`` file does not exist yet.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; all output files are
        written here.
    """
    self_name = 'train_kmeans'
    logger = new_logger(self_name, directory=directory)
    X, _ = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # scale X, so $X \in [0, 1]$
    X /= 255.

    list_n_components = [50]  # alternative: [50, 100]
    # alternative: [20, 50, 100, 200, 500, 1000, 2000, 4000, 8000, 16000]
    list_n_clusters = [200]

    def _matrix_path(basename):
        """Return the path of the weight matrix file for ``basename``."""
        return os.path.join(directory, '{0}_matrix.npy'.format(basename))

    def _train_and_store(pca, X_pca, clusterer, basename):
        """Fit ``clusterer`` on ``X_pca``; save its matrix and pipeline."""
        clusterer.fit(X_pca)
        np.save(_matrix_path(basename),
                np.dot(pca.components_.T, clusterer.cluster_centers_.T))

        # assemble pipeline
        pipeline = make_pipeline(pca, clusterer)
        with open(
                os.path.join(directory,
                             '{0}_pipeline.pickle'.format(basename)),
                'wb') as f:
            pickle.dump(pipeline, f)

        logger.info('successfully trained {0} and saved to npy/pickle '
                    '{1}'.format(type(clusterer).__name__, basename))

    for n_components in list_n_components:
        pca = PCA(n_components=n_components, random_state=42).fit(X)
        X_pca = pca.transform(X)
        logger.info('pca{0}: explained variance ratio = {1}'.format(
            n_components, np.sum(pca.explained_variance_ratio_)))

        for n_clusters in list_n_clusters:
            # minibatch kmeans, only if file does not exist
            kmeans_basename = 'minibatch-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)
            if not os.path.isfile(_matrix_path(kmeans_basename)):
                _train_and_store(
                    pca, X_pca,
                    MiniBatchKMeans(n_clusters=n_clusters,
                                    init='k-means++',
                                    random_state=42,
                                    batch_size=5000,
                                    n_init=5), kmeans_basename)

            # original kmeans, skipped for 2000 clusters and more
            kmeans_basename = 'original-pca{0}+kmeans{1}'.format(
                n_components, n_clusters)
            if n_clusters < 2000 and not os.path.isfile(
                    _matrix_path(kmeans_basename)):
                _train_and_store(
                    pca, X_pca,
                    KMeans(n_clusters=n_clusters,
                           init='k-means++',
                           random_state=42,
                           n_init=5), kmeans_basename)
コード例 #22
0
ファイル: mnist-elm.py プロジェクト: TUD-STKS/PyRCN
def elm_pca(directory):
    """Measure ELM accuracy and run time on MNIST per PCA dimensionality.

    The first ``train_size`` MNIST records (scaled by 1/255) are split
    into 50000 training records and a test remainder.  For each number of
    PCA components in ``[10, 20, 50, 100, 200, 500, 784]`` a PCA is fitted
    on the training part, an ``ELMClassifier`` with fixed hyper-parameters
    is trained on the transformed data, and fit time, prediction time and
    test accuracy are appended to ``elm_pca_basic.csv`` in ``directory``.

    Any exception raised during the experiment is logged and ends the
    experiment; it is not propagated to the caller.

    Parameters
    ----------
    directory : str
        Forwarded to ``new_logger`` and ``get_mnist``; the result CSV is
        written here as well.
    """
    self_name = 'elm_pca'
    logger = new_logger(self_name, directory=directory)
    X, y = get_mnist(directory)
    logger.info('Loaded MNIST successfully with {0} records'.format(
        X.shape[0]))

    # scale X
    X /= 255.

    # split train test
    # (``train_size`` is not defined in this function; presumably a
    # module-level constant -- confirm against the rest of the file)
    X_train, X_test, y_train, y_test = train_test_split(X[:train_size],
                                                        y[:train_size],
                                                        train_size=50000,
                                                        random_state=42)

    # prepare parameter grids
    param_grid_basic = {
        'hidden_layer_size': 2000,
        'input_scaling': 1.,
        'bias_scaling': 0.,
        'input_activation': 'relu',
        'alpha': 1e-5,
        'random_state': 42
    }

    # setup estimator
    estimator = ELMClassifier(regressor=Ridge())

    # initialize filepath
    filepath = os.path.join(directory, '{0}_basic.csv'.format(self_name))

    # initialize param dict
    param_dict_job = estimator.get_params().copy()
    param_dict_job.update(param_grid_basic)

    # initialize results dict with dummy results so the header is complete
    results_dict_job = param_dict_job.copy()
    results_dict_job.update({
        'time_fit': 0,
        'time_pred': 0,
        'score': 0,
        'pca_n_components': 0
    })

    # preprocessing pca
    try:
        # write header; newline='' is what the csv module expects from its
        # file objects
        with open(filepath, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
            writer.writeheader()

        for pca_n_components in [10, 20, 50, 100, 200, 500, 784]:
            results_dict_job.update({'pca_n_components': pca_n_components})
            estimator.set_params(**param_dict_job)

            # preprocessing: the PCA is fitted on the training part only
            pca = PCA(n_components=pca_n_components).fit(X_train)
            X_train_pca = pca.transform(X_train)
            X_test_pca = pca.transform(X_test)

            # run!
            time_start = time.time()
            estimator.fit(X_train_pca, y_train)
            time_fit = time.time()
            y_pred = estimator.predict(X_test_pca)
            time_pred = time.time()
            # run end!

            results_dict_job.update({
                'time_fit': time_fit - time_start,
                'time_pred': time_pred - time_fit,
                'score': accuracy_score(y_test, y_pred)
            })

            logger.info('pca.n_components_: {0}, score: {1}'.format(
                pca_n_components, results_dict_job['score']))

            with open(filepath, 'a', newline='') as f:
                writer = csv.DictWriter(f, fieldnames=results_dict_job.keys())
                writer.writerow(results_dict_job)
    except MemoryError as e:
        logger.error('Memory error: {0}'.format(e))
    except PermissionError as e:
        logger.error('Missing privileges: {0}'.format(e))
    except Exception as e:
        logger.error('Unexpected exception: {0}'.format(e))