Example #1
def make_dataframe_from_datasets(datasets: Dict[str, sklearn.utils.Bunch],
                                 force_good: bool = False) -> pd.DataFrame:
    """
    Create a dataframe from the given dataset.

    Parameters
    ----------
    dataset: scikit.utils.Bunch, required
        See https://scikit-learn.org/stable/modules/generated/sklearn.utils.Bunch.html

    force_good: bool, optional, default: False
        If True all dataset samples are marked as good, and they will skip some
        pipeline steps like dewarping an contrast augmentation.

    Returns
    -------
    df
        A new dataframe with the following columns
         * 'filename', the path to image or pdf file,
         * 'target', the encoded values for target names,
         * 'target_name', the target name,
         * 'is_grayscale', whether the image is in greyscale,
         * 'is_good', whether the image is good, i.e., it does not require dewarping,
         * 'is_pdf', whether the image file is in pdf format,
         * 'subset', the dataset subset, one in 'train', 'test'.
    """
    df = pd.DataFrame(columns=[
        'filename', 'target', 'target_name', 'is_grayscale', 'is_good',
        'is_pdf', 'subset', 'is_valid'
    ])

    for subset, dataset in datasets.items():
        subset_df = pd.DataFrame(
            data={
                'filename': dataset.filenames,
                'target': dataset.target,
                'target_name':
                    [dataset.target_names[target] for target in dataset.target],
                'is_grayscale': [False for _ in dataset.filenames],
                'is_good': [
                    is_good(filename, force_good)
                    for filename in dataset.filenames
                ],
                'is_pdf': [is_pdf(filename) for filename in dataset.filenames],
                'is_valid':
                    [is_valid(filename) for filename in dataset.filenames],
                'subset': [subset for _ in dataset.filenames]
            })
        # DataFrame.append was removed in pandas 2.0; concatenate instead.
        df = pd.concat([df, subset_df], ignore_index=True)

    drop_invalid_values(df)
    drop_isvalid_column(df)

    return df
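A minimal usage sketch (not from the original source): sklearn.datasets.load_files returns Bunch objects exposing the filenames, target and target_names attributes the function relies on; the 'data/train' and 'data/test' paths are assumptions.

from sklearn.datasets import load_files

# Hypothetical directory layout; load_content=False keeps only file paths.
datasets = {
    'train': load_files('data/train', load_content=False),
    'test': load_files('data/test', load_content=False),
}
df = make_dataframe_from_datasets(datasets, force_good=False)
print(df[['filename', 'target_name', 'subset']].head())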
Example #2
def main(args):
    data_path = 'data/'
    all_models_path = 'models/sk-rf/nature/'
    log_file_path = 'logs/train_regular_rf_all.log'

    logging.basicConfig(
        filename=log_file_path,
        filemode='a',
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=logging.INFO,
    )
    logger = logging.getLogger('regular.rf')
    logger.info("Starting training regular sklearn RF models...")

    # command for training one
    '''
    python train_rf_one.py --train data/binary_mnist0
    --test data/binary_mnist0.t
    -m models/rf/nature/sklearn_nature_binary_mnist.pickle
    -b -z -c gini -n 784 --nt 60 -d 8
    '''
    for dataset, fname in datasets.items():
        # track time for each one
        start = time.time()
        train, test = fname
        train_path = data_path + train
        test_path = data_path + test
        model_path = all_models_path + 'sklearn_nature_' + dataset + '.pickle'
        options = ''
        if binary_class[dataset]:
            options += '-b '
        if zero_based[dataset]:
            options += '-z '
        # add number of features for the dataset
        options += '-n %s ' % n_feat[dataset]
        # use the regular 'best' splitter
        options += '-s best '
        # use gini for now
        options += '-c gini '
        # number of trees and maximum depth
        options += '--nt %s -d %s' % (tree_size[dataset][0],
                                      tree_size[dataset][1])

        cmd = 'python train_rf_one.py --train %s --test %s -m %s %s' \
            % (train_path, test_path, model_path, options)

        logger.info(cmd)
        os.system(cmd)
        end = time.time()
        logger.info('time in seconds: %f' % (end - start))

    return
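As an aside (not part of the original script), the single-run command quoted in the comment block above could also be launched with subprocess.run, passing the arguments as a list instead of assembling a shell string; the paths and flags below are taken verbatim from that comment.

import subprocess

# Sketch only: same command as in the comment above, run without a shell.
cmd = [
    'python', 'train_rf_one.py',
    '--train', 'data/binary_mnist0',
    '--test', 'data/binary_mnist0.t',
    '-m', 'models/rf/nature/sklearn_nature_binary_mnist.pickle',
    '-b', '-z', '-c', 'gini',
    '-n', '784', '--nt', '60', '-d', '8',
]
subprocess.run(cmd, check=True)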
Example #3
    def test_do_dummy_prediction(self):
        datasets = {
            'breast_cancer': BINARY_CLASSIFICATION,
            'wine': MULTICLASS_CLASSIFICATION,
            'diabetes': REGRESSION,
        }

        for name, task in datasets.items():
            backend_api = self._create_backend('test_do_dummy_prediction')

            X_train, Y_train, X_test, Y_test = putil.get_dataset(name)
            datamanager = XYDataManager(
                X_train,
                Y_train,
                X_test,
                Y_test,
                task=task,
                dataset_name=name,
                feat_type=None,
            )

            auto = autosklearn.automl.AutoML(
                backend_api,
                20,
                5,
                initial_configurations_via_metalearning=25,
                metric=accuracy,
            )
            setup_logger()
            auto._logger = get_logger('test_do_dummy_predictions')
            auto._backend.save_datamanager(datamanager)
            D = backend_api.load_datamanager()

            # Check if the data manager is correctly loaded
            self.assertEqual(D.info['task'], datamanager.info['task'])

            auto._do_dummy_prediction(D, 1)

            # Ensure that the dummy predictions are not in the current working
            # directory, but in the temporary directory.
            self.assertFalse(
                os.path.exists(os.path.join(os.getcwd(), '.auto-sklearn')))
            self.assertTrue(
                os.path.exists(
                    os.path.join(backend_api.temporary_directory,
                                 '.auto-sklearn', 'predictions_ensemble',
                                 'predictions_ensemble_1_1_0.0.npy')))

            del auto
            self._tearDown(backend_api.temporary_directory)
            self._tearDown(backend_api.output_directory)
Example #4
def loop_cv_score_datasets(clf, datasets, scoring="average_precision", cv=3):
    """
    loops over a set of training data (drop, simple imputation etc) and calculates cross_val_score of clf
    
    e.g. datasets={'drop': ( X_train_drop, y_train_drop), 'simpl.imp.': (X_train_imp, y_train) }
    """

    score_list = []
    for name, (X_train, y_train) in datasets.items():
        print(name)
        score_list.append(
            score_classifier(clf, X_train, y_train, name, scoring=scoring, cv=cv))

    return pd.concat(score_list, axis=1)
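A usage sketch with illustrative data (score_classifier comes from the original module and is not shown here; it is assumed to return a one-column DataFrame of cross-validation scores per dataset variant):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, random_state=0)
X_missing = X.copy()
X_missing[np.random.default_rng(0).random(X.shape) < 0.1] = np.nan

# Hypothetical variants: original features vs. simple mean imputation.
datasets = {
    'drop': (X, y),
    'simpl.imp.': (SimpleImputer().fit_transform(X_missing), y),
}
scores = loop_cv_score_datasets(LogisticRegression(max_iter=1000), datasets)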
Example #5
params = {
    'knn': {'n_neighbors': [1, 3, 5, 7, 10, 20]},
    'tree': {'max_depth': [None, 3, 5, 10]},
    'neural': {'max_iter': [50, 100, 200], 'hidden_layer_sizes': [(15,)]},
    'forest': {'n_estimators': [10, 20, 50, 100]}
}

std_frame = pd.DataFrame(index=classifiers.keys(), columns=['media', 'dp', 'scores'])

dataset_frames = {
    'iris':             std_frame.copy(),
    'digits':           std_frame.copy(),
    'wine':             std_frame.copy(),
    'breast-cancer':    std_frame.copy()
}

for dataset_name, dataset in datasets.items():
    for classifier_name, classifier in classifiers.items():
        gs = GridSearchCV(estimator=classifier, param_grid=params[classifier_name], scoring='accuracy', n_jobs=-1, cv=4)
        scores = cross_val_score(estimator=gs, X=dataset.data, y=dataset.target, scoring='accuracy', cv=10)
        dataset_frames[dataset_name].loc[classifier_name] = [scores.mean(), scores.std(), scores]

for dataset_name, frame in dataset_frames.items():
    print(dataset_name)
    print(frame, end='\n\n')

    boxplot = pd.DataFrame()

    for classifier_name, classifier in classifiers.items():
        boxplot[classifier_name] = frame.at[classifier_name, 'scores']

    sns.boxplot(data=boxplot, showmeans=True)
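The snippet above starts mid-file, so the classifiers and datasets dicts it iterates over are not shown. One plausible definition, inferred from the param-grid keys ('knn', 'tree', 'neural', 'forest') and the dataset_frames keys, could look like this; treat it as an assumption, not the original code.

from sklearn.datasets import load_breast_cancer, load_digits, load_iris, load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Keys match the param grids and the dataset_frames index above.
classifiers = {
    'knn': KNeighborsClassifier(),
    'tree': DecisionTreeClassifier(),
    'neural': MLPClassifier(),
    'forest': RandomForestClassifier(),
}

datasets = {
    'iris': load_iris(),
    'digits': load_digits(),
    'wine': load_wine(),
    'breast-cancer': load_breast_cancer(),
}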
Example #6
def main():

    # results_dir = "./nonstationary"
    results_dir = "./prova"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    logging.basicConfig(filename=os.path.join(results_dir, 'results.log'),
                        level=logging.INFO)

    n_samples = 200
    x_square = np.random.uniform(0, 1, (n_samples, 2))
    x_square_1 = x_square.copy()
    x_square_1[:, 1] = x_square[:, 1] + 2
    x_square_2 = x_square.copy()
    x_square_2[:, 0] = x_square[:, 0] + 2
    x_square_3 = x_square.copy()
    x_square_3[:, 0] = x_square[:, 0] + 2
    x_square_3[:, 1] = x_square[:, 1] + 2
    x_square = np.concatenate([x_square, x_square_1, x_square_2, x_square_3])
    y_square = np.zeros((x_square.shape[0], ), dtype='int')
    L = len(x_square_1)
    y_square[L:2 * L] = 1
    y_square[2 * L:3 * L] = 2
    y_square[3 * L:4 * L] = 3

    # plt.figure()
    # plt.scatter(x_square[:, 0], x_square[:, 1], c=y_square)
    # plt.show()
    # return

    datasets = {
        "square": (x_square, y_square),
    }

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data
        X = StandardScaler().fit_transform(X)

        N = 30
        num_epochs = 400
        lr_dual = 0.0008
        lr_standard = 0.008
        lmb_dual = 0.01
        lmb_standard = 0.01
        repetitions = 10

        kmeans_losses = []
        mlp_losses = []
        mlp_loss_Q = []
        mlp_loss_E = []
        mlp_time = []
        mlp_nodes = []
        trans_losses = []
        trans_loss_Q = []
        trans_loss_E = []
        trans_time = []
        trans_nodes = []
        steps = []
        for i in range(repetitions):
            model_mlp = DeepCompetitiveLayerNonStationary(
                verbose=False,
                lmb=lmb_standard,
                N=N,
                num_epochs=num_epochs,
                lr=lr_standard)
            start_time = time.time()
            model_mlp.fit(X, y)
            mlp_time.append(time.time() - start_time)
            model_mlp.compute_graph()
            model_mlp.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_{i}_standard.pdf"))
            model_mlp.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_standard.png"))
            mlp_losses.extend(model_mlp.loss_vals)
            mlp_loss_Q.extend(model_mlp.loss_Q_)
            mlp_loss_E.extend(model_mlp.loss_E_)
            mlp_nodes.extend(model_mlp.node_list_)

            model_trans = DeepTopologicalNonstationaryClustering(
                verbose=False,
                lmb=lmb_dual,
                N=N,
                num_epochs=num_epochs,
                lr=lr_dual)
            start_time = time.time()
            model_trans.fit(X, y)
            trans_time.append(time.time() - start_time)
            model_trans.compute_graph()
            model_trans.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_{i}_dual.pdf"))
            model_trans.plot_graph(
                X, y, os.path.join(results_dir, f"{dataset}_dual.png"))
            trans_losses.extend(model_trans.loss_vals)
            trans_loss_Q.extend(model_trans.loss_Q_)
            trans_loss_E.extend(model_trans.loss_E_)
            trans_nodes.extend(model_trans.node_list_)

            steps.extend(np.arange(0, len(model_trans.loss_vals)))

        losses = pd.DataFrame({
            'epoch': steps,
            'standard': mlp_losses,
            'dual': trans_losses,
        })

        sns.set_style('whitegrid')
        plt.figure(figsize=[4, 3])
        sns.lineplot(x='epoch', y='standard', data=losses, label='standard', ci=99)
        sns.lineplot(x='epoch', y='dual', data=losses, label='dual', ci=99)
        # plt.yscale('log')
        plt.ylabel('loss')
        plt.title(f'{dataset}')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss.png'))
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss.pdf'))
        plt.show()

        losses_Q = pd.DataFrame({
            'epoch': steps,
            'standard': mlp_loss_Q,
            'dual': trans_loss_Q,
        })

        sns.set_style('whitegrid')
        plt.figure(figsize=[4, 3])
        sns.lineplot(x='epoch',
                     y='standard',
                     data=losses_Q,
                     label='standard',
                     ci=99)
        sns.lineplot(x='epoch', y='dual', data=losses_Q, label='dual', ci=99)
        # plt.yscale('log')
        plt.ylabel('quantization error')
        plt.title(f'{dataset}')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_q.png'))
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_q.pdf'))
        plt.show()

        losses_E = pd.DataFrame({
            'epoch': steps,
            'standard': mlp_loss_E,
            'dual': trans_loss_E,
        })

        sns.set_style('whitegrid')
        plt.figure(figsize=[4, 3])
        sns.lineplot(x='epoch',
                     y='standard',
                     data=losses_E,
                     label='standard',
                     ci=99)
        sns.lineplot(x='epoch', y='dual', data=losses_E, label='dual', ci=99)
        # plt.yscale('log')
        plt.ylabel('topological complexity')
        plt.title(f'{dataset}')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        plt.tight_layout()
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_e.png'))
        plt.savefig(os.path.join(results_dir, f'{dataset}_loss_e.pdf'))
        plt.show()

        # nodes = pd.DataFrame({
        #     'epoch': steps,
        #     'standard': mlp_nodes,
        #     'dual': trans_nodes,
        # })
        #
        # sns.set_style('whitegrid')
        # plt.figure(figsize=[4, 3])
        # sns.lineplot('epoch', 'standard', data=nodes, label='standard', ci=99)
        # sns.lineplot('epoch', 'dual', data=nodes, label='dual', ci=99)
        # # plt.yscale('log')
        # plt.ylabel('number of prototypes')
        # plt.title(f'{dataset}')
        # plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.3), ncol=2)
        # plt.tight_layout()
        # plt.savefig(os.path.join(results_dir, f'{dataset}_nodes.png'))
        # plt.savefig(os.path.join(results_dir, f'{dataset}_nodes.pdf'))
        # plt.show()

        logging.info(f'Analyzing dataset: {dataset}')
        logging.info(
            f'Standard competitive layer elapsed time: {np.mean(mlp_time):.2f} +- {np.std(mlp_time):.2f}'
        )
        logging.info(
            f'Dual layer elapsed time: {np.mean(trans_time):.2f} +- {np.std(trans_time):.2f}'
        )
Example #7
def main():
    logging.basicConfig(filename='log',
                        level=logging.INFO,
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    numpy.random.seed(0)

    options = dict(cv=10,
                   scoring=[
                       'accuracy', 'average_precision', 'f1', 'f1_micro',
                       'f1_macro', 'f1_weighted', 'neg_log_loss', 'precision',
                       'recall', 'roc_auc'
                   ],
                   verbose=2,
                   logbook=dict(log=True,
                                filename="results/Log-%d.xlsx" % (time.time()),
                                scores_worksheet="Scores",
                                optimizations_worksheet="Optimizations",
                                estimator_file="models/%s-%s-%s.pkl"))

    logbook = xl.Workbook()
    scores_worksheet = logbook.create_sheet(
        title=options['logbook']['scores_worksheet'])
    optimizations_worksheet = logbook.create_sheet(
        title=options['logbook']['optimizations_worksheet'])

    scores_worksheet.append([
        'Time', 'Dataset', 'Classifier', 'Scoring', 'Scores', 'Mean Score',
        'Std Dev'
    ])
    optimizations_worksheet.append([
        'Time', 'Dataset', 'Classifier', 'Scoring', 'Best Score',
        'Best Params', 'Best Estimator', 'File'
    ])

    datasets = {
        'Heart Disease': {
            'filename': 'datasets/cleveland.csv',
            'drop_na': True,
        },
        'Diabetes': {
            'filename': 'datasets/diabetes.csv',
            'replace_zero_values':
                ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
        },
    }

    classifiers = {
        'Logistic Regression': {
            'clf':
            linear_model.LogisticRegression(random_state=37,
                                            C=0.13,
                                            penalty='l1'),
            'cv_params':
            dict(C=numpy.arange(0.1, 5, 0.01).tolist(), penalty=['l1', 'l2']),
            'optimize':
            True,
            'random':
            False
        },
        'Linear SVC': {
            'clf': svm.LinearSVC(random_state=37, C=18.09),
            'cv_params': dict(C=numpy.arange(0.1, 50, 0.01).tolist()),
            'optimize': True,
            'random': False
        },
        'Naive Bayes': {
            'clf': naive_bayes.GaussianNB(),
            'cv_params': dict(),
            'optimize': True,
            'random': False
        },
        'K-Nearest Neighbors': {
            'clf':
            neighbors.KNeighborsClassifier(algorithm='brute',
                                           n_jobs=-1,
                                           n_neighbors=13,
                                           weights='uniform'),
            'cv_params':
            dict(n_neighbors=numpy.arange(1, 50).tolist(),
                 weights=['uniform', 'distance']),
            'optimize':
            True,
            'random':
            False
        },
        'MLP Classifier': {
            'clf':
            neural_network.MLPClassifier(
                random_state=37,
                learning_rate_init=0.026958815931057856,
                learning_rate='constant',
                hidden_layer_sizes=(29, 26, 5),
                activation='identity',
                alpha=16.681005372000556,
                max_iter=5000),
            'cv_params':
            dict(hidden_layer_sizes=GenerateNeurons(2),
                 activation=['identity', 'logistic', 'tanh', 'relu'],
                 learning_rate=['constant', 'invscaling', 'adaptive'],
                 alpha=numpy.logspace(-5, 3, 5),
                 learning_rate_init=stats.uniform(0.001, 0.05)),
            'optimize':
            True,
            'random':
            True,
            'random_iterations':
            10000
        },
    }

    try:
        for dataset_name, dataset_options in datasets.items():

            logging.info("Dataset: %s" % (dataset_name))

            dataframe = LoadDataset(dataset_options['filename'])
            standard_scalar, features, predictions = ProcessData(
                dataframe, dataset_options)

            for classifier_name, classifier in classifiers.items():
                logging.info("-- Processing: %s ---" % (classifier_name))

                if not classifier['optimize']:
                    scoring_options = options['scoring'] if type(
                        options['scoring']) is list else [options['scoring']]
                    for scoring in scoring_options:
                        if scoring == 'neg_log_loss' and not hasattr(
                                classifier['clf'], 'predict_proba'):
                            logging.info("--- --- Skipping %s for %s" %
                                         (scoring, classifier_name))
                            continue
                        scores = model_selection.cross_val_score(
                            classifier['clf'],
                            features,
                            predictions,
                            cv=options['cv'],
                            scoring=scoring)
                        scores_worksheet.append([
                            time.time(), dataset_name, classifier_name,
                            scoring,
                            str(scores),
                            scores.mean(),
                            scores.std()
                        ])
                        logging.info("--- --- %.2f%% %s" %
                                     (scores.mean() * 100, scoring))
                else:
                    logging.info("--- --- Optimizing Hyper-Parameters")

                    scoring = options['scoring'][0] if type(
                        options['scoring']) is list else options['scoring']
                    if classifier['random']:
                        grid = model_selection.RandomizedSearchCV(
                            classifier['clf'],
                            classifier['cv_params'],
                            n_iter=classifier['random_iterations'],
                            cv=options['cv'],
                            scoring=scoring,
                            n_jobs=-1,
                            verbose=options['verbose'])
                    else:
                        grid = model_selection.GridSearchCV(
                            classifier['clf'],
                            classifier['cv_params'],
                            cv=options['cv'],
                            scoring=scoring,
                            n_jobs=-1,
                            verbose=options['verbose'])

                    grid.fit(features, predictions)

                    dump_file = options['logbook']['estimator_file'] % (
                        dataset_name, classifier_name, time.time())
                    joblib.dump(grid.best_estimator_, dump_file)

                    optimizations_worksheet.append([
                        time.time(), dataset_name, classifier_name, scoring,
                        grid.best_score_,
                        str(grid.best_params_),
                        str(grid.best_estimator_), dump_file
                    ])

                    logging.info(
                        "--- --- %.2f%% %s with %s" %
                        (grid.best_score_ * 100, scoring, classifier_name))
                    for key, value in grid.best_params_.items():
                        logging.info("--- --- -- %s: %s" % (key, value))

                logging.info("--- Finished Processing")
    except KeyboardInterrupt:
        logging.error("XXXXX Keyboard Interrupt XXXXX")
    finally:
        logging.info("Saving Worksheet %s" % (options['logbook']['filename']))
        logbook.save(filename=options['logbook']['filename'])
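LoadDataset, ProcessData and GenerateNeurons are helpers defined elsewhere in the original module. A hypothetical reading of the first two, inferred only from the 'drop_na' and 'replace_zero_values' dataset options and from how their return values are used, might look like the sketch below; every step here is an assumption.

import numpy
import pandas as pd
from sklearn import preprocessing


def LoadDataset(filename):
    # The last column is assumed to hold the prediction target.
    return pd.read_csv(filename)


def ProcessData(dataframe, dataset_options):
    if dataset_options.get('drop_na'):
        dataframe = dataframe.dropna()
    # Treat zeros as missing in the listed columns and impute the median.
    for column in dataset_options.get('replace_zero_values', []):
        dataframe[column] = dataframe[column].replace(0, numpy.nan)
        dataframe[column] = dataframe[column].fillna(dataframe[column].median())
    features = dataframe.iloc[:, :-1]
    predictions = dataframe.iloc[:, -1]
    scaler = preprocessing.StandardScaler().fit(features)
    return scaler, scaler.transform(features), predictions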
Example #8
                                        n_components=n_components)
    methods['MDS'] = manifold.MDS(n_components=n_components,
                                  max_iter=100,
                                  n_init=1)

    k_methods = dict()
    k_methods["K_Isomap"] = drm.k_isomap
    k_methods["K_LLE"] = drm.k_lle
    k_methods["K_LE"] = drm.k_le
    k_methods["K_MDS"] = drm.k_mds

    for label, method in methods.items():
        Y_conv = method.fit_transform(X)
        kstr = "K_" + label
        Y_k = k_methods[kstr](X, n_components, n_neighbors)

        curve_data, area, cname = quality_curve(X, Y_conv, n_neighbors, 'r',
                                                False)
        curve_data2, area2, _ = quality_curve(X, Y_k, n_neighbors, 'r', False)

        draw_curve(curve_data, area, curve_data2, area2, cname, label,
                   data_name, n_neighbors)


#methods
for data_name, X in datasets.items():
    nsamples = len(X)
    n_components = 2
    n_neighbors = 20
    preform_methods(X, data_name, nsamples, n_components, n_neighbors)
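This example is truncated at the top, so the construction of the methods dict (and of drm and datasets) is missing. Mirroring the K_Isomap/K_LLE/K_LE/K_MDS keys used above, a plausible reconstruction of methods with sklearn.manifold is sketched below; it is an assumption, not the original code.

from sklearn import manifold

n_components = 2
n_neighbors = 20

# Labels must match the "K_" + label lookup in the loop above.
methods = dict()
methods['Isomap'] = manifold.Isomap(n_neighbors=n_neighbors,
                                    n_components=n_components)
methods['LLE'] = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                                 n_components=n_components)
methods['LE'] = manifold.SpectralEmbedding(n_neighbors=n_neighbors,
                                           n_components=n_components)
methods['MDS'] = manifold.MDS(n_components=n_components,
                              max_iter=100,
                              n_init=1)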
Example #9
def main():
    results_dir = "cole"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    n_samples = 30
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, centers=3, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    y = np.asarray([i for i in range(1000)])
    x = np.asarray([i % 4 for i in range(1000)])
    X_gabri = np.vstack([x, y]).T
    y_gabri = x
    # plt.figure()
    # sns.scatterplot(X_gabri[:, 0], X_gabri[:, 1], hue=y)
    # plt.show()

    theta = np.radians(np.linspace(30, 360 * 4, n_samples))
    r = theta**2
    x_2 = r * np.cos(theta)
    y_2 = r * np.sin(theta)
    X_spiral = np.vstack([x_2, y_2]).T
    y_spiral = np.zeros(len(X_spiral))
    # plt.figure()
    # plt.scatter(x_2, y_2)
    # plt.show()

    (x_train, y_train), (x_test, y_test) = load_data()
    x_test = np.reshape(x_test,
                        (x_test.shape[0], x_test.shape[1] * x_test.shape[1]))

    x_digits, y_digits = load_digits(return_X_y=True)

    datasets = {
        # "Spiral": (X_spiral, y_spiral),
        # "digits": (x_digits, y_digits),
        # "mnist": (x_test[:5000], y_test[:5000]),
        # "gabri": (X_gabri, y_gabri),
        # "noisy_circles": noisy_circles,
        # "noisy_moons": noisy_moons,
        "blobs": blobs,
        # "aniso": aniso,
        # "varied": varied,
    }

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data

        # if dataset == 'mnist':
        #     mnist_file = os.path.join(results_dir, 'mnist.csv')
        #     if not os.path.isfile(mnist_file):
        #         X2 = X / 255
        #         X3 = np.expand_dims(X2, axis=3)
        #         X4 = np.append(X3, X3, axis=3)
        #         X4 = np.append(X4, X3, axis=3)
        #         XZ = np.zeros((X4.shape[0], 32, 32, 3))
        #         XZ[:, 2:30, 2:30, :] = X4
        #         IMG_SHAPE = (32, 32, 3)
        #         pretrained_model = tf.keras.applications.MobileNet(input_shape=IMG_SHAPE,
        #                                                      include_top=False,
        #                                                      weights='imagenet')
        #         # Unfreeze the base model
        #         pretrained_model.trainable = True
        #         inputs = Input(shape=IMG_SHAPE)
        #         x = pretrained_model(inputs, training=True)
        #         x = GlobalAveragePooling2D()(x)
        #         outputs = Dense(len(set(y)))(x)
        #         pre_model = Model(inputs, outputs)
        #         pre_model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),  # Very low learning rate
        #                       loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
        #                       metrics=[tf.keras.metrics.CategoricalAccuracy()])
        #
        #         # Train end-to-end. Be careful to stop before you overfit!
        #         pre_model.fit(XZ, to_categorical(y), epochs=70, batch_size=200)
        #
        #         preds = pretrained_model.predict(XZ)
        #         X = np.reshape(preds, (preds.shape[0], preds.shape[-1]))
        #         Xpd = pd.DataFrame(X)
        #         Xpd.to_csv(mnist_file)
        #
        #     else:
        #         X = pd.read_csv(mnist_file, index_col=0).values
        #
        #     # # Freeze base model
        #     # pretrained_model.trainable = False
        #     # # Create new model on top.
        #     # inputs = Input(shape=IMG_SHAPE)
        #     # x = pretrained_model(inputs, training=False)
        #     # x = GlobalAveragePooling2D()(x)
        #     # outputs = Flatten()(x)
        #     # X = XZ

        X = StandardScaler().fit_transform(X)

        n = X.shape[0]
        d = X.shape[1]
        latent_dim = 2
        k = 3
        lr = 0.008
        epochs = 800
        lbd = 0.01

        # inputs = Input(shape=(d,), name='input')
        # outputs = inputs
        # model = BaseModel(n_features=d, k_prototypes=k, inputs=inputs, outputs=outputs, deep=False)
        # optimizer = tf.keras.optimizers.Adam(learning_rate=0.008)
        # model.compile(optimizer=optimizer)
        # model.summary()
        # model.fit(X, y, epochs=epochs, verbose=True)
        # x_pred = model.predict(X)
        # prototypes = model.base_model.weights[-1].numpy()
        # # G = compute_graph(x_pred, prototypes)
        # # plt.figure()
        # # plot_confusion_matrix(x_pred, prototypes, y)
        # # plt.show()
        # plt.figure()
        # scatterplot(x_pred, prototypes, y, valid=False)
        # plt.savefig(os.path.join(results_dir, dataset + ".png"))
        # plt.show()

        inputs = Input(shape=(d, ), name='input')
        outputs = inputs
        # x = Dense(512)(inputs)
        # outputs = Dense(256)(x)
        # x = Dense(128)(x)
        # outputs = Dense(64)(x)
        model = DualModel(n_samples=n,
                          k_prototypes=k,
                          inputs=inputs,
                          outputs=outputs,
                          deep=False)
        optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        model.compile(optimizer=optimizer)
        model.summary()
        model.fit(X, y, epochs=epochs)
Example #10
def main():

    results_dir = "./paper0"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    logging.basicConfig(filename=os.path.join(results_dir, 'results.log'),
                        level=logging.INFO)

    n_samples = 500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    y = np.asarray([i for i in range(1000)])
    x = np.asarray([i % 4 for i in range(1000)])
    X_gabri = np.vstack([x, y]).T
    y_gabri = x

    theta = np.radians(np.linspace(30, 360 * 4, n_samples))
    r = theta**2
    x_2 = r * np.cos(theta)
    y_2 = r * np.sin(theta)
    X_spiral = np.vstack([x_2, y_2]).T
    y_spiral = np.zeros(len(X_spiral))
    # plt.figure()
    # plt.scatter(x_2, y_2)
    # plt.show()

    Xl, yl = make_classification(n_samples=n_samples,
                                 n_features=2,
                                 class_sep=25,
                                 n_informative=2,
                                 n_redundant=0,
                                 hypercube=False,
                                 n_classes=4,
                                 n_clusters_per_class=1,
                                 random_state=42)
    Xh, yh = make_classification(n_samples=n_samples,
                                 n_features=3000,
                                 class_sep=8,
                                 n_informative=10,
                                 n_redundant=2990,
                                 hypercube=False,
                                 n_classes=4,
                                 n_clusters_per_class=1,
                                 random_state=42)

    x_digits, y_digits = load_digits(return_X_y=True)

    datasets = {
        # "digits": (x_digits, y_digits),
        # "Spiral": (X_spiral, y_spiral),
        # "Circles": noisy_circles,
        "Moons": noisy_moons,
        # "Blobs (low)": (Xl, yl),
        # "Blobs (high)": (Xh, yh),
        # "gabri": (X_gabri, y_gabri),
        # "Ellipsoids": aniso,
        # "Blobs": blobs,
        # "varied": varied,
    }

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data
        X = StandardScaler().fit_transform(X)

        # u, s, vh = np.linalg.svd(X)
        # print(f'dataset: {dataset} | max s: {np.max(s)} - min s: {np.min(s)}')
        # continue

        ns, nf = X.shape
        k = 40
        epochs = 800
        lr_dual = 0.000008
        lr_base = 0.008
        lmb_dual = 0.01
        lmb_standard = 0.01
        repetitions = 1

        kmeans_losses = []
        mlp_losses = []
        mlp_loss_Q = []
        mlp_loss_E = []
        mlp_time = []
        mlp_nodes = []
        trans_losses = []
        trans_loss_Q = []
        trans_loss_E = []
        trans_time = []
        trans_nodes = []
        steps = []
        for i in range(repetitions):
            inputs = Input(shape=(nf, ), name='input')
            vanilla = BaseModel(n_features=nf,
                                k_prototypes=k,
                                deep=False,
                                inputs=inputs,
                                outputs=inputs)
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr_base)
            vanilla.compile(optimizer=optimizer)
            # model.layers[1].summary()
            vanilla.fit(X, y, epochs=epochs, verbose=False)
            prototypes = vanilla.base_model.weights[-1].numpy()
            # plt.figure(figsize=[4, 3])
            # dynamic_decay(X, model.prototypes_[:200], dim=0, valid=True, scale='log')
            # plt.savefig(f'{dataset}_decayX_vanilla.pdf')
            # plt.savefig(f'{dataset}_decayX_vanilla.png')
            # plt.show()
            # plt.figure(figsize=[4, 3])
            # dynamic_decay(X, vanilla.prototypes_[:40], valid=True, scale='log')
            # plt.savefig(f'{dataset}_decayY_vanilla.pdf')
            # plt.savefig(f'{dataset}_decayY_vanilla.png')
            # plt.show()
            # plt.figure()
            # scatterplot_dynamic(X, model.prototypes_, y, valid=True)
            # plt.savefig(f'{dataset}_dynamic_vanilla.pdf')
            # plt.savefig(f'{dataset}_dynamic_vanilla.png')
            # plt.show()

            # Dual
            print("Dual Model")
            inputs = Input(shape=(nf, ), name='input')
            model = DualModel(n_samples=ns,
                              k_prototypes=k,
                              deep=False,
                              inputs=inputs,
                              outputs=inputs)
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr_dual)
            model.compile(optimizer=optimizer)
            model.fit(X, y, epochs=epochs, verbose=False)
            x_pred = model.predict(X)
            prototypes = model.dual_model.predict(x_pred.T)
            # plt.figure(figsize=[4, 3])
            # dynamic_decay(X, model.prototypes_[:200], dim=0, valid=True, scale='log')
            # plt.savefig(f'{dataset}_decayX_dual.pdf')
            # plt.savefig(f'{dataset}_decayX_dual.png')
            # plt.show()
            plt.figure(figsize=[4, 3])
            dynamic_decay(X,
                          vanilla.prototypes_,
                          is_dual=False,
                          valid=True,
                          scale='log',
                          c='b')
            dynamic_decay(X, model.prototypes_, valid=True, scale='log', c='r')
            plt.savefig(f'{dataset}_decayY_dual.pdf')
            plt.savefig(f'{dataset}_decayY_dual.png')
            plt.show()
Example #11
def main():
    results_dir = "results"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    n_samples = 500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    y = np.asarray([i for i in range(1000)])
    x = np.asarray([i % 4 for i in range(1000)])
    X_gabri = np.vstack([x, y]).T
    y_gabri = x
    # plt.figure()
    # sns.scatterplot(X_gabri[:, 0], X_gabri[:, 1], hue=y)
    # plt.show()

    (x_train, y_train), (x_test, y_test) = load_data()
    x_test = np.reshape(x_test,
                        (x_test.shape[0], x_test.shape[1] * x_test.shape[1]))

    x_digits, y_digits = load_digits(return_X_y=True)

    datasets = {
        "digits": (x_digits, y_digits),
        "mnist": (x_test, y_test),
        "gabri": (X_gabri, y_gabri),
        "noisy_circles": noisy_circles,
        "noisy_moons": noisy_moons,
        "blobs": blobs,
        "aniso": aniso,
        "varied": varied,
    }
    # for dataset, data in datasets.items():
    #     d = pd.DataFrame(data)
    #     d.to_csv(f"{dataset}.csv")
    # return

    bar_position = 0
    progress_bar = tqdm(datasets.items(), position=bar_position)
    for dataset, data in progress_bar:
        progress_bar.set_description("Analysis of dataset: %s" % dataset)
        X, y = data

        if dataset == 'mnist':
            X2 = X / 255
            X3 = np.expand_dims(X2, axis=3)
            X4 = np.append(X3, X3, axis=3)
            X4 = np.append(X4, X3, axis=3)
            XZ = np.zeros((X4.shape[0], 32, 32, 3))
            XZ[:, 2:30, 2:30, :] = X4
            IMG_SHAPE = (32, 32, 3)
            base_model = tf.keras.applications.MobileNet(input_shape=IMG_SHAPE,
                                                         include_top=False,
                                                         weights='imagenet')
            preds = base_model.predict(XZ)
            preds = np.reshape(preds, (preds.shape[0], preds.shape[-1]))
            X = preds

        X = StandardScaler().fit_transform(X)

        N = 500
        model = DeepTopologicalClustering(verbose=False,
                                          N=N,
                                          num_epochs=400,
                                          lr=0.0008)
        model.fit(X)
        accuracy = model.score(y)

        print(f'Accuracy: {accuracy:.4f}')

        title = f'Accuracy: {accuracy:.4f}'
        model.plot_confusion_matrix(
            y, title, os.path.join(results_dir, f"{dataset}_confmat.pdf"))