Example #1
def run_lda():
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    processor = P1()
    datasets = [Diabetes(), Adult()]
    estimators = [
        Config(name='lda',
               estimator=LinearDiscriminantAnalysis(),
               cv=kfold,
               params={})
    ]

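    # Pass 1: plot validation curves for every dataset/estimator pair.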
    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset,
                                                    estimator=estimator)
            processor.process_validations(dataset=dataset, estimator=estimator)
            processor.plot_validation()

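    # Pass 2: search the (here empty) parameter grid and report the best setting.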
    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset,
                                                    estimator=estimator)
            processor.param_selection(dataset=dataset, estimator=estimator)
            processor.print_best_params()

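    # Pass 3: fit the final model and plot learning curves.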
    for dataset in datasets:
        for estimator in estimators:
            processor.process(dataset=dataset, estimator=estimator)
            processor.plot_learning_curves()
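
Config is a shared project container whose definition is not part of this listing. A minimal sketch of what it could look like, assuming it simply bundles a name, an estimator, a CV splitter, and a parameter grid:

from dataclasses import dataclass, field
from typing import Any

@dataclass
class Config:
    # Hypothetical stand-in for the project's Config container;
    # the real class is defined elsewhere in the repository.
    name: str
    estimator: Any
    cv: Any
    params: dict = field(default_factory=dict)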
Example #2
def run_nn_opt():
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    processor = P1()
    datasets = [Diabetes()]
    # 'random_hill_climb', 'simulated_annealing', 'genetic_alg'
    estimators = [
        Config(name='NN_%s' % title('random_hill_climb'),
               estimator=mlrose.NeuralNetwork(
                   algorithm='random_hill_climb',
                   random_state=1,
                   max_iters=200,
                   hidden_nodes=[64],
                   early_stopping=True,
               ),
               cv=kfold,
               params={'restarts': [0, 10, 20, 30, 40, 50]}),
        Config(name='NN_%s' % title('simulated_annealing'),
               estimator=mlrose.NeuralNetwork(
                   algorithm='simulated_annealing',
                   random_state=1,
                   max_iters=200,
                   hidden_nodes=[64],
                   early_stopping=True,
               ),
               cv=kfold,
               params={'max_iters': [200]}),
        Config(name='NN_%s' % title('genetic_alg'),
               estimator=mlrose.NeuralNetwork(
                   algorithm='genetic_alg',
                   random_state=1,
                   max_iters=200,
                   hidden_nodes=[64],
                   early_stopping=True,
               ),
               cv=kfold,
               params={
                   'pop_size': [100, 200, 300, 400, 500, 600, 700, 800, 900],
                   'mutation_prob':
                   [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
               }),
    ]

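    # Same three passes as Example #1: validation curves, parameter search,
    # then learning curves.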
    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset,
                                                    estimator=estimator)
            processor.process_validations(dataset=dataset, estimator=estimator)
            processor.plot_validation()

    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset,
                                                    estimator=estimator)
            processor.param_selection(dataset=dataset, estimator=estimator)
            processor.print_best_params()

    for dataset in datasets:
        for estimator in estimators:
            processor.process(dataset=dataset, estimator=estimator)
            processor.plot_learning_curves()
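
The title helper used to build the estimator names is likewise defined elsewhere in the project; a plausible one-liner, assuming it only prettifies the algorithm identifier:

def title(name):
    # Hypothetical: turns 'random_hill_climb' into 'Random Hill Climb'.
    return name.replace('_', ' ').title()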
Example #3
def run_nn_opt():
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    processor = P1()
    datasets = [Diabetes()]
    estimators = [
        Config(name='NN_%s' % title('pca'),
               estimator=Pipeline([('pca', PCA(n_components=7,
                                               random_state=0)),
                                   ('nn', MLPClassifier(max_iter=1000))]),
               cv=kfold,
               params={}),
        Config(name='NN_%s' % title('ica'),
               estimator=Pipeline([('ica',
                                    FastICA(n_components=7, random_state=0)),
                                   ('nn', MLPClassifier(max_iter=1000))]),
               cv=kfold,
               params={}),
        Config(name='NN_%s' % title('rp'),
               estimator=Pipeline([('rp',
                                    SparseRandomProjection(random_state=0,
                                                           n_components=7)),
                                   ('nn', MLPClassifier(max_iter=1000))]),
               cv=kfold,
               params={}),
        Config(name='NN_%s' % title('dl'),
               estimator=Pipeline([
                   ('dl',
                    MiniBatchDictionaryLearning(n_components=7,
                                                batch_size=200,
                                                n_iter=1000,
                                                random_state=0)),
                   ('nn', MLPClassifier(max_iter=1000))
               ]),
               cv=kfold,
               params={})
    ]

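    # control=True presumably also runs an unreduced baseline for
    # comparison (project-specific flag).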
    for dataset in datasets:
        for estimator in estimators:
            processor.process(dataset=dataset,
                              estimator=estimator,
                              control=True)
            processor.plot_learning_curves(control=True)
Example #4
def run_feature_importance():
    processor = Processor3()
    processor.latext_start_figure()
    for dataset in [Diabetes(), Adult()]:
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        forest = RandomForestClassifier(n_estimators=500, random_state=1)
        forest.fit(X_train, y_train)
        importances = forest.feature_importances_

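        # Rank features by impurity-based importance, descending, and keep
        # the ten most important.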
        indices = np.argsort(importances)[::-1]
        top_10 = []
        top_10_vals = []
        top_10_idx = []
        for f, idx in enumerate(indices[:10]):
            print("%2d) %-*s %f" %
                  (f + 1, 30, dataset.fields[idx], importances[idx]))
            top_10.append(dataset.fields[idx])
            top_10_idx.append(idx)
            top_10_vals.append(importances[idx])

        print(top_10)
        print(top_10_idx)

        plt.title('Feature Importance')
        plt.bar(top_10, top_10_vals, align='center')
        plt.xticks(rotation=90)
        plt.tight_layout()
        # plt.show()
        filename = '%s_%s' % ('features', dataset.__class__.__name__)
        chart_path = 'report/images/%s.png' % filename
        plt.savefig(chart_path)
        plt.close()
        processor.latex_subgraph(dataset=dataset.__class__.__name__,
                                 fig=filename,
                                 caption=dataset.__class__.__name__,
                                 filename=filename)

    processor.latex_end_figure(caption=f"Feature Importance",
                               fig=f"feature_importance")
Example #5
def run_nn_opt_clusters():
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    processor = P1()
    datasets = [Diabetes()]

    dr_models = [
        PCA(n_components=7, random_state=0),
        FastICA(n_components=7, random_state=0),
        MiniBatchDictionaryLearning(n_components=7,
                                    batch_size=200,
                                    n_iter=1000,
                                    random_state=0),
        SparseRandomProjection(random_state=0, n_components=7)
    ]

    clustering_models = []
    for i in [2]:
        clustering_models.append(
            KMeans(n_clusters=i,
                   init='k-means++',
                   n_init=10,
                   max_iter=600,
                   random_state=0,
                   tol=0.0001))
        clustering_models.append(
            GaussianMixture(n_components=i,
                            n_init=10,
                            max_iter=600,
                            random_state=0,
                            tol=0.0001))

    configs = {}
    for model in dr_models:
        configs[model.__class__.__name__.lower()] = {}

    for dataset in datasets:
        dataset_name = dataset.__class__.__name__
        X_train, X_test, y_train, y_test, target_names = dataset.get_data(
            model='KMeans')

        for model in dr_models:
            reduction = model.__class__.__name__.lower()
            print(reduction)
            if reduction == 'lineardiscriminantanalysis':
                X_r = model.fit_transform(X_train, y_train)
                X_r_test = model.transform(X_test)
            else:
                X_r = model.fit_transform(X_train)
                X_r_test = model.transform(X_test)

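            # Clusters are fit on the raw training features; the predicted
            # label is appended as an extra column of the reduced data.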
            for clustering_model in clustering_models:
                try:
                    clusters = clustering_model.n_components
                except AttributeError:
                    clusters = clustering_model.n_clusters

                preds = clustering_model.fit_predict(X_train)
                preds_test = clustering_model.predict(X_test)
                df = pd.DataFrame(X_r)
                df['cluster'] = preds
                df_test = pd.DataFrame(X_r_test)
                df_test['cluster'] = preds_test
                print('done')
                data = {
                    'X_train': df,
                    'X_test': df_test,
                    'y_train': y_train,
                    'y_test': y_test
                }

                estimator = Config(
                    name='NN_%s_%s_%i' %
                    (clustering_model.__class__.__name__.lower(), reduction,
                     clusters),
                    estimator=Pipeline([('nn', MLPClassifier(max_iter=1000))]),
                    cv=kfold,
                    params={})

                processor.process(dataset=dataset,
                                  estimator=estimator,
                                  control=True,
                                  data=data)
                processor.plot_learning_curves(control=True, suffix='_part5')
Example #6
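    # NOTE: the listing starts mid-function; the try block this except
    # clause belongs to (presumably wrapping os.makedirs) is cut off.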
    except OSError as error:
        print("Directory cannot be created: %s" % error)

    parser = argparse.ArgumentParser(description='Find X Coding Quiz')

    parser.add_argument('-m',
                        '--mode',
                        help='Mode',
                        default='debug',
                        dest='mode')
    args = parser.parse_args()

    processor = Processor3()

    datasets = [
        Diabetes(),
        Adult(),
    ]

    run_feature_importance()
    compute_kmeans_elbow_curves()
    visualize_kmeans_clusters()
    run_dimension_reductions()
    run_nn_opt()
    run_nn_opt_clusters()

    ## TODO: Uncomment to get the other charts

    # compute_kmeans_elbow_curves()
    # compute_em_elbow_curves()
    # run_bics()
Example #7
from sklearn.decomposition import FastICA
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.random_projection import SparseRandomProjection

from hw1.utils import Adult
from hw1.utils import Diabetes
from hw3.main import Processor3

datasets = [Adult(), Diabetes()]

for dataset in datasets:
    processor = Processor3()
    processor.latext_start_figure()
    X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
    n_clusters = len(dataset.label_encoder.classes_)
    pca = PCA(n_components=0.95)
    pca.fit(X_train)
    n_components = pca.components_.shape[0]
    print(f"n_components: {n_components}")
    dr_models = [
        PCA(n_components=n_components, random_state=0),
        FastICA(n_components=n_components, random_state=0),
        MiniBatchDictionaryLearning(n_components=n_components,
                                    alpha=1,
                                    # remaining arguments assumed to match
                                    # the dr_models list in Example #8
                                    batch_size=200,
                                    n_iter=10,
                                    random_state=0),
        SparseRandomProjection(random_state=0, n_components=n_components)
    ]
Example #8
def run_dimension_reductions():
    global mean
    for dataset in [Diabetes(), Adult()]:
        processor = Processor3()
        processor.latext_start_figure()
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
        pca = PCA(n_components=0.95)
        pca.fit(X_train)
        n_components = pca.components_.shape[0]
        print(f"n_components: {n_components}")

        random_state = 0
        dr_models = [
            PCA(n_components=n_components, random_state=0),
            FastICA(n_components=n_components, random_state=0),
            MiniBatchDictionaryLearning(n_components=n_components,
                                        alpha=1,
                                        batch_size=200,
                                        n_iter=10,
                                        random_state=random_state),
            SparseRandomProjection(random_state=0, n_components=n_components)
        ]
        for dr_model in dr_models:
            X_train = pd.DataFrame(X_train)
            y_train = pd.DataFrame(y_train)
            model_name = dr_model.__class__.__name__

            X_train_reduced = pd.DataFrame(data=dr_model.fit_transform(X_train),
                                           index=X_train.index)
            if isinstance(dr_model, SparseRandomProjection):
                # No inverse_transform available; invert via the sparse
                # projection matrix.
                X_train_inverse = pd.DataFrame(
                    data=np.array(X_train_reduced).dot(
                        dr_model.components_.todense()),
                    index=X_train.index)
            elif isinstance(dr_model, MiniBatchDictionaryLearning):
                # Reconstruct from the dictionary atoms and re-add the mean.
                X_train_inverse = pd.DataFrame(
                    data=np.array(X_train_reduced).dot(dr_model.components_) +
                    np.array(X_train.mean(axis=0)),
                    index=X_train.index)
            else:
                # PCA and FastICA support inverse_transform directly.
                X_train_inverse = pd.DataFrame(
                    data=dr_model.inverse_transform(X_train_reduced),
                    index=X_train.index)

            scatterPlot(X_train_reduced, y_train, model_name)

            # plt.show()

            anomaly_scores = anomalyScores(X_train, X_train_inverse)
            mean = np.mean(anomaly_scores)
            print(mean)
            preds = plotResults(y_train, anomaly_scores, True, model_name,
                                dataset.__class__.__name__, mean)
        processor.latex_end_figure(
            caption=f"{dataset.__class__.__name__} Precision-Recall Curve",
            fig=f"pr_{dataset.__class__.__name__}")