def run_lda():
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    processor = P1()
    datasets = [Diabetes(), Adult()]
    estimators = [
        Config(name='lda',
               estimator=LinearDiscriminantAnalysis(),
               cv=kfold,
               params={})
    ]

    # Validation curves.
    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset, estimator=estimator)
            processor.process_validations(dataset=dataset, estimator=estimator)
    processor.plot_validation()

    # Grid search for the best hyper-parameters.
    for dataset in datasets:
        for estimator in estimators:
            estimator = processor.get_default_model(dataset=dataset, estimator=estimator)
            processor.param_selection(dataset=dataset, estimator=estimator)
    processor.print_best_params()

    # Learning curves.
    for dataset in datasets:
        for estimator in estimators:
            processor.process(dataset=dataset, estimator=estimator)
    processor.plot_learning_curves()
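# A minimal, self-contained sketch of the cross-validation that run_lda drives,
# assuming only sklearn; load_breast_cancer is a stand-in for the project's
# Diabetes/Adult wrappers and the P1/Config machinery, which are not shown here.
from sklearn.datasets import load_breast_cancer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score

def lda_cv_demo():
    X, y = load_breast_cancer(return_X_y=True)
    kfold = KFold(n_splits=5, shuffle=True, random_state=1)
    scores = cross_val_score(LinearDiscriminantAnalysis(), X, y, cv=kfold)
    print("LDA 5-fold accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))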
def run_feature_importance():
    for dataset in [Diabetes(), Adult()]:
        processor = Processor3()
        processor.latext_start_figure()
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')

        # Rank features by random-forest importance.
        forest = RandomForestClassifier(n_estimators=500, random_state=1)
        forest.fit(X_train, y_train)
        importances = forest.feature_importances_
        indices = np.argsort(importances)[::-1]

        # Collect the ten most important features.
        top_10 = []
        top_10_vals = []
        top_10_idx = []
        for f, idx in enumerate(indices[:10]):
            print("%2d) %-30s %f" % (f + 1, dataset.fields[idx], importances[idx]))
            top_10.append(dataset.fields[idx])
            top_10_idx.append(idx)
            top_10_vals.append(importances[idx])
        print(top_10)
        print(top_10_idx)

        plt.title('Feature Importance')
        plt.bar(top_10, top_10_vals, align='center')
        plt.xticks(top_10, rotation=90)
        plt.tight_layout()
        # plt.show()
        filename = '%s_%s' % ('features', dataset.__class__.__name__)
        chart_path = 'report/images/%s.png' % filename
        plt.savefig(chart_path)
        plt.close()
        processor.latex_subgraph(dataset=dataset.__class__.__name__, fig=filename,
                                 caption=dataset.__class__.__name__, filename=filename)
        processor.latex_end_figure(caption="Feature Importance", fig="feature_importance")
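# The same top-10 extraction pattern on synthetic data, as a self-contained
# sketch; make_classification and the generic feature names are stand-ins for
# the project's datasets and dataset.fields.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

def feature_importance_demo():
    X, y = make_classification(n_samples=500, n_features=20, random_state=1)
    forest = RandomForestClassifier(n_estimators=500, random_state=1)
    forest.fit(X, y)
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1][:10]
    for rank, idx in enumerate(indices, start=1):
        print("%2d) %-30s %f" % (rank, 'feature_%d' % idx, importances[idx]))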
print("Directory '%s' can not be created") parser = argparse.ArgumentParser(description='Find X Coding Quiz') parser.add_argument('-m', '--mode', help='Mode', default='debug', dest='mode') args = parser.parse_args() processor = Processor3() datasets = [ Diabetes(), Adult(), ] run_feature_importance() compute_kmeans_elbow_curves() visualize_kmeans_clusters() run_dimension_reductions() run_nn_opt() run_nn_opt_clusters() ## TODO: Uncomment to get the other charts compute_kmeans_elbow_curves() compute_em_elbow_curves() run_bics() plot_pca_variance()
from sklearn.decomposition import FastICA
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.decomposition import PCA
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.random_projection import SparseRandomProjection
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from hw1.utils import Adult
from hw1.utils import Diabetes
from hw3.main import Processor3

datasets = [Adult(), Diabetes()]
for dataset in datasets:
    processor = Processor3()
    processor.latext_start_figure()
    X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
    n_clusters = len(dataset.label_encoder.classes_)

    # Keep enough components to explain 95% of the variance.
    pca = PCA(n_components=0.95)
    pca.fit(X_train)
    n_components = pca.components_.shape[0]
    print(f"n_components: {n_components}")

    dr_models = [
        PCA(n_components=n_components, random_state=0),
        FastICA(n_components=n_components, random_state=0),
        MiniBatchDictionaryLearning(n_components=n_components, alpha=1,
                                    batch_size=200, n_iter=10, random_state=0),
        SparseRandomProjection(random_state=0, n_components=n_components),
    ]
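# run_bics is called from the entry point but not shown; a minimal sketch of
# BIC-based model selection with the GaussianMixture imported above, on
# synthetic blobs rather than the project's datasets.
from sklearn.datasets import make_blobs

def gmm_bic_demo():
    X, _ = make_blobs(n_samples=500, centers=3, random_state=0)
    bics = {k: GaussianMixture(n_components=k, random_state=0).fit(X).bic(X)
            for k in range(1, 8)}
    best_k = min(bics, key=bics.get)
    print("BIC by k:", bics, "-> best k:", best_k)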
def run_dimension_reductions():
    global mean
    for dataset in [Diabetes(), Adult()]:
        processor = Processor3()
        processor.latext_start_figure()
        X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')

        # Keep enough components to explain 95% of the variance.
        pca = PCA(n_components=0.95)
        pca.fit(X_train)
        n_components = pca.components_.shape[0]
        print(f"n_components: {n_components}")

        random_state = 0
        dr_models = [
            PCA(n_components=n_components, random_state=0),
            FastICA(n_components=n_components, random_state=0),
            MiniBatchDictionaryLearning(n_components=n_components, alpha=1,
                                        batch_size=200, n_iter=10,
                                        random_state=random_state),
            SparseRandomProjection(random_state=0, n_components=n_components),
        ]

        X_train = pd.DataFrame(X_train)
        y_train = pd.DataFrame(y_train)
        for dr_model in dr_models:
            X_train_dr = pd.DataFrame(data=dr_model.fit_transform(X_train),
                                      index=X_train.index)
            # Reconstruct the training set from the reduced representation;
            # each family of models exposes the inverse differently.
            if isinstance(dr_model, SparseRandomProjection):
                X_train_inverse = np.array(X_train_dr).dot(dr_model.components_.todense())
            elif isinstance(dr_model, MiniBatchDictionaryLearning):
                X_train_inverse = np.array(X_train_dr).dot(dr_model.components_) \
                                  + np.array(X_train.mean(axis=0))
            else:
                X_train_inverse = dr_model.inverse_transform(X_train_dr)
            X_train_inverse = pd.DataFrame(data=X_train_inverse, index=X_train.index)
            scatterPlot(X_train_dr, y_train, dr_model.__class__.__name__)
            # plt.show()

            # Reconstruction error as an anomaly score.
            anomaly_scores = anomalyScores(X_train, X_train_inverse)
            mean = np.mean(anomaly_scores)
            print(mean)
            preds = plotResults(y_train, anomaly_scores, True,
                                dr_model.__class__.__name__,
                                dataset.__class__.__name__, mean)

        processor.latex_end_figure(
            caption=f"{dataset.__class__.__name__} Precision-Recall Curve",
            fig=f"pr_{dataset.__class__.__name__}")
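# scatterPlot, anomalyScores, and plotResults are used above but defined
# elsewhere in the repo. The sketches below are plausible minimal versions of
# reconstruction-error / precision-recall helpers of this shape, not the
# project's exact implementations.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import average_precision_score, precision_recall_curve

def scatterPlot(xDF, yDF, algoName):
    # Plot the first two components, colored by label.
    tempDF = pd.DataFrame(data=xDF.loc[:, 0:1], index=xDF.index)
    tempDF = pd.concat((tempDF, yDF), axis=1, join='inner')
    tempDF.columns = ['First Vector', 'Second Vector', 'Label']
    sns.lmplot(x='First Vector', y='Second Vector', hue='Label',
               data=tempDF, fit_reg=False)
    plt.title(algoName)

def anomalyScores(originalDF, reducedDF):
    # Per-sample squared reconstruction error, min-max scaled to [0, 1].
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = (loss - loss.min()) / (loss.max() - loss.min())
    return pd.Series(loss, index=originalDF.index)

def plotResults(trueLabels, scores, returnPreds, algoName, dataName, mean_score):
    # Precision-recall curve of the anomaly scores against the true labels.
    preds = pd.DataFrame({'trueLabel': np.ravel(trueLabels),
                          'anomalyScore': np.asarray(scores)})
    precision, recall, _ = precision_recall_curve(preds['trueLabel'],
                                                  preds['anomalyScore'])
    ap = average_precision_score(preds['trueLabel'], preds['anomalyScore'])
    plt.step(recall, precision, where='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('%s on %s: AP=%.2f (mean score %.3f)' % (algoName, dataName, ap, mean_score))
    if returnPreds:
        return preds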