Compare the traning time and test time of the milestone 1, 2, and 3 methods (plus task 4 - if your team did it). Use the average (or mode) runtime over 10 re-runs and perform a suitable statistical test to assess whether one of those performs significantly better than the others w.r.t. efficieny of training and test time. """ #%% 1. Data reading and preprocessing import pandas as pd from preprocessing import preproc, setUsedData # Read data # Remember to set path labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv") trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc( labeledData) usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData( 'batch', trainX, trainY, testX, testY) del (trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon) #%% 6. Try Different CLassifiers # Set Parameters maxIter = 1000 tolerance = 1e-3 # Import models import sklearn.linear_model as lm from sklearn.svm import SVC svm = SVC(kernel='rbf') from sklearn.svm import LinearSVC svmLinear = LinearSVC() from sklearn.gaussian_process import GaussianProcessClassifier
[milestone 4 - Final Comparison] Compare all methods (at least the 3 milestones) you used throughout the semester using 10 re-runs of a 10-fold cross-validation and perform a suitable statistical test to assess whether one of those performs significantly better than the others. """ #%% 1. Data reading and preprocessing import pandas as pd from preprocessing import preproc, setUsedData # Read data # Remember to set path labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv") trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc( labeledData) usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData( 'all', trainXCon, trainY, testXCon, testY) #usedTrainXFull = trainX #usedTestXFull = testX # Binary binary = False if binary: type1 = 1 type2 = 2 usedTrainX = usedTrainX[(usedTrainY == type1) | (usedTrainY == type2)] usedTrainY = usedTrainY[(usedTrainY == type1) | (usedTrainY == type2)] usedTestX = usedTestX[(usedTestY == type1) | (usedTestY == type2)] usedTestY = usedTestY[(usedTestY == type1) | (usedTestY == type2)] del (trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon)
and perform a suitable statistical test to assess whether one of those performs significantly better than the others. """ #%% 1. Data reading and preprocessing import pandas as pd from preprocessing import preproc, setUsedData # Read data # Remember to set path labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv") trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon = preproc( labeledData) trainBatchSize = 500 testBatchSize = 100 usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData( 'batch', trainXCon, trainY, testXCon, testY, trainBatchSize, testBatchSize) #%% 2. Gaussian Procecss # Hyperparameters are automatically optimized! import numpy as np from sklearn.gaussian_process import GaussianProcessClassifier from sklearn.gaussian_process.kernels import ConstantKernel, RBF, Matern, Exponentiation from sklearn import tree cartTree = tree.DecisionTreeClassifier() from sklearn.ensemble import BaggingClassifier cartTree_bagging = BaggingClassifier(cartTree, max_samples=0.7, max_features=1.0) # Full kernel list: # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.gaussian_process kernelList = [
#%% 2.1 K-Means # sklearn.cluster.KMeans # > http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html # K-means Clustering # > http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html import numpy as np from sklearn.cluster import KMeans estimators = [('k_means_iris_7', KMeans(n_clusters=7))] #('k_means_iris_3', KMeans(n_clusters=3)), #('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1, # init='random'))] fignum = 1 titles = ['7 clusters'] usedTrainX, usedTrainY, usedTestX, usedTestY, usedTitle = setUsedData( 'batch', trainX, trainY, testX, testY, trainBatchSize, testBatchSize) for name, est in estimators: est.fit(usedTrainX) labels = est.labels_ # acc = np.sum(labels==usedTrainY)/trainBatchSize # Order is different! #%% 2.2 K_Means Visualization import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D # Compress 4 binary area features into single 'discrete' feature for visualization def compressWildArea(usedTrainX): usedTrainX['Wilderness_Area'] = usedTrainX['Wilderness_Area1']*0 +\