Beispiel #1
0
Compare the training time and test time of the milestone 1, 2, and 3 methods
    (plus task 4 - if your team did it).
Use the average (or mode) runtime over 10 re-runs
    and perform a suitable statistical test to assess whether one of those performs significantly better
    than the others w.r.t. efficiency of training and test time.
"""

#%% 1. Data reading and preprocessing
import pandas as pd
from preprocessing import preproc, setUsedData

# Load the labeled training CSV. The path is relative to the working
# directory, so adjust it to your own layout before running.
labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv")

# preproc hands back eight pieces: the train/test features and labels plus
# the *Dis and *Con feature variants (presumably discrete / continuous
# subsets -- confirm in preprocessing.py).
(trainX, trainY, testX, testY,
 trainXDis, testXDis, trainXCon, testXCon) = preproc(labeledData)

# Select the data actually used downstream; 'batch' mode is run here with
# setUsedData's default batch sizes.
(usedTrainX, usedTrainY, usedTestX, usedTestY,
 usedTitle) = setUsedData('batch', trainX, trainY, testX, testY)

# Only the used* variables are needed from here on; drop the intermediates.
del trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon

#%% 6. Try Different Classifiers
# Model imports, gathered in one place
import sklearn.linear_model as lm
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import SVC, LinearSVC

# Parameters (not consumed within this cell; presumably passed to the
# classifiers configured further down -- confirm against the full script)
maxIter = 1000
tolerance = 1e-3

# Support vector machines: an RBF-kernel SVC and a LinearSVC, both otherwise
# left at their library defaults
svm = SVC(kernel='rbf')
svmLinear = LinearSVC()
[milestone 4 - Final Comparison]
    Compare all methods (at least the 3 milestones) you used throughout the semester 
    using 10 re-runs of a 10-fold cross-validation 
        and perform a suitable statistical test 
        to assess whether one of those performs significantly better than the others.
"""
#%% 1. Data reading and preprocessing
import pandas as pd
from preprocessing import preproc, setUsedData

# Load the labeled training CSV. The path is relative to the working
# directory, so adjust it to your own layout before running.
labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv")

# preproc hands back eight pieces: the train/test features and labels plus
# the *Dis and *Con feature variants.
(trainX, trainY, testX, testY,
 trainXDis, testXDis, trainXCon, testXCon) = preproc(labeledData)

# Use the *Con feature variants in 'all' mode for this comparison.
(usedTrainX, usedTrainY, usedTestX, usedTestY,
 usedTitle) = setUsedData('all', trainXCon, trainY, testXCon, testY)
#usedTrainXFull = trainX
#usedTestXFull = testX

# Binary
# When enabled, keep only the samples labeled type1 or type2. Each pair is
# assigned in one statement, so both masks are computed from the labels as
# they were before filtering.
binary = False
if binary:
    type1, type2 = 1, 2
    usedTrainX, usedTrainY = (
        usedTrainX[(usedTrainY == type1) | (usedTrainY == type2)],
        usedTrainY[(usedTrainY == type1) | (usedTrainY == type2)],
    )
    usedTestX, usedTestY = (
        usedTestX[(usedTestY == type1) | (usedTestY == type2)],
        usedTestY[(usedTestY == type1) | (usedTestY == type2)],
    )

# Only the used* variables are needed from here on; drop the intermediates.
del trainX, trainY, testX, testY, trainXDis, testXDis, trainXCon, testXCon
    and perform a suitable statistical test 
    to assess whether one of those performs significantly better than the others.
"""

#%% 1. Data reading and preprocessing
import pandas as pd
from preprocessing import preproc, setUsedData

# Load the labeled training CSV. The path is relative to the working
# directory, so adjust it to your own layout before running.
labeledData = pd.read_csv("../data/kaggle_forest_cover_train.csv")

# preproc hands back eight pieces: the train/test features and labels plus
# the *Dis and *Con feature variants.
(trainX, trainY, testX, testY,
 trainXDis, testXDis, trainXCon, testXCon) = preproc(labeledData)

# Batch sizes handed to setUsedData for the training and test portions.
trainBatchSize, testBatchSize = 500, 100

# Select a 'batch' of the *Con feature variants with the sizes above.
(usedTrainX, usedTrainY, usedTestX, usedTestY,
 usedTitle) = setUsedData('batch', trainXCon, trainY, testXCon, testY,
                          trainBatchSize, testBatchSize)

#%% 2. Gaussian Process
# Hyperparameters are automatically optimized!
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, Matern, Exponentiation
from sklearn import tree
from sklearn.ensemble import BaggingClassifier

# Plain decision tree with library defaults, used as the base learner below.
cartTree = tree.DecisionTreeClassifier()

# Bagging ensemble over the tree: each member is fit on 70% of the samples
# (max_samples=0.7) and all of the features (max_features=1.0).
cartTree_bagging = BaggingClassifier(cartTree, max_samples=0.7, max_features=1.0)
# Full kernel list:
# http://scikit-learn.org/stable/modules/classes.html#module-sklearn.gaussian_process
kernelList = [
Beispiel #4
0
#%% 2.1 K-Means
# References:
#   sklearn.cluster.KMeans
#   > http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
#   K-means Clustering
#   > http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_iris.html
import numpy as np
from sklearn.cluster import KMeans

fignum = 1
titles = ['7 clusters']

# (name, estimator) pairs to fit; only the 7-cluster model is active.
estimators = [('k_means_iris_7', KMeans(n_clusters=7))]
#('k_means_iris_3', KMeans(n_clusters=3)),
#('k_means_iris_bad_init', KMeans(n_clusters=3, n_init=1,
#                                init='random'))]

# Select a 'batch' of the full feature set for clustering.
(usedTrainX, usedTrainY, usedTestX, usedTestY,
 usedTitle) = setUsedData('batch', trainX, trainY, testX, testY,
                          trainBatchSize, testBatchSize)

# Fit every estimator on the training batch; `labels` ends up holding the
# cluster assignments of the last one fitted (fit() returns the estimator).
for name, est in estimators:
    labels = est.fit(usedTrainX).labels_

# acc = np.sum(labels==usedTrainY)/trainBatchSize
# Order is different!

#%% 2.2 K_Means Visualization
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


# Compress 4 binary area features into single 'discrete' feature for visualization
def compressWildArea(usedTrainX):
    usedTrainX['Wilderness_Area'] = usedTrainX['Wilderness_Area1']*0 +\