# NOTE(review): whitespace-mangled paste — this single line fuses (1) the tail
# of an unseen plotting/model-selection function (the plt.* calls ending at
# `return bidx`; its `def` line is outside this view) with (2) module-level
# code that loads train.csv/trainLabels.csv, transposes and range-normalizes
# the features, one-hot encodes the labels into Y (shape [2, m]), and splits
# the columns 60/20/20 into train/validation/test via
# dp.split_train_validation_test. Left byte-identical because the enclosing
# function boundary cannot be reconstructed from here; needs manual reflow
# against the original file.
# NOTE(review): `for idx,y in enumerate(Y.T): y[tY[idx]] = 1` mutates Y
# through row views of Y.T — presumably intentional one-hot fill; verify
# tY holds integer class ids in {0, 1}.
plt.plot(keyRange,valLoss,label='Validation Loss') plt.plot(keyRange,trLoss,label='Training Loss') plt.xlabel(key) plt.ylabel("Cross-entropy Loss") plt.title("Effects of NN Parameters on Cross-Entropy Loss") plt.legend(loc='upper right') plt.show() return bidx # Load the data trainData = '/home/avasbr/Desktop/nnet/train.csv' trainTargets = '/home/avasbr/Desktop/nnet/trainLabels.csv' X = dp.read_csv_file(trainData).T X = dp.normalize_range(X) tY = dp.read_csv_file(trainTargets) Y = np.zeros([2,tY.size]) for idx,y in enumerate(Y.T): y[tY[idx]] = 1 # Split data into training and validation sets idx = dp.split_train_validation_test(X,[0.6,0.2,0.2]) Xtr = X[:,idx[0]] Ytr = Y[:,idx[0]] Xval = X[:,idx[1]] Yval = Y[:,idx[1]] Xte = X[:,idx[2]] Yte = Y[:,idx[2]] d = Xtr.shape[0]
# Load the raw kaggle "london" CSVs (features + training labels).
path = '/home/avasbr/Desktop/kaggle_competitions/london/dataset'
train_data_path = path + '/train.csv'
train_label_path = path + '/trainLabels.csv'
test_data_path = path + '/test.csv'

X_tr = dp.read_csv_file(train_data_path)
X_te = dp.read_csv_file(test_data_path)
y_tr = dp.read_csv_file(train_label_path)

# Sample counts and dimensionality; rows are samples, columns features.
m_tr, d = X_tr.shape
m_te = X_te.shape[0]

# Range-normalize per feature column; (mu, s) is returned so the identical
# transform can presumably be applied to X_te later — confirm downstream.
X_tr, (mu, s) = dp.normalize_range(X_tr, axis=0)

# Earlier sklearn grid search over (C, gamma), kept for reference:
# C_range = 10.0**np.arange(-2,9)
# gamma_range = 10.0**np.arange(-5,4)
# cv = StratifiedKFold(y=y_tr,n_folds=5)
# param_grid = dict(gamma=gamma_range,C=C_range)
# grid = GridSearchCV(svm.SVC(),param_grid=param_grid,cv=cv)
# grid.fit(X_tr,y_tr)
# print(grid.best_estimator_)

# Manual cross-validation of an RBF SVM at fixed hyperparameters.
# NOTE(review): mce is never appended to cv_err within this view — verify the
# append happens below this chunk.
cv_err = []
for idx, (tr_idx, val_idx) in enumerate(dp.cross_val_idx(m_tr)):
    clf = svm.SVC(C=1e6, gamma=0.001)
    clf.fit(X_tr[tr_idx, :], y_tr[tr_idx])
    pred = clf.predict(X_tr[val_idx, :])
    # Misclassification error = 1 - accuracy on the held-out fold.
    mce = 1.0 - np.mean(pred == y_tr[val_idx])
import dataproc as dp
import kaggle_london_utils as klu
import numpy as np
import matplotlib.pyplot as plt

# Visualize per-class distributions of each feature of the kaggle "london"
# dataset as overlaid histograms in an 8x5 subplot grid.
path = '/home/avasbr/datasets/kaggle/london_dataset'
X_tr, y_tr, m_tr, X_te, m_te, d, k = klu.load_london_dataset(path)
X_tr = dp.normalize_range(X_tr)  # normalize the range for everything

# Per-class sample indices: y_tr rows are one-hot class indicators, and
# X_tr is indexed as [feature, sample].
class_0_idx = np.where(y_tr[0] == 1)[0]
class_1_idx = np.where(y_tr[1] == 1)[0]

# look at individual feature distributions for classes
for var_idx in range(d):
    # BUG FIX: matplotlib subplot indices are 1-based — passing 0 on the
    # first iteration raises ValueError. Use var_idx + 1.
    plt.subplot(8, 5, var_idx + 1)
    curr_feat_class_0 = X_tr[var_idx, class_0_idx]
    curr_feat_class_1 = X_tr[var_idx, class_1_idx]
    plt.hist(curr_feat_class_0, bins=30)
    plt.hist(curr_feat_class_1, bins=30)
# NOTE(review): assumed a single show() after filling all subplots; the
# mangled source does not make the loop boundary explicit — confirm.
plt.show()
import utils

# Load the data: training/test features and training labels for the
# kaggle "london" competition.
path = '/home/avasbr/Desktop/kaggle_competitions/london/dataset'
train_data_path = path + '/train.csv'
train_label_path = path + '/trainLabels.csv'
test_data_path = path + '/test.csv'

X_tr = dp.read_csv_file(train_data_path)
X_te = dp.read_csv_file(test_data_path)
y_tr = dp.read_csv_file(train_label_path)

# Dataset sizes: m_tr/m_te samples, d features (rows = samples).
m_tr, d = X_tr.shape
m_te = X_te.shape[0]

# Column-wise range normalization; (mu, s) presumably allows the same
# transform on X_te further down — verify against the rest of the file.
X_tr, (mu, s) = dp.normalize_range(X_tr, axis=0)

# Previous sklearn (C, gamma) grid search, retained for reference:
# C_range = 10.0**np.arange(-2,9)
# gamma_range = 10.0**np.arange(-5,4)
# cv = StratifiedKFold(y=y_tr,n_folds=5)
# param_grid = dict(gamma=gamma_range,C=C_range)
# grid = GridSearchCV(svm.SVC(),param_grid=param_grid,cv=cv)
# grid.fit(X_tr,y_tr)
# print(grid.best_estimator_)

# Hand-rolled K-fold CV of an RBF-kernel SVM with fixed C and gamma.
# NOTE(review): nothing is appended to cv_err in this view — the append is
# presumably below this chunk; confirm.
cv_err = []
for idx, (tr_idx, val_idx) in enumerate(dp.cross_val_idx(m_tr)):
    clf = svm.SVC(C=1e6, gamma=0.001)
    clf.fit(X_tr[tr_idx, :], y_tr[tr_idx])
    pred = clf.predict(X_tr[val_idx, :])
    # Fold misclassification error: 1 minus fraction of correct predictions.
    mce = 1.0 - np.mean(pred == y_tr[val_idx])