	plt.plot(keyRange,valLoss,label='Validation Loss')
	plt.plot(keyRange,trLoss,label='Training Loss')
	plt.xlabel(key)
	plt.ylabel("Cross-entropy Loss")
	plt.title("Effects of NN Parameters on Cross-Entropy Loss")
	plt.legend(loc='upper right')
	plt.show()

	return bidx # bidx is defined earlier in this (truncated) function

# Implied imports for this snippet (dataproc is a local helper module):
import numpy as np
import matplotlib.pyplot as plt
import dataproc as dp

# Load the data
trainData = '/home/avasbr/Desktop/nnet/train.csv'
trainTargets = '/home/avasbr/Desktop/nnet/trainLabels.csv'
X = dp.read_csv_file(trainData).T
X = dp.normalize_range(X)
tY = dp.read_csv_file(trainTargets)
Y = np.zeros([2,tY.size])
# One-hot encode the labels: column idx of Y gets a 1 in row tY[idx]
for idx,y in enumerate(Y.T):
	y[int(tY[idx])] = 1 # int() guards against float labels from the CSV reader

# Split data into training, validation, and test sets
idx = dp.split_train_validation_test(X,[0.6,0.2,0.2])
Xtr = X[:,idx[0]]
Ytr = Y[:,idx[0]]
Xval = X[:,idx[1]]
Yval = Y[:,idx[1]]
Xte = X[:,idx[2]]
Yte = Y[:,idx[2]]

d = Xtr.shape[0] # input dimensionality (features are rows)
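
# dataproc (dp) is a local helper module that isn't shown on this page. The
# sketch below is only a guess at what dp.split_train_validation_test does,
# inferred from the call above: shuffle the column indices of X and return
# one index array per requested fraction.
def split_train_validation_test(X, fractions):
	m = X.shape[1]
	perm = np.random.permutation(m)
	bounds = np.cumsum([int(f * m) for f in fractions])[:-1]
	return np.split(perm, bounds)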
Example #2

# Implied imports for this snippet (dataproc is a local helper module):
import numpy as np
import dataproc as dp
from sklearn import svm

# Load the data
path = '/home/avasbr/Desktop/kaggle_competitions/london/dataset'
train_data_path = path+'/train.csv'
train_label_path = path+'/trainLabels.csv'
test_data_path = path+'/test.csv'

X_tr = dp.read_csv_file(train_data_path)
X_te = dp.read_csv_file(test_data_path)

y_tr = dp.read_csv_file(train_label_path)
m_tr,d = X_tr.shape
m_te = X_te.shape[0]

X_tr,(mu,s) = dp.normalize_range(X_tr,axis=0)
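
# dp.normalize_range is also part of the local dataproc module; a minimal
# sketch, assuming it rescales each feature to [0,1] along `axis` and returns
# the per-feature offsets and scales. (The first snippet calls it without
# unpacking a tuple, so the real helper may behave differently.)
def normalize_range(X, axis=0):
	mu = X.min(axis=axis)
	s = X.max(axis=axis) - mu
	return (X - mu) / s, (mu, s)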
# C_range = 10.0**np.arange(-2,9)
# gamma_range = 10.0**np.arange(-5,4)
# cv = StratifiedKFold(y=y_tr,n_folds=5)
# param_grid = dict(gamma=gamma_range,C=C_range)
# grid = GridSearchCV(svm.SVC(),param_grid=param_grid,cv=cv)
# grid.fit(X_tr,y_tr)

# print(grid.best_estimator_)
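# (Note: StratifiedKFold(y=..., n_folds=...) is the pre-0.18
# sklearn.cross_validation API; newer releases use
# StratifiedKFold(n_splits=...).split(X_tr, y_tr).)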

cv_err = []
for idx,(tr_idx,val_idx) in enumerate(dp.cross_val_idx(m_tr)):
	clf = svm.SVC(C=1e6,gamma=0.001)
	clf.fit(X_tr[tr_idx,:],y_tr[tr_idx])
	pred = clf.predict(X_tr[val_idx,:])
	mce = 1.0-np.mean(pred==y_tr[val_idx]) # misclassification error on this fold
	cv_err.append(mce)
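
# dp.cross_val_idx is another dataproc helper; a minimal sketch, assuming it
# yields (train_idx, validation_idx) pairs for k-fold cross-validation over
# m samples (the fold count here is a guess):
def cross_val_idx(m, n_folds=5):
	perm = np.random.permutation(m)
	folds = np.array_split(perm, n_folds)
	for i in range(n_folds):
		tr_idx = np.hstack(folds[:i] + folds[i+1:])
		yield tr_idx, folds[i]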
Example #3
import dataproc as dp
import kaggle_london_utils as klu
import numpy as np
import matplotlib.pyplot as plt

path='/home/avasbr/datasets/kaggle/london_dataset'
X_tr,y_tr,m_tr,X_te,m_te,d,k = klu.load_london_dataset(path)
X_tr = dp.normalize_range(X_tr) # normalize the range for everything

# look at individual feature distributions for classes
class_0_idx = np.where(y_tr[0]==1)[0]
class_1_idx = np.where(y_tr[1]==1)[0]
for var_idx in range(d):
	plt.subplot(8,5,var_idx+1) # subplot indices are 1-based
	curr_feat_class_0 = X_tr[var_idx,class_0_idx]
	curr_feat_class_1 = X_tr[var_idx,class_1_idx]
	plt.hist(curr_feat_class_0,bins=30)
	plt.hist(curr_feat_class_1,bins=30)
plt.show()
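
# kaggle_london_utils (klu) is a local module as well. A rough sketch of
# load_london_dataset, assuming it wraps the same CSV files used in the
# earlier snippets and one-hot encodes the labels; the file names come from
# those snippets, everything else is inferred from how the return values are
# used here (features as rows, y_tr one-hot with k=2 classes).
def load_london_dataset(path):
	X_tr = dp.read_csv_file(path + '/train.csv').T
	X_te = dp.read_csv_file(path + '/test.csv').T
	t = dp.read_csv_file(path + '/trainLabels.csv')
	d, m_tr = X_tr.shape
	m_te = X_te.shape[1]
	k = 2
	y_tr = np.zeros([k, m_tr])
	for idx in range(m_tr):
		y_tr[int(t[idx]), idx] = 1
	return X_tr, y_tr, m_tr, X_te, m_te, d, k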