Example #1
0
# Clean both splits with the project helper before modelling.
# in the future, scale the data by the train set and apply that to test set
train_data = ef.fixdataSVM(train_data)
test_data = ef.fixdataSVM(test_data)

# scale the data; index 0 of the result is the train split, index 1 the test split
train_test = ef.scaleData(train_data, test_data)
train_data = train_test[0]
test_data = train_test[1]

# Column 0 is used as the label and columns 1+ as the features.
# The builtin `float` replaces the `np.float` alias, which newer NumPy
# releases no longer provide; the resulting dtype is the same.
train_features = train_data[0::, 1::].astype(float)
train_labels = train_data[0::, 0].astype(float)

forest = RandomForestClassifier(n_estimators=81)
forest = forest.fit(train_features, train_labels)
forestResults = forest.predict(test_data[0::, 1::].astype(float))

# Score the fitted forest on the rows it was trained on. The features are
# cast to float before predict(), exactly as they were for fit().
print('NF results on training data: ')
print(ef.compare(forest.predict(train_features).astype(float), train_labels))
# Disabled code: the triple-quoted string below is a bare expression statement,
# so nothing inside it runs. It holds an earlier draft of the male/female split,
# the KFold setup and a copy of the clean/scale/fit/predict steps.
'''
#split into male and female
male_train = train_data[train_data[0::,2] == 1, 0::]
female_train = train_data[train_data[0::,2] == 0, 0::]
#print train_data[0::,2]
#print male_train[0::,2]

#cross validation
cv = KFold(len(train_data), k=5, indices=False)

#do a quick forest, iterate five times to get some idea of the range
for i in range(5):
	forest = RandomForestClassifier(n_estimators=101)
	forest = forest.fit(male_train[0::,1::].astype(np.float),male_train[0::,0].astype(np.float))
#normalize data frame, remove ticket and name, fix cabin to be just the letter
#in the future, scale the data by the train set and apply that to test set
train_data = ef.fixdataSVM(train_data)
test_data = ef.fixdataSVM(test_data)


#scale the data
train_test = ef.scaleData(train_data, test_data)
train_data = train_test[0]
test_data = train_test[1]
forestResults = []
forest = RandomForestClassifier(n_estimators=81)
forest = forest.fit(train_data[0::,1::].astype(np.float),train_data[0::,0].astype(np.float))
forestResults = forest.predict(test_data[0::,1::].astype(np.float))
print 'NF results on training data: '
print ef.compare (forest.predict(train_data[0::,1::]).astype(np.float),train_data[0::,0].astype(np.float))
'''
#split into male and female
# Rows are selected by the value in column 2 (1 -> male_train, 0 -> female_train).
male_train = train_data[train_data[0::, 2] == 1, 0::]
female_train = train_data[train_data[0::, 2] == 0, 0::]
#print train_data[0::,2]
#print male_train[0::,2]

#cross validation
# NOTE(review): the (n, k=..., indices=...) signature looks like the legacy
# sklearn.cross_validation.KFold API -- confirm against the installed
# scikit-learn version before relying on it.
cv = KFold(len(train_data), k=5, indices=False)

# Convert once, outside the loop: column 0 is the label, columns 1+ the features.
# The builtin `float` replaces the `np.float` alias, which newer NumPy
# releases no longer provide.
male_features = male_train[0::, 1::].astype(float)
male_labels = male_train[0::, 0].astype(float)

#do a quick forest, iterate five times to get some idea of the range
for i in range(5):
    forest = RandomForestClassifier(n_estimators=101)
    forest = forest.fit(male_features, male_labels)
    print("the normalized male forest accuracy:")
Example #3
0
# Train on the labelled rows: column 0 is the target, every later column a feature.
training_features = train_data[:, 1:]
training_target = train_data[:, 0]
forest = forest.fit(training_features, training_target)

print('Predicting')
# NOTE(review): the forest is fit on columns 1+ of train_data but predicts on
# every column of test_data; that only lines up if test_data carries no label
# column -- confirm against the code that loads test_data.
output = forest.predict(test_data)


#Cross-Validate the RF
# Convert the feature columns (1+) and the label column (0) once, instead of
# re-converting them for every fold. The builtin `float` replaces the
# `np.float` alias, which newer NumPy releases no longer provide.
cv_features = train_data[0::, 1::].astype(float)
cv_labels = train_data[0::, 0].astype(float)

cvScore = []        # one ef.compare() result per fold
savedForests = []   # the forest fitted on each fold, kept for later re-scoring
cv = StratifiedKFold(train_data[0::, 0], 15)
for train, test in cv:
    # Fit on this fold's training rows, then score on its held-out rows.
    cvForest = RandomForestClassifier(n_estimators=401).fit(
        cv_features[train], cv_labels[train])
    savedForests.append(cvForest)
    thisOutput = cvForest.predict(cv_features[test])
    cvScore.append(ef.compare(thisOutput.astype(float), cv_labels[test]))

print("CV Scores:")
for score in cvScore:
    print(score)

print("Against the whole training set accuracy:")
# Convert the full training matrix once; every saved forest is scored on it.
# The builtins `float` / `int` replace the `np.float` / `np.int` aliases,
# which newer NumPy releases no longer provide.
all_features = train_data[0::, 1::].astype(float)
all_labels = train_data[0::, 0].astype(float)

sfOutput = []
for s in savedForests:
    thisOutput = s.predict(all_features)
    print(ef.compare(thisOutput.astype(float), all_labels))
    sfOutput.append(thisOutput)

# Each row of sfOutput is one forest's predictions, so averaging over axis 0
# gives, for every training row, the mean prediction across all saved forests.
sfOutput = np.array(sfOutput)
averageOutput = np.mean(sfOutput.astype(int), axis=0)