# Random-forest experiment (script fragment): normalize the data, train a
# quick sanity-check forest, then cross-validate a larger forest and average
# the per-fold predictions.
#
# NOTE(review): this chunk sits mid-file. Names such as ef, train_data,
# test_data, np, RandomForestClassifier and StratifiedKFold are bound by code
# outside this view — presumably `ef` is a project helper module and the
# sklearn estimators are imported at the top of the file; confirm there.
# Data layout assumed throughout: column 0 is the label, columns 1.. are
# features — TODO confirm against ef.fixdataSVM.

# normalize data frame, remove ticket and name, fix cabin to be just the letter
# in the future, scale the data by the train set and apply that to test set
train_data = ef.fixdataSVM(train_data)
test_data = ef.fixdataSVM(test_data)

# scale the data; ef.scaleData returns the pair (scaled_train, scaled_test)
train_test = ef.scaleData(train_data, test_data)
train_data = train_test[0]
test_data = train_test[1]

# Quick forest on the normalized data.
# np.float was a deprecated alias for the builtin float and is removed in
# NumPy >= 1.24, so the builtin is used directly (identical dtype).
forestResults = []
forest = RandomForestClassifier(n_estimators=81)
forest = forest.fit(train_data[0::, 1::].astype(float),
                    train_data[0::, 0].astype(float))
forestResults = forest.predict(test_data[0::, 1::].astype(float))
print('NF results on training data: ')
print(ef.compare(forest.predict(train_data[0::, 1::]).astype(float),
                 train_data[0::, 0].astype(float)))

'''
Commented-out earlier experiment: split by sex (column 2 == 1 for male) and
cross-validate a per-sex forest. The remainder of this commented-out block
continues past the end of this chunk.

#split into male and female
male_train = train_data[train_data[0::,2] == 1, 0::]
female_train = train_data[train_data[0::,2] == 0, 0::]
#print train_data[0::,2]
#print male_train[0::,2]

#cross validation
cv = KFold(len(train_data), k=5, indices=False)

#do a quick forest, iterate five times to get some idea of the range
for i in range(5):
    forest = RandomForestClassifier(n_estimators=101)
    forest = forest.fit(male_train[0::,1::].astype(np.float),
                        male_train[0::,0].astype(np.float))
    print "the normalized male forest accuracy:"
'''

# Refit on the full training set and predict the test set.
# (No astype here in the original — preserved as-is.)
forest = forest.fit(train_data[0::, 1::], train_data[0::, 0])
print('Predicting')
output = forest.predict(test_data)

# Cross-validate the RF: 15 stratified folds, a fresh 401-tree forest per
# fold, scored by ef.compare on the held-out fold.
cvScore = []
savedForests = []
cv = StratifiedKFold(train_data[0::, 0], 15)
for train, test in cv:
    cvForest = RandomForestClassifier(n_estimators=401).fit(
        train_data[train, 1::].astype(float),
        train_data[train, 0].astype(float))
    savedForests.append(cvForest)
    thisOutput = cvForest.predict(train_data[test, 1::].astype(float))
    cvScore.append(ef.compare(thisOutput.astype(float),
                              train_data[test, 0].astype(float)))

print('CV Scores:')
for score in cvScore:
    print(score)

# Score every fold's forest against the whole training set, then average the
# per-forest 0/1 predictions (mean across forests, per sample).
# np.int, like np.float, is a removed alias for the builtin int.
print('Against the whole training set accuracy:')
sfOutput = []
for s in savedForests:
    thisOutput = s.predict(train_data[0::, 1::].astype(float))
    print(ef.compare(thisOutput.astype(float), train_data[0::, 0].astype(float)))
    sfOutput.append(thisOutput)

averageOutput = []
sfOutput = np.array(sfOutput)
averageOutput = np.mean(sfOutput[0::].astype(int), axis=0)