def main():
    """Run the gradient-boosting pipeline from train.csv to a prediction file.

    Steps, in order: read ``train.csv``, print basic counts, convert the
    features to floats with ``utils.mapToF``, model the missing Age feature
    with ``utils.AgeModel``, draw the preliminary plots, mean-normalise,
    choose hyper-parameters with ``TestGradBoost``, plot the learning curve,
    prepare ``test.csv`` the same way, fit a ``GradientBoostingClassifier``
    on the whole training set and pass its predictions to
    ``utils.OutputFile``.

    Takes no arguments and returns None.
    """
    # Every print passes one pre-formatted string: that emits exactly the
    # same text under the Python 2 print statement and is also valid when
    # print is a function.
    print("")
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Row 0 is excluded from every data slice below (presumably the CSV
    # header - confirm against utils.ReadFile); column 1 holds the '1'/'0'
    # survival label.
    print('Preliminary Statistics:')
    shape = np.shape(train)
    print('%s people. %s features.' % (shape[0] - 1, shape[1] - 2))
    outcome = train[1:, 1]
    print('%s survivors. %s deceased.\n'
          % ((outcome == '1').sum(), (outcome == '0').sum()))

    # Map string features to floats. Both dicts are filled by mapToF on the
    # training pass and reused unchanged for the test set further down.
    print('Mapping Features to Floats.\n')
    dictN = {}  # name-feature lookup
    dictC = {}  # cabin-feature lookup
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Model the missing Age feature; `means` has one slot per dictN entry and
    # is handed back by AgeModel so the test pass can reuse it.
    # NOTE(review): the trailing 1/0 flag looks like train/test mode - confirm.
    means = np.zeros(len(dictN), dtype=np.float64)
    dat, means = utils.AgeModel(dat, dictN, means, 1)

    # Preliminary plots use the un-normalised features.
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)

    dat = utils.MeanNorm(dat)

    # Hyper-parameter search; optim[0] is used as learning_rate and optim[1]
    # as subsample below.
    print("Choosing best parameters for Gradient Boosting algorithm:")
    optim = TestGradBoost(dat, lab)

    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Prepare the test set with the same steps as the training set.
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)
    testF, means = utils.AgeModel(testF, dictN, means, 0)
    # NOTE(review): the test set is normalised by its own MeanNorm call;
    # confirm this matches the scaling applied to the training data.
    testF = utils.MeanNorm(testF)

    # Fit on the full training set and predict the test set.
    print("Making Prediction\n")
    clf = GradientBoostingClassifier(learning_rate=optim[0],
                                     subsample=optim[1])
    clf = clf.fit(dat, lab)
    pred = clf.predict(testF)

    # The last argument differs between the pipeline variants (2 here);
    # presumably it selects the output target - confirm in utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 2)
    print("Done")
def main():
    """Run the three-model voting pipeline from train.csv to a prediction file.

    Steps, in order: read ``train.csv``, print basic counts, convert the
    features to floats with ``utils.mapToF``, model the missing Age feature
    with ``utils.AgeModel2``, draw the plots, mean-normalise, choose
    hyper-parameters for a random forest, gradient boosting and an SVM,
    prepare ``test.csv`` the same way, fit each of the three classifiers on
    the whole training set, take the row-wise mode of their predictions and
    pass it to ``utils.OutputFile``.

    Takes no arguments and returns None.
    """
    # Every print passes one pre-formatted string: that emits exactly the
    # same text under the Python 2 print statement and is also valid when
    # print is a function.
    print("")
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Row 0 is excluded from every data slice below (presumably the CSV
    # header - confirm against utils.ReadFile); column 1 holds the '1'/'0'
    # survival label.
    print('Preliminary Statistics:')
    shape = np.shape(train)
    print('%s people. %s features.' % (shape[0] - 1, shape[1] - 2))
    outcome = train[1:, 1]
    print('%s survivors. %s deceased.\n'
          % ((outcome == '1').sum(), (outcome == '0').sum()))

    # Map string features to floats. Both dicts are filled by mapToF on the
    # training pass and reused unchanged for the test set further down.
    print('Mapping Features to Floats.\n')
    dictN = {}  # name-feature lookup
    dictC = {}  # cabin-feature lookup
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Model the missing Age feature. Column 2 equal to -1.0 marks a row
    # without an age, so dat2/tar2 get one row per passenger whose age is
    # known. AgeModel2 hands them back for reuse on the test set.
    # NOTE(review): the 9 columns of dat2 and the trailing 1/0 flag are
    # AgeModel2's contract - confirm there.
    known_age = (dat[:, 2] != -1.0).sum()
    dat2 = np.zeros((known_age, 9), dtype=np.float64)
    tar2 = np.zeros(known_age, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Plots are drawn before normalisation.
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # Mean Normalization
    dat = utils.MeanNorm(dat)

    # Hyper-parameter searches, one per model.
    print("Choosing best parameters for Random Forest algorithm:")
    optimRF = TestRandForest(dat, lab)
    print("Choosing best parameters for Gradient Boosting algorithm:")
    optimGB = TestGradBoost(dat, lab)
    print("Choosing best parameters for SVM algorithm:")
    optimSVM = TestSVM(dat, lab)

    # Prepare the test set with the same steps as the training set.
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)
    # NOTE(review): the test set is normalised by its own MeanNorm call;
    # confirm this matches the scaling applied to the training data.
    testF = utils.MeanNorm(testF)

    # One (progress message, estimator) pair per voter; column i of predA
    # receives the predictions of voter i.
    voters = [
        ("Making Prediction for RF:",
         # min_samples_split=2 is the smallest integer scikit-learn >= 0.18
         # accepts; the previous value of 1 raises an error there.
         RandomForestClassifier(n_estimators=optimRF[0],
                                max_features=optimRF[1],
                                min_samples_split=2)),
        ("Making Prediction for GB:",
         GradientBoostingClassifier(learning_rate=optimGB[0],
                                    subsample=optimGB[1])),
        ("Making Prediction for SVM:\n",
         svm.SVC(C=optimSVM)),
    ]
    predA = np.zeros((len(testF), len(voters)), dtype=np.float64)
    for column, (message, clf) in enumerate(voters):
        print(message)
        predA[:, column] = clf.fit(dat, lab).predict(testF)

    # Choose the prediction with the most votes (row-wise mode).
    # NOTE(review): the shape of the mode result depends on the SciPy
    # version (the keepdims default changed) - confirm what OutputFile
    # expects before upgrading SciPy.
    pred = stats.mode(predA, axis=1)[0]

    # The last argument differs between the pipeline variants (3 here);
    # presumably it selects the output target - confirm in utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 3)
    print("Done")
def main():
    """Run the SVM pipeline from train.csv to a prediction file.

    Steps, in order: read ``train.csv``, print basic counts, convert the
    features to floats with ``utils.mapToF``, model the missing Age feature
    with ``utils.AgeModel``, draw the preliminary plots, mean-normalise,
    choose the SVM ``C`` value with ``TestSVM``, plot the learning curve,
    prepare ``test.csv`` the same way, fit an ``svm.SVC`` on the whole
    training set and pass its predictions to ``utils.OutputFile``.

    Takes no arguments and returns None.
    """
    # Every print passes one pre-formatted string: that emits exactly the
    # same text under the Python 2 print statement and is also valid when
    # print is a function.
    print("")
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Row 0 is excluded from every data slice below (presumably the CSV
    # header - confirm against utils.ReadFile); column 1 holds the '1'/'0'
    # survival label.
    print('Preliminary Statistics:')
    n_rows, n_cols = np.shape(train)[0], np.shape(train)[1]
    print('%s people. %s features.' % (n_rows - 1, n_cols - 2))
    survived = (train[1:, 1] == '1').sum()
    deceased = (train[1:, 1] == '0').sum()
    print('%s survivors. %s deceased.\n' % (survived, deceased))

    # Map string features to floats. Both dicts are filled by mapToF on the
    # training pass and reused unchanged for the test set further down.
    print('Mapping Features to Floats.\n')
    dictN = {}  # name-feature lookup
    dictC = {}  # cabin-feature lookup
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Model the missing Age feature; `means` has one slot per dictN entry and
    # is handed back by AgeModel so the test pass can reuse it.
    # NOTE(review): the trailing 1/0 flag looks like train/test mode - confirm.
    means = np.zeros(len(dictN), dtype=np.float64)
    dat, means = utils.AgeModel(dat, dictN, means, 1)

    # Preliminary plots use the un-normalised features.
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)

    dat = utils.MeanNorm(dat)

    # Hyper-parameter search; the result is passed straight to SVC as C.
    print("Choosing best parameters for SVM algorithm:")
    optim = TestSVM(dat, lab)

    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Prepare the test set with the same steps as the training set.
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)
    testF, means = utils.AgeModel(testF, dictN, means, 0)
    # NOTE(review): the test set is normalised by its own MeanNorm call;
    # confirm this matches the scaling applied to the training data.
    testF = utils.MeanNorm(testF)

    # Fit on the full training set and predict the test set.
    print("Making Prediction\n")
    clf = svm.SVC(C=optim)
    clf = clf.fit(dat, lab)
    pred = clf.predict(testF)

    # The last argument differs between the pipeline variants (1 here);
    # presumably it selects the output target - confirm in utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 1)
    print("Done")
def main():
    """Run the random-forest pipeline from train.csv to a prediction file.

    Steps, in order: read ``train.csv``, print basic counts, convert the
    features to floats with ``utils.mapToF``, model the missing Age feature
    with ``utils.AgeModel2``, draw the plots, choose hyper-parameters with
    ``TestRandForest``, plot the learning curve, report where the model
    fails via ``whereFailing``, prepare and plot ``test.csv``, fit a
    ``RandomForestClassifier`` on the whole training set and pass its
    predictions to ``utils.OutputFile``.

    No mean normalisation is applied in this variant.

    Takes no arguments and returns None.
    """
    # Every print passes one pre-formatted string: that emits exactly the
    # same text under the Python 2 print statement and is also valid when
    # print is a function.
    print("")
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Row 0 is excluded from every data slice below (presumably the CSV
    # header - confirm against utils.ReadFile); column 1 holds the '1'/'0'
    # survival label.
    print('Preliminary Statistics:')
    shape = np.shape(train)
    print('%s people. %s features.' % (shape[0] - 1, shape[1] - 2))
    outcome = train[1:, 1]
    print('%s survivors. %s deceased.\n'
          % ((outcome == '1').sum(), (outcome == '0').sum()))

    # Map string features to floats. Both dicts are filled by mapToF on the
    # training pass and reused unchanged for the test set further down.
    print('Mapping Features to Floats.\n')
    dictN = {}  # name-feature lookup
    dictC = {}  # cabin-feature lookup
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Model the missing Age feature. Column 2 equal to -1.0 marks a row
    # without an age, so dat2/tar2 get one row per passenger whose age is
    # known. AgeModel2 hands them back for reuse on the test set.
    # NOTE(review): the 9 columns of dat2 and the trailing 1/0 flag are
    # AgeModel2's contract - confirm there.
    mask = dat[:, 2] != -1.0
    dat2 = np.zeros((mask.sum(), 9), dtype=np.float64)
    tar2 = np.zeros(mask.sum(), dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # Hyper-parameter search; optim[0] is used as n_estimators and optim[1]
    # as max_features below.
    print("Choosing best parameters for Random Forest algorithm:")
    optim = TestRandForest(dat, lab)

    # Plotting Learning Curve
    print("")
    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Where is algorithm failing?
    print("Where is algorithm failing:\n")
    whereFailing(dat, lab, optim)

    # Prepare the test set with the same steps as the training set.
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)

    # Generate scatter plot for test set
    utils.plotData(testF, 0)

    # Fit on the full training set and predict the test set.
    # min_samples_split=2 is the smallest integer scikit-learn >= 0.18
    # accepts; the previous value of 1 raises an error there.
    print("Making Prediction\n")
    clf = RandomForestClassifier(n_estimators=optim[0],
                                 max_features=optim[1],
                                 min_samples_split=2)
    clf = clf.fit(dat, lab)
    pred = clf.predict(testF)

    # The last argument differs between the pipeline variants (0 here);
    # presumably it selects the output target - confirm in utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 0)
    print("Done")
def main():
    """Train a random forest on train.csv and output predictions for test.csv.

    The function reads the training file, prints summary counts, maps the
    features to floats (``utils.mapToF``), models the missing Age feature
    (``utils.AgeModel2``), draws the plots, tunes the forest
    (``TestRandForest``), plots the learning curve, calls ``whereFailing``,
    then prepares and plots the test file, fits a
    ``RandomForestClassifier`` on all training rows and passes the
    predictions to ``utils.OutputFile``.

    Features are used without mean normalisation in this variant.

    Takes no arguments and returns None.
    """
    # Each print receives exactly one string, so the emitted text is the
    # same under the Python 2 print statement and the print function.
    print("")
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Row 0 never enters the data slices (presumably the header row -
    # confirm against utils.ReadFile); column 1 is the '1'/'0' label.
    print('Preliminary Statistics:')
    people = np.shape(train)[0] - 1
    feature_count = np.shape(train)[1] - 2
    print('%s people. %s features.' % (people, feature_count))
    survivors = (train[1:, 1] == '1').sum()
    deceased = (train[1:, 1] == '0').sum()
    print('%s survivors. %s deceased.\n' % (survivors, deceased))

    # Map string features to floats; the two dicts are populated here and
    # passed again, unchanged, when the test set is mapped.
    print('Mapping Features to Floats.\n')
    dictN = {}  # name-feature lookup
    dictC = {}  # cabin-feature lookup
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Age model: rows whose column 2 is -1.0 have no age, so the work
    # arrays are sized by the number of rows that do have one.
    # NOTE(review): the width of 9 and the trailing 1/0 flag belong to
    # AgeModel2's contract - confirm there.
    has_age = dat[:, 2] != -1.0
    n_known = has_age.sum()
    dat2 = np.zeros((n_known, 9), dtype=np.float64)
    tar2 = np.zeros(n_known, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # Tune the forest; optim[0] feeds n_estimators and optim[1] feeds
    # max_features when the final model is built.
    print("Choosing best parameters for Random Forest algorithm:")
    optim = TestRandForest(dat, lab)

    # Plotting Learning Curve
    print("")
    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Where is algorithm failing?
    print("Where is algorithm failing:\n")
    whereFailing(dat, lab, optim)

    # Test set: same mapping and age-model steps as for the training set.
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)

    # Generate scatter plot for test set
    utils.plotData(testF, 0)

    # Final model. scikit-learn >= 0.18 rejects an integer
    # min_samples_split below 2, so 2 replaces the previous value of 1.
    print("Making Prediction\n")
    forest = RandomForestClassifier(n_estimators=optim[0],
                                    max_features=optim[1],
                                    min_samples_split=2)
    pred = forest.fit(dat, lab).predict(testF)

    # The trailing 0 differs per pipeline variant; presumably it selects
    # the output target - confirm in utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 0)
    print("Done")
def main():
    """Predict test.csv by majority vote of a random forest, GB and an SVM.

    The function reads the training file, prints summary counts, maps the
    features to floats (``utils.mapToF``), models the missing Age feature
    (``utils.AgeModel2``), draws the plots, mean-normalises, tunes each of
    the three models, prepares the test file the same way, fits the three
    classifiers on all training rows, takes the row-wise mode of their
    predictions and passes it to ``utils.OutputFile``.

    Takes no arguments and returns None.
    """
    # Each print receives exactly one string, so the emitted text is the
    # same under the Python 2 print statement and the print function.
    print("")
    print("Reading training set...")
    train = utils.ReadFile("train.csv", 1)
    print("Finished reading...\n")

    # Row 0 never enters the data slices (presumably the header row -
    # confirm against utils.ReadFile); column 1 is the "1"/"0" label.
    print("Preliminary Statistics:")
    people = np.shape(train)[0] - 1
    feature_count = np.shape(train)[1] - 2
    print("%s people. %s features." % (people, feature_count))
    survivors = (train[1:, 1] == "1").sum()
    deceased = (train[1:, 1] == "0").sum()
    print("%s survivors. %s deceased.\n" % (survivors, deceased))

    # Map string features to floats; the two dicts are populated here and
    # passed again, unchanged, when the test set is mapped.
    print("Mapping Features to Floats.\n")
    dictN = {}  # name-feature lookup
    dictC = {}  # cabin-feature lookup
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Age model: rows whose column 2 is -1.0 have no age, so the work
    # arrays are sized by the number of rows that do have one.
    # NOTE(review): the width of 9 and the trailing 1/0 flag belong to
    # AgeModel2's contract - confirm there.
    has_age = dat[:, 2] != -1.0
    n_known = has_age.sum()
    dat2 = np.zeros((n_known, 9), dtype=np.float64)
    tar2 = np.zeros(n_known, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Plots are drawn before normalisation.
    print("Generating preliminary scatter plots of data.\n")
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # Mean Normalization
    dat = utils.MeanNorm(dat)

    # Tune each model separately.
    print("Choosing best parameters for Random Forest algorithm:")
    optimRF = TestRandForest(dat, lab)
    print("Choosing best parameters for Gradient Boosting algorithm:")
    optimGB = TestGradBoost(dat, lab)
    print("Choosing best parameters for SVM algorithm:")
    optimSVM = TestSVM(dat, lab)

    # Test set: same mapping and age-model steps as for the training set.
    print("Reading in Test Set\n")
    test = utils.ReadFile("test.csv", 0)
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)
    # NOTE(review): the test set is normalised by its own MeanNorm call;
    # confirm this matches the scaling applied to the training data.
    testF = utils.MeanNorm(testF)

    # One column of votes per model: 0 = RF, 1 = GB, 2 = SVM.
    predA = np.zeros((len(testF), 3), dtype=np.float64)

    # scikit-learn >= 0.18 rejects an integer min_samples_split below 2,
    # so 2 replaces the previous value of 1.
    print("Making Prediction for RF:")
    forest = RandomForestClassifier(n_estimators=optimRF[0],
                                    max_features=optimRF[1],
                                    min_samples_split=2)
    predA[:, 0] = forest.fit(dat, lab).predict(testF)

    print("Making Prediction for GB:")
    boosted = GradientBoostingClassifier(learning_rate=optimGB[0],
                                         subsample=optimGB[1])
    predA[:, 1] = boosted.fit(dat, lab).predict(testF)

    print("Making Prediction for SVM:\n")
    svc = svm.SVC(C=optimSVM)
    predA[:, 2] = svc.fit(dat, lab).predict(testF)

    # Choose the prediction with the most votes (row-wise mode).
    # NOTE(review): the shape of the mode result depends on the SciPy
    # version (the keepdims default changed) - confirm what OutputFile
    # expects before upgrading SciPy.
    pred = stats.mode(predA, axis=1)[0]

    # The trailing 3 differs per pipeline variant; presumably it selects
    # the output target - confirm in utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 3)
    print("Done")