Beispiel #1
0
def main():
    """Run the Gradient Boosting survival-prediction pipeline.

    Steps, in order:

    1. Read ``train.csv`` and print row/column and label counts.
    2. Convert the string features to floats with ``utils.mapToF``.
    3. Fill in the Age feature with ``utils.AgeModel``.
    4. Plot the data, then normalize it with ``utils.MeanNorm``.
    5. Pick ``learning_rate``/``subsample`` with ``TestGradBoost`` and
       plot the learning curve.
    6. Push ``test.csv`` through the same preprocessing, fit a
       ``GradientBoostingClassifier`` on the training data and hand the
       predictions to ``utils.OutputFile``.

    Takes no arguments and returns ``None``.
    """
    # Every print below is a single-argument print(...) call, which emits
    # the same text under the Python 2 print statement and the Python 3
    # print function.

    # Read training set
    print('')
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Preliminary Statistics
    # Row 0 is excluded from every count (the code treats it as a header
    # row); column 1 holds the '0'/'1' class labels.
    shape = np.shape(train)
    survived_col = train[1:, 1]
    print('Preliminary Statistics:')
    print('%s people. %s features.' % (shape[0] - 1, shape[1] - 2))
    print('%s survivors. %s deceased.\n'
          % ((survived_col == '1').sum(), (survived_col == '0').sum()))

    # Map string features to floats
    print('Mapping Features to Floats.\n')
    dictN = {}  # modified in call (useful for name feature)
    dictC = {}  # modified in call (useful for cabin feature)
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Generate better model for missing Age feature.
    # `means` has one slot per entry of dictN and is reused for the test set
    # below; presumably the last argument marks training (1) vs. test (0)
    # -- confirm against utils.AgeModel.
    means = np.zeros(len(dictN), dtype=np.float64)
    dat, means = utils.AgeModel(dat, dictN, means, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)

    dat = utils.MeanNorm(dat)

    # ML algorithms
    print("Choosing best parameters for Gradient Boosting algorithm:")
    optim = TestGradBoost(dat, lab)

    # Plotting Learning Curve
    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Read in test set
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)

    # Map to floats, reusing the dictionaries built from the training set
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)

    # Make better prediction for missing Age Features
    testF, means = utils.AgeModel(testF, dictN, means, 0)

    # NOTE(review): the test set goes through its own MeanNorm call,
    # separate from the training set -- confirm both end up on one scale.
    testF = utils.MeanNorm(testF)

    # Make prediction; optim is indexed as (learning_rate, subsample)
    print("Making Prediction\n")
    clf = GradientBoostingClassifier(learning_rate=optim[0],
                                     subsample=optim[1])
    pred = clf.fit(dat, lab).predict(testF)

    # Now output prediction: passes the first two header cells of the
    # training file and the first cell of the first test data row.
    # The trailing 2 is presumably the algorithm selector -- confirm
    # against utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 2)

    print("Done")
Beispiel #2
0
def main():
    """Run the three-classifier voting pipeline (RF, GB, SVM).

    Steps, in order:

    1. Read ``train.csv`` and print row/column and label counts.
    2. Convert the string features to floats with ``utils.mapToF``.
    3. Fill in the Age feature with ``utils.AgeModel2``.
    4. Plot the data, then normalize it with ``utils.MeanNorm``.
    5. Tune Random Forest, Gradient Boosting and SVM parameters with
       ``TestRandForest``, ``TestGradBoost`` and ``TestSVM``.
    6. Push ``test.csv`` through the same preprocessing, fit the three
       classifiers on the training data, take the per-row most common
       prediction and hand it to ``utils.OutputFile``.

    Takes no arguments and returns ``None``.
    """
    # Every print below is a single-argument print(...) call, which emits
    # the same text under the Python 2 print statement and the Python 3
    # print function.

    # Read training set
    print('')
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Preliminary Statistics
    # Row 0 is excluded from every count (the code treats it as a header
    # row); column 1 holds the '0'/'1' class labels.
    shape = np.shape(train)
    survived_col = train[1:, 1]
    print('Preliminary Statistics:')
    print('%s people. %s features.' % (shape[0] - 1, shape[1] - 2))
    print('%s survivors. %s deceased.\n'
          % ((survived_col == '1').sum(), (survived_col == '0').sum()))

    # Map string features to floats
    print('Mapping Features to Floats.\n')
    dictN = {}  # modified in call (useful for name feature)
    dictC = {}  # modified in call (useful for cabin feature)
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Generate better model for missing Age feature.
    # Buffers are sized by the number of rows whose column 2 is not -1.0
    # (presumably -1.0 marks a missing age -- confirm against utils.mapToF).
    mask = dat[:, 2] != -1.0
    n_rows = mask.sum()
    dat2 = np.zeros((n_rows, 9), dtype=np.float64)
    tar2 = np.zeros(n_rows, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # Mean Normalization
    dat = utils.MeanNorm(dat)

    # ML algorithms
    print("Choosing best parameters for Random Forest algorithm:")
    optimRF = TestRandForest(dat, lab)
    print("Choosing best parameters for Gradient Boosting algorithm:")
    optimGB = TestGradBoost(dat, lab)
    print("Choosing best parameters for SVM algorithm:")
    optimSVM = TestSVM(dat, lab)

    # Read in test set
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)

    # Map to floats, reusing the dictionaries built from the training set
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)

    # Make better prediction for missing Age Features
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)

    # Mean Normalization
    # NOTE(review): the test set goes through its own MeanNorm call,
    # separate from the training set -- confirm both end up on one scale.
    testF = utils.MeanNorm(testF)

    # One column of predictions per classifier: RF, GB, SVM
    predA = np.zeros((len(testF), 3), dtype=np.float64)

    print("Making Prediction for RF:")
    # scikit-learn >= 0.18 rejects an integer min_samples_split below 2.
    # 2 is the smallest valid value and splits exactly the same nodes,
    # because a node holding a single sample can never be split.
    clf = RandomForestClassifier(n_estimators=optimRF[0],
                                 max_features=optimRF[1],
                                 min_samples_split=2)
    predA[:, 0] = clf.fit(dat, lab).predict(testF)

    print("Making Prediction for GB:")
    clf = GradientBoostingClassifier(learning_rate=optimGB[0],
                                     subsample=optimGB[1])
    predA[:, 1] = clf.fit(dat, lab).predict(testF)

    print("Making Prediction for SVM:\n")
    clf = svm.SVC(C=optimSVM)
    predA[:, 2] = clf.fit(dat, lab).predict(testF)

    # Choose the prediction with the most votes in each row.
    # NOTE(review): the shape of the array returned by stats.mode depends
    # on the SciPy version -- confirm utils.OutputFile accepts it.
    pred = stats.mode(predA, axis=1)[0]

    # Now output prediction: passes the first two header cells of the
    # training file and the first cell of the first test data row.
    # The trailing 3 is presumably the algorithm selector -- confirm
    # against utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 3)

    print("Done")
Beispiel #3
0
def main():
    """Run the SVM survival-prediction pipeline.

    Steps, in order:

    1. Read ``train.csv`` and print row/column and label counts.
    2. Convert the string features to floats with ``utils.mapToF``.
    3. Fill in the Age feature with ``utils.AgeModel``.
    4. Plot the data, then normalize it with ``utils.MeanNorm``.
    5. Pick the SVM ``C`` parameter with ``TestSVM`` and plot the
       learning curve.
    6. Push ``test.csv`` through the same preprocessing, fit an
       ``svm.SVC`` on the training data and hand the predictions to
       ``utils.OutputFile``.

    Takes no arguments and returns ``None``.
    """
    # Every print below is a single-argument print(...) call, which emits
    # the same text under the Python 2 print statement and the Python 3
    # print function.

    # Read training set
    print('')
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Preliminary Statistics
    # Row 0 is excluded from every count (the code treats it as a header
    # row); column 1 holds the '0'/'1' class labels.
    n_rows, n_cols = np.shape(train)[0], np.shape(train)[1]
    n_survived = (train[1:, 1] == '1').sum()
    n_deceased = (train[1:, 1] == '0').sum()
    print('Preliminary Statistics:')
    print('%s people. %s features.' % (n_rows - 1, n_cols - 2))
    print('%s survivors. %s deceased.\n' % (n_survived, n_deceased))

    # Map string features to floats
    print('Mapping Features to Floats.\n')
    dictN = {}  # modified in call (useful for name feature)
    dictC = {}  # modified in call (useful for cabin feature)
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Generate better model for missing Age feature.
    # `means` has one slot per entry of dictN and is reused for the test set
    # below; presumably the last argument marks training (1) vs. test (0)
    # -- confirm against utils.AgeModel.
    means = np.zeros(len(dictN), dtype=np.float64)
    dat, means = utils.AgeModel(dat, dictN, means, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)

    dat = utils.MeanNorm(dat)

    # ML algorithms
    print("Choosing best parameters for SVM algorithm:")
    optim = TestSVM(dat, lab)

    # Plotting Learning Curve
    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Read in test set
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)

    # Map to floats, reusing the dictionaries built from the training set
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)

    # Make better prediction for missing Age Features
    testF, means = utils.AgeModel(testF, dictN, means, 0)

    # NOTE(review): the test set goes through its own MeanNorm call,
    # separate from the training set -- confirm both end up on one scale.
    testF = utils.MeanNorm(testF)

    # Make prediction; the value returned by TestSVM is used directly as C
    print("Making Prediction\n")
    clf = svm.SVC(C=optim)
    pred = clf.fit(dat, lab).predict(testF)

    # Now output prediction: passes the first two header cells of the
    # training file and the first cell of the first test data row.
    # The trailing 1 is presumably the algorithm selector -- confirm
    # against utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 1)

    print("Done")
Beispiel #4
0
def main():
    """Run the Random Forest survival-prediction pipeline.

    Steps, in order:

    1. Read ``train.csv`` and print row/column and label counts.
    2. Convert the string features to floats with ``utils.mapToF``.
    3. Fill in the Age feature with ``utils.AgeModel2``.
    4. Plot the data (no mean normalization is applied in this variant).
    5. Pick ``n_estimators``/``max_features`` with ``TestRandForest``,
       plot the learning curve and run ``whereFailing``.
    6. Push ``test.csv`` through the same preprocessing, fit a
       ``RandomForestClassifier`` on the training data and hand the
       predictions to ``utils.OutputFile``.

    Takes no arguments and returns ``None``.
    """
    # Every print below is a single-argument print(...) call, which emits
    # the same text under the Python 2 print statement and the Python 3
    # print function.

    # Read training set
    print('')
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Preliminary Statistics
    # Row 0 is excluded from every count (the code treats it as a header
    # row); column 1 holds the '0'/'1' class labels.
    shape = np.shape(train)
    survived_col = train[1:, 1]
    print('Preliminary Statistics:')
    print('%s people. %s features.' % (shape[0] - 1, shape[1] - 2))
    print('%s survivors. %s deceased.\n'
          % ((survived_col == '1').sum(), (survived_col == '0').sum()))

    # Map string features to floats
    print('Mapping Features to Floats.\n')
    dictN = {}  # modified in call (useful for name feature)
    dictC = {}  # modified in call (useful for cabin feature)
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Generate better model for missing Age feature.
    # Buffers are sized by the number of rows whose column 2 is not -1.0
    # (presumably -1.0 marks a missing age -- confirm against utils.mapToF).
    mask = dat[:, 2] != -1.0
    n_rows = mask.sum()
    dat2 = np.zeros((n_rows, 9), dtype=np.float64)
    tar2 = np.zeros(n_rows, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # ML algorithms
    print("Choosing best parameters for Random Forest algorithm:")
    optim = TestRandForest(dat, lab)

    # Plotting Learning Curve
    print('')
    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Where is algorithm failing?
    print("Where is algorithm failing:\n")
    whereFailing(dat, lab, optim)

    # Read in test set
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)

    # Map to floats, reusing the dictionaries built from the training set
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)

    # Make better prediction for missing Age Features
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)

    # Generate scatter plot for test set
    utils.plotData(testF, 0)

    # Make prediction; optim is indexed as (n_estimators, max_features).
    # scikit-learn >= 0.18 rejects an integer min_samples_split below 2.
    # 2 is the smallest valid value and splits exactly the same nodes,
    # because a node holding a single sample can never be split.
    print("Making Prediction\n")
    clf = RandomForestClassifier(n_estimators=optim[0],
                                 max_features=optim[1],
                                 min_samples_split=2)
    pred = clf.fit(dat, lab).predict(testF)

    # Now output prediction: passes the first two header cells of the
    # training file and the first cell of the first test data row.
    # The trailing 0 is presumably the algorithm selector -- confirm
    # against utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 0)

    print("Done")
Beispiel #5
0
def main():
    """Train, inspect and apply a Random Forest to the survival data.

    The function reads ``train.csv``, reports basic counts, maps the
    string features to floats (``utils.mapToF``), fills in the Age feature
    (``utils.AgeModel2``) and plots the result. It then tunes the forest
    with ``TestRandForest``, plots the learning curve, runs
    ``whereFailing``, applies the same preprocessing to ``test.csv`` and
    passes the fitted model's predictions to ``utils.OutputFile``.
    No mean normalization is applied in this variant.

    Takes no arguments and returns ``None``.
    """
    # Every print below is a single-argument print(...) call, which emits
    # the same text under the Python 2 print statement and the Python 3
    # print function.

    # Read training set
    print('')
    print('Reading training set...')
    train = utils.ReadFile('train.csv', 1)
    print('Finished reading...\n')

    # Preliminary Statistics
    # Row 0 is excluded from every count (the code treats it as a header
    # row); column 1 holds the '0'/'1' class labels.
    n_people = np.shape(train)[0] - 1
    n_features = np.shape(train)[1] - 2
    n_survived = (train[1:, 1] == '1').sum()
    n_deceased = (train[1:, 1] == '0').sum()
    print('Preliminary Statistics:')
    print('%s people. %s features.' % (n_people, n_features))
    print('%s survivors. %s deceased.\n' % (n_survived, n_deceased))

    # Map string features to floats
    print('Mapping Features to Floats.\n')
    dictN = {}  # modified in call (useful for name feature)
    dictC = {}  # modified in call (useful for cabin feature)
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Generate better model for missing Age feature.
    # Buffers are sized by the number of rows whose column 2 is not -1.0
    # (presumably -1.0 marks a missing age -- confirm against utils.mapToF).
    mask = dat[:, 2] != -1.0
    n_rows = mask.sum()
    dat2 = np.zeros((n_rows, 9), dtype=np.float64)
    tar2 = np.zeros(n_rows, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Preliminary Plots
    print('Generating preliminary scatter plots of data.\n')
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # ML algorithms
    print("Choosing best parameters for Random Forest algorithm:")
    optim = TestRandForest(dat, lab)

    # Plotting Learning Curve
    print('')
    print("Plotting the learning curve\n")
    plotLearningCurve(dat, lab, optim)

    # Where is algorithm failing?
    print("Where is algorithm failing:\n")
    whereFailing(dat, lab, optim)

    # Read in test set
    print("Reading in Test Set\n")
    test = utils.ReadFile('test.csv', 0)

    # Map to floats, reusing the dictionaries built from the training set
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)

    # Make better prediction for missing Age Features
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)

    # Generate scatter plot for test set
    utils.plotData(testF, 0)

    # Make prediction; optim is indexed as (n_estimators, max_features).
    # An integer min_samples_split of 1 raises ValueError on
    # scikit-learn >= 0.18. The minimum accepted value, 2, yields the same
    # trees because a one-sample node has nothing to split.
    print("Making Prediction\n")
    clf = RandomForestClassifier(n_estimators=optim[0],
                                 max_features=optim[1],
                                 min_samples_split=2)
    pred = clf.fit(dat, lab).predict(testF)

    # Now output prediction: passes the first two header cells of the
    # training file and the first cell of the first test data row.
    # The trailing 0 is presumably the algorithm selector -- confirm
    # against utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 0)

    print("Done")
Beispiel #6
0
def main():
    """Predict survival by majority vote of RF, GB and SVM classifiers.

    The function reads ``train.csv``, reports basic counts, maps the
    string features to floats (``utils.mapToF``), fills in the Age feature
    (``utils.AgeModel2``), plots the data and normalizes it
    (``utils.MeanNorm``). It tunes each model with ``TestRandForest``,
    ``TestGradBoost`` and ``TestSVM``, applies the same preprocessing to
    ``test.csv``, fits the three classifiers on the training data, and
    passes the per-row most common prediction to ``utils.OutputFile``.

    Takes no arguments and returns ``None``.
    """
    # Every print below is a single-argument print(...) call, which emits
    # the same text under the Python 2 print statement and the Python 3
    # print function.

    # Read training set
    print("")
    print("Reading training set...")
    train = utils.ReadFile("train.csv", 1)
    print("Finished reading...\n")

    # Preliminary Statistics
    # Row 0 is excluded from every count (the code treats it as a header
    # row); column 1 holds the "0"/"1" class labels.
    n_people = np.shape(train)[0] - 1
    n_features = np.shape(train)[1] - 2
    n_survived = (train[1:, 1] == "1").sum()
    n_deceased = (train[1:, 1] == "0").sum()
    print("Preliminary Statistics:")
    print("%s people. %s features." % (n_people, n_features))
    print("%s survivors. %s deceased.\n" % (n_survived, n_deceased))

    # Map string features to floats
    print("Mapping Features to Floats.\n")
    dictN = {}  # modified in call (useful for name feature)
    dictC = {}  # modified in call (useful for cabin feature)
    dat, dictN, dictC = utils.mapToF(train[1:, :], 0, dictN, dictC)

    # Class labels
    lab = np.array([int(h) for h in train[1:, 1]])

    # Generate better model for missing Age feature.
    # Buffers are sized by the number of rows whose column 2 is not -1.0
    # (presumably -1.0 marks a missing age -- confirm against utils.mapToF).
    mask = dat[:, 2] != -1.0
    n_rows = mask.sum()
    dat2 = np.zeros((n_rows, 9), dtype=np.float64)
    tar2 = np.zeros(n_rows, dtype=np.float64)
    dat, dat2, tar2 = utils.AgeModel2(dat, dat2, tar2, 1)

    # Preliminary Plots
    print("Generating preliminary scatter plots of data.\n")
    utils.PrelimPlots(dat, lab)
    utils.AgePlots(dat)

    # Mean Normalization
    dat = utils.MeanNorm(dat)

    # ML algorithms
    print("Choosing best parameters for Random Forest algorithm:")
    optimRF = TestRandForest(dat, lab)
    print("Choosing best parameters for Gradient Boosting algorithm:")
    optimGB = TestGradBoost(dat, lab)
    print("Choosing best parameters for SVM algorithm:")
    optimSVM = TestSVM(dat, lab)

    # Read in test set
    print("Reading in Test Set\n")
    test = utils.ReadFile("test.csv", 0)

    # Map to floats, reusing the dictionaries built from the training set
    testF, dictN, dictC = utils.mapToF(test[1:, :], 1, dictN, dictC)

    # Make better prediction for missing Age Features
    testF, dat2, tar2 = utils.AgeModel2(testF, dat2, tar2, 0)

    # Mean Normalization
    # NOTE(review): the test set goes through its own MeanNorm call,
    # separate from the training set -- confirm both end up on one scale.
    testF = utils.MeanNorm(testF)

    # One column of predictions per classifier: RF, GB, SVM
    predA = np.zeros((len(testF), 3), dtype=np.float64)

    print("Making Prediction for RF:")
    # An integer min_samples_split of 1 raises ValueError on
    # scikit-learn >= 0.18. The minimum accepted value, 2, yields the same
    # trees because a one-sample node has nothing to split.
    forest = RandomForestClassifier(
        n_estimators=optimRF[0], max_features=optimRF[1], min_samples_split=2
    )
    predA[:, 0] = forest.fit(dat, lab).predict(testF)

    print("Making Prediction for GB:")
    booster = GradientBoostingClassifier(
        learning_rate=optimGB[0], subsample=optimGB[1]
    )
    predA[:, 1] = booster.fit(dat, lab).predict(testF)

    print("Making Prediction for SVM:\n")
    svc = svm.SVC(C=optimSVM)
    predA[:, 2] = svc.fit(dat, lab).predict(testF)

    # Choose the prediction with the most votes in each row.
    # NOTE(review): the shape of the array returned by stats.mode depends
    # on the SciPy version -- confirm utils.OutputFile accepts it.
    pred = stats.mode(predA, axis=1)[0]

    # Now output prediction: passes the first two header cells of the
    # training file and the first cell of the first test data row.
    # The trailing 3 is presumably the algorithm selector -- confirm
    # against utils.OutputFile.
    print("Outputting Prediction\n")
    utils.OutputFile(pred, train[0, :2], test[1, 0], 3)

    print("Done")