Esempio n. 1
0
def main():
    """Train a feature-selected polynomial SVM and write a submission file.

    Steps:
      1. Read ``data/train.csv`` and ``data/test.csv`` located next to this file.
      2. Derive extra columns with the project helpers (``family``, ``child``,
         ``fare2``, ``nameinfo``, ...), identically for train and test.
      3. Optionally one-hot encode the categorical columns.
      4. Fill missing ages with the training mean; every other missing value
         becomes -1.
      5. Keep the columns an ``ExtraTreesClassifier`` rates above 0.02
         importance, grid-search an ``SVC`` on them, refit the best
         configuration and predict the test set.
      6. Write ``PassengerId,Survived`` rows under ``results/``.
    """
    base_dir = os.path.dirname(__file__)
    trainfilename = os.path.join(base_dir, 'data', 'train.csv')
    testfilename = os.path.join(base_dir, 'data', 'test.csv')

    train_X = pd.read_csv(trainfilename)
    print("Basic info on training data")
    print(len(train_X))
    print(len(train_X.columns))
    print(train_X.columns)

    # Select the label by name (it is dropped by name right below) instead of
    # relying on it being the second column of the CSV.
    target = train_X['Survived'].values.astype(float)
    train_X = train_X.drop('Survived', axis=1)

    test_X = pd.read_csv(testfilename)
    # Remember the ids now: later the frame becomes a bare ndarray restricted
    # to the selected features, which need not include PassengerId.
    passenger_ids = test_X['PassengerId'].values

    # Binary / numeric derived features, computed the same way on both frames.
    for frame in (train_X, test_X):
        frame["has_family"] = frame.apply(family, axis=1)
        frame["child"] = frame.apply(child, axis=1)
        frame["smallchild"] = frame.apply(smallchild, axis=1)
        frame["familysize"] = frame.apply(familysize, axis=1)
        frame["Sex"] = frame["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

    # Categorical features. "nameinfo" must be derived before "Name" is
    # overwritten by nameparser(). fare2 receives the whole row here.
    for frame in (train_X, test_X):
        frame["Embarked"] = frame["Embarked"].apply(selectembarkment)
        frame["fare2"] = frame.apply(fare2, axis=1)
        frame["nameinfo"] = frame["Name"].apply(nameinfo)
        frame["Ticket"] = frame["Ticket"].apply(DeptCode)
        frame["Name"] = frame["Name"].apply(nameparser)
        frame["Cabin"] = frame["Cabin"].apply(cabinparser)

    # Categorical ==> use one-hot encoding
    onehot = True
    if onehot:
        newcolumns = ["Embarked", "fare2", "nameinfo", "Ticket", "Name", "Cabin"]
        train_X, test_X = one_hot_dataframe(train_X, test_X, newcolumns, replace=True)

    # Mean-impute Age. The previous Imputer.fit_transform() call discarded its
    # result (and passed the test ages as `y`), so missing ages silently fell
    # through to the -1 fill below. The training mean is used for both frames.
    mean_age = train_X['Age'].mean()
    train_X['Age'] = train_X['Age'].fillna(mean_age)
    test_X['Age'] = test_X['Age'].fillna(mean_age)

    print("Finished reading data")
    train_X = train_X.fillna(-1)
    test_X = test_X.fillna(-1)

    # Debug toggle: dump every training column.
    if False:
        for column in train_X.columns:
            print("{0} {1}".format(column, train_X[column]))

    # NOTE(review): PassengerId is still among the feature columns at this
    # point and therefore takes part in feature selection and training.
    train_X = train_X.values.astype(float)
    test_X = test_X.values.astype(float)

    trees = ExtraTreesClassifier(n_estimators=100, bootstrap=True, oob_score=True)
    trees.fit(train_X, target)
    pd.DataFrame(trees.feature_importances_).plot(kind='bar')
    selected_features = np.where(trees.feature_importances_ > 0.02)[0] #0.005)[0]
    #0.005
    #[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15]
    #0.01
    #[ 0  1  3  4  5  6  7  8  9 10 11 12 13 15]
    #0.02
    #[ 0  1  3  4  5  6  7  8  9 10 12 13 15]
    #0.05
    # [0 1 3 4 8]

    print(selected_features)

    train_selected_X = train_X[:, selected_features]
    test_selected_X = test_X[:, selected_features]

    # SVM: grid-search kernel / degree / gamma with 10-fold CV on accuracy.
    clf = svm.SVC()
    kernels = ['poly'] #, 'rbf', 'sigmoid']
    degs = [2, 3]
    # gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
    gammas = [1e-3]#, 1e-3, 1e-1, 1.]
    gs = GridSearchCV(clf, {'kernel': kernels, 'degree': degs, 'gamma': gammas}, scoring='accuracy', cv=10)
    gs.fit(train_selected_X, target)
    print("Score {0} with params {1}".format(gs.best_score_, gs.best_params_))

    print('fitting the model')
    clf = svm.SVC(**gs.best_params_)
    clf.fit(train_selected_X, target)

    # run model against test data
    predicted_class = clf.predict(test_selected_X)

    # One "PassengerId,Survived" line per test row, using the ids captured
    # before feature selection.
    predicted_class = ["%d,%d" % (pid, label) for pid, label in zip(passenger_ids, predicted_class)]
    current = strftime("%Y%m%d", gmtime())

    csv_io.write_delimited_file(os.path.join(base_dir, 'results', "svm_0.02_solution_est_{0}.csv".format(current)), predicted_class, header=['PassengerId', 'Survived'])

    #0.02
    #Trying sig, rbg, poly with degree 3 on 1e-3 and 1e-4
    # Score 0.760942760943 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 3} all kernels degree 3
    # real    408m13.291s
    #Score on kaggle 0.74163
    # all polynomial 1-4
    # Score 0.772166105499 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 2}
    # real 1077m38.691s
    #Score on kaggle 0.76555
    #0.05 for comparison
    # all polynomial 1-4
    # Score 0.763187429854 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 3}
    # real    340m42.558s
    # Your submission scored 0.74641,
    #Fixed features and implemented one hot coding
    # 0.02 polynomial 2 and 3 with 1e-3
    # real    283m12.476s
    # Score 0.772166105499 with params {'kernel': 'poly', 'gamma': 0.001, 'degree': 2}
    # Your submission scored 0.75598,
    # (the scores above were recorded before missing ages were mean-imputed)
    print('Finished. Exiting.')
Esempio n. 2
0
def main():
    """Train a random forest on engineered passenger features and write a submission.

    Steps:
      1. Read ``data/train.csv`` and ``data/test.csv`` located next to this file.
      2. Derive extra columns with the project helpers, identically for train
         and test.
      3. Fill missing ages with the mean age of passengers sharing the same
         parsed ``Name`` value, computed over train and test together.
      4. Quantile-bin Fare and Age (edges learned on train, reused on test)
         and one-hot encode the categorical columns.
      5. Replace zero/missing fares by the median fare of the passenger class
         and add fixed-width fare-bucket indicator columns.
      6. Dump both processed frames to ``data/``, cross-validate and fit a
         ``RandomForestClassifier``, and write ``PassengerId,Survived`` rows
         under ``results/``.
    """
    current = strftime("%Y%m%d", gmtime())
    base_dir = os.path.dirname(__file__)
    trainfilename = os.path.join(base_dir, 'data', 'train.csv')
    testfilename = os.path.join(base_dir, 'data', 'test.csv')

    train_X = pd.read_csv(trainfilename)
    print("Basic info on training data")
    print(len(train_X))
    print(len(train_X.columns))
    print(train_X.columns)

    # Take the label by name; 'Survived' itself stays in train_X until the
    # processed CSV has been written further down.
    target = train_X['Survived'].values.astype(float)

    test_X = pd.read_csv(testfilename)
    # Captured once, before the column is dropped, for the submission file.
    passenger_ids = test_X['PassengerId'].values

    # Binaries
    for frame in (train_X, test_X):
        frame["has_family"] = frame.apply(family, axis=1)
        frame["child"] = frame.apply(child, axis=1)
        frame["smallchild"] = frame.apply(smallchild, axis=1)
        frame["familysize"] = frame.apply(familysize, axis=1)
        frame["Sex"] = frame["Sex"].apply(lambda sex: 0 if sex == "male" else 1)

    # Computations. "nameinfo" must be derived before "Name" is overwritten
    # by nameparser(). fare2 receives the scalar fare here.
    for frame in (train_X, test_X):
        frame["Embarked"] = frame["Embarked"].apply(selectembarkment)
        frame["fare2"] = frame["Fare"].apply(fare2)
        frame["nameinfo"] = frame["Name"].apply(nameinfo)
        frame["Ticket"] = frame["Ticket"].apply(DeptCode)
        frame["Name"] = frame["Name"].apply(nameparser)
        frame["Cabin"] = frame["Cabin"].apply(cabinparser)

    # Age imputation per parsed Name value, using the mean over train and test
    # combined. The previous Imputer(axis=1) recomputed its statistic inside
    # transform(), so the combined fit had no effect and each frame was filled
    # with its own group mean; values present only in test were skipped.
    all_ages = pd.concat([train_X[['Name', 'Age']], test_X[['Name', 'Age']]])
    mean_age_by_name = all_ages.groupby('Name', sort=False)['Age'].mean()
    for name in all_ages['Name'].dropna().unique():
        print("For name " + str(name))
        fill_value = mean_age_by_name[name]
        for frame in (train_X, test_X):
            in_group = frame['Name'] == name
            # A group without any known age has a NaN mean: leave it alone.
            if not pd.isnull(fill_value):
                frame.loc[in_group, 'Age'] = frame.loc[in_group, 'Age'].fillna(fill_value)
            print(np.std(frame.loc[in_group, 'Age']))
            print(np.mean(frame.loc[in_group, 'Age']))

    train_X["woman_child_man"] = train_X.apply(lambda row: woman_child_or_man(row), axis=1)
    test_X["woman_child_man"] = test_X.apply(lambda row: woman_child_or_man(row), axis=1)

    newcolumns = ["Embarked", "fare2", "nameinfo", "Ticket", "Name", "Cabin"]

    no_bins = 6
    # Discretized features: quantile edges are learned on train and reused on
    # test. include_lowest=True mirrors qcut, which puts the minimum into the
    # first bin; without it a test value equal to the lowest edge becomes NaN
    # and the first bin is labelled differently in train and test.
    if no_bins > 1:
        binned_fare, fare_edges = pd.qcut(train_X.Fare, no_bins, retbins=True)
        train_X["BinnedFare"] = binned_fare
        test_X["BinnedFare"] = pd.cut(test_X.Fare, fare_edges, include_lowest=True)

        binned_age, age_edges = pd.qcut(train_X.Age, no_bins, retbins=True)
        train_X["BinnedAge"] = binned_age
        test_X["BinnedAge"] = pd.cut(test_X.Age, age_edges, include_lowest=True)

        # Only encode the binned columns when they were actually created.
        newcolumns += ["BinnedFare", "BinnedAge"]
    newcolumns.append("woman_child_man")

    train_X, test_X = one_hot_dataframe(train_X, test_X, newcolumns, replace=True)

    print(train_X.columns)
    print(test_X.columns)
    train_X = train_X.drop('PassengerId', axis=1)
    test_X = test_X.drop('PassengerId', axis=1)

    # http://triangleinequality.wordpress.com/2013/05/19/machine-learning-with-python-first-steps-munging/
    #Age through Imputation performed already#

    # Fare #
    #Fare imputation may not help: see http://nbviewer.ipython.org/gist/mwaskom/8224591
    # Treat a fare of 0 as missing, then fill with the median fare of the
    # passenger class over train and test together. groupby() yields a Series
    # indexed by Pclass on every pandas version (pivot_table's `rows=` keyword
    # was removed from later releases).
    for frame in (train_X, test_X):
        frame['Fare'] = frame['Fare'].replace(0, np.nan)
    class_median_fare = pd.concat(
        [train_X[['Pclass', 'Fare']], test_X[['Pclass', 'Fare']]]
    ).groupby('Pclass')['Fare'].median()
    for frame in (train_X, test_X):
        frame['Fare'] = frame['Fare'].fillna(frame['Pclass'].map(class_median_fare))

    # Cabin #
    # NOTE(review): Cabin and Embarked were handed to one_hot_dataframe() with
    # replace=True above; confirm that helper keeps columns of these names.
    train_X.Cabin = train_X.Cabin.fillna('Unknown')
    test_X.Cabin = test_X.Cabin.fillna('Unknown')
    modeEmbarked = mode(pd.concat([train_X, test_X]).Embarked)[0][0]
    train_X.Embarked = train_X.Embarked.fillna(modeEmbarked)
    test_X.Embarked = test_X.Embarked.fillna(modeEmbarked)

    #Farebracket feature array http://www.markhneedham.com/blog/2013/10/30/kaggle-titanic-python-pandas-attempt/
    fare_ceiling = 40
    fare_bracket_size = 10
    # Floor division keeps this an int for range() on Python 2 and 3 alike.
    number_of_price_brackets = fare_ceiling // fare_bracket_size
    for bucket in range(number_of_price_brackets):
        column = "fare_bucket_{0}".format(bucket)
        train_X[column] = train_X["Fare"].apply(lambda fare: fare_in_bucket(fare, fare_bracket_size, bucket))
        test_X[column] = test_X["Fare"].apply(lambda fare: fare_in_bucket(fare, fare_bracket_size, bucket))

    # The processed training dump deliberately still contains 'Survived'.
    train_X.to_csv(os.path.join(base_dir, 'data', "processed_train_data_est_{0}.csv".format(current)))
    test_X.to_csv(os.path.join(base_dir, 'data', "processed_test_data_est_{0}.csv".format(current)))
    train_X = train_X.drop('Survived', axis=1)

    print("Finished reading data")

    #Entry 152 has a non-known fare
    # test_X['Fare'] = test_X['Fare'].fillna(-1)

    # Debug toggle: dump every training column.
    if False:
        for column in train_X.columns:
            print("{0} {1}".format(column, train_X[column]))

    train = train_X.values.astype(float)

    # random forest code
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train, target)
    if True:
        from sklearn import cross_validation
        scores = cross_validation.cross_val_score(forest, train, target, cv=10)
        print(scores)

    # Toggle: replace the forest by a grid-searched ExtraTreesClassifier.
    if False:
        from sklearn.grid_search import GridSearchCV
        forest = ExtraTreesClassifier(bootstrap=True, oob_score=True, random_state=42)
        max_features_choices = [n * 0.1 for n in range(1, 10)]
        n_ests = [100, 200, 500, 1000]
        gs = GridSearchCV(forest, {'max_features': max_features_choices, 'n_estimators': n_ests}, scoring='accuracy', cv=10, n_jobs=-1)
        gs.fit(train, target)
        print("Score {0} with params {1}".format(gs.best_score_, gs.best_params_))

        print('fitting the model')
        forest = ExtraTreesClassifier(**gs.best_params_)
        forest.fit(train, target)

    # run model against test data
    predicted_class = forest.predict(test_X.values.astype(float))

    # One "PassengerId,Survived" line per test row.
    predicted_class = ["%d,%d" % (pid, label) for pid, label in zip(passenger_ids, predicted_class)]

    csv_io.write_delimited_file(os.path.join(base_dir, 'results', "random_forest_solution_est_{0}.csv".format(current)), predicted_class, header=['PassengerId', 'Survived'])

    print('Finished. Exiting.')