def main():
    """Train a polynomial SVM on the passenger data and write a submission.

    Pipeline, in order:

    1. Read ``data/train.csv`` and ``data/test.csv`` located next to this
       file.
    2. Add derived columns using this module's helper functions.
    3. Optionally one-hot encode the categorical columns through
       ``one_hot_dataframe``.
    4. Fill missing ages with the mean age, then every remaining gap
       with -1.
    5. Fit an ``ExtraTreesClassifier`` and keep the features whose
       importance exceeds 0.02.
    6. Grid-search an ``svm.SVC`` over the selected features, refit with
       the best parameters, predict the test set and write
       ``results/svm_0.02_solution_est_<YYYYMMDD>.csv``.

    Takes no arguments and returns nothing; all results go to stdout and
    to the submission file.
    """
    base_dir = os.path.dirname(__file__)
    trainfilename = os.path.join(base_dir, 'data', 'train.csv')
    testfilename = os.path.join(base_dir, 'data', 'test.csv')

    train_X = pd.read_csv(trainfilename)
    print("Basic info on training data")
    print(len(train_X))
    print(len(train_X.columns))
    print(train_X.columns)

    # Labels are set aside before the column is removed from the features.
    train_Y = train_X["Survived"]
    train_X = train_X.drop('Survived', axis=1)

    test_X = pd.read_csv(testfilename)
    # Remember the ids now: the submission used to read them back from
    # column 0 of the *selected* feature matrix, which is only the id as
    # long as feature 0 happens to survive the importance threshold.
    passenger_ids = test_X["PassengerId"].values
    # NOTE(review): PassengerId itself is still fed to the models as a
    # feature, exactly as before - confirm whether that is intended.

    # Binaries
    for frame in (train_X, test_X):
        frame["has_family"] = frame.apply(family, axis=1)
        frame["child"] = frame.apply(child, axis=1)
        frame["smallchild"] = frame.apply(smallchild, axis=1)
        frame["familysize"] = frame.apply(familysize, axis=1)
        frame["Sex"] = frame["Sex"].apply(
            lambda sex: 0 if sex == "male" else 1)

    # Categorical features. They are computed once for both modes; the
    # former non-one-hot branch passed ``axis=1`` to ``Series.apply``, which
    # forwards it to the helper as a keyword argument and fails.
    for frame in (train_X, test_X):
        frame["Embarked"] = frame["Embarked"].apply(selectembarkment)
        frame["fare2"] = frame.apply(fare2, axis=1)
        # nameinfo needs the raw name, so it runs before "Name" is replaced.
        frame["nameinfo"] = frame["Name"].apply(nameinfo)
        frame["Ticket"] = frame["Ticket"].apply(DeptCode)
        frame["Name"] = frame["Name"].apply(nameparser)
        frame["Cabin"] = frame["Cabin"].apply(cabinparser)

    # Categorical ==> use one-hot encoding
    onehot = True
    if onehot:
        newcolumns = ["Embarked", "fare2", "nameinfo",
                      "Ticket", "Name", "Cabin"]
        train_X, test_X = one_hot_dataframe(
            train_X, test_X, newcolumns, replace=True)

    # Mean-impute the missing ages. The previous
    # ``imp.fit_transform(train_X['Age'], test_X['Age'])`` threw its result
    # away (and passed the test ages as ``y``), so missing ages silently
    # became -1 below. The mean is taken over train and test together.
    mean_age = pd.concat([train_X['Age'], test_X['Age']]).mean()
    train_X['Age'] = train_X['Age'].fillna(mean_age)
    test_X['Age'] = test_X['Age'].fillna(mean_age)
    print("Finished reading data")

    # Any other missing value is flagged with -1.
    train_X = train_X.fillna(-1)
    test_X = test_X.fillna(-1)

    dump_columns = False  # flip on to inspect every column while debugging
    if dump_columns:
        for column in train_X.columns:
            print("{0} {1}".format(column, train_X[column]))

    train_matrix = train_X.values.astype(float)
    test_matrix = test_X.values.astype(float)
    target = train_Y.values.astype(float)

    # Rank the features with extremely randomised trees and keep the
    # important ones.
    trees = ExtraTreesClassifier(
        n_estimators=100, bootstrap=True, oob_score=True)
    trees.fit(train_matrix, target)
    pd.DataFrame(trees.feature_importances_).plot(kind='bar')
    # Feature indices selected at various thresholds in earlier runs:
    #   0.005 -> [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15]
    #   0.01  -> [ 0 1 3 4 5 6 7 8 9 10 11 12 13 15]
    #   0.02  -> [ 0 1 3 4 5 6 7 8 9 10 12 13 15]
    #   0.05  -> [0 1 3 4 8]
    selected_features = np.where(trees.feature_importances_ > 0.02)[0]
    print(selected_features)
    train_selected_X = train_matrix[:, selected_features]
    test_selected_X = test_matrix[:, selected_features]

    # SVM hyper-parameter search
    clf = svm.SVC()
    kernels = ['poly']  # also tried: 'rbf', 'sigmoid'
    degs = [2, 3]
    # also tried: 1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.
    gammas = [1e-3]
    gs = GridSearchCV(
        clf,
        {'kernel': kernels, 'degree': degs, 'gamma': gammas},
        scoring='accuracy',
        cv=10)
    gs.fit(train_selected_X, target)
    print("Score {0} with params {1}".format(gs.best_score_, gs.best_params_))

    print('fitting the model')
    clf = svm.SVC(**gs.best_params_)
    clf.fit(train_selected_X, target)

    # run model against test data
    predicted_class = clf.predict(test_selected_X)
    submission_rows = ["%d,%d" % (passenger_id, label)
                       for passenger_id, label
                       in zip(passenger_ids, predicted_class)]

    current = strftime("%Y%m%d", gmtime())
    csv_io.write_delimited_file(
        os.path.join(base_dir, 'results',
                     "svm_0.02_solution_est_{0}.csv".format(current)),
        submission_rows,
        header=['PassengerId', 'Survived'])

    # Experiment log (recorded before the age-imputation fix above):
    #
    # Threshold 0.02
    #   Trying sig, rbf, poly with degree 3 on 1e-3 and 1e-4
    #     Score 0.760942760943 with params
    #       {'kernel': 'poly', 'gamma': 0.001, 'degree': 3}
    #       (all kernels, degree 3)
    #     real 408m13.291s
    #     Score on kaggle 0.74163
    #   All polynomial 1-4
    #     Score 0.772166105499 with params
    #       {'kernel': 'poly', 'gamma': 0.001, 'degree': 2}
    #     real 1077m38.691s
    #     Score on kaggle 0.76555
    # Threshold 0.05, for comparison
    #   All polynomial 1-4
    #     Score 0.763187429854 with params
    #       {'kernel': 'poly', 'gamma': 0.001, 'degree': 3}
    #     real 340m42.558s
    #     Submission scored 0.74641
    # Fixed features and implemented one-hot coding
    #   0.02, polynomial 2 and 3 with 1e-3
    #     real 283m12.476s
    #     Score 0.772166105499 with params
    #       {'kernel': 'poly', 'gamma': 0.001, 'degree': 2}
    #     Submission scored 0.75598
    print('Finished. Exiting.')
def main():
    """Train a random forest on the Titanic data and write a submission.

    Pipeline, in order:

    1. Read ``data/train.csv`` and ``data/test.csv`` located next to this
       file.
    2. Add derived columns using this module's helper functions.
    3. Fill missing ages with the mean age of passengers sharing the same
       parsed ``Name`` value.
    4. Optionally bin ``Fare`` and ``Age`` into quantile bins, then one-hot
       encode the categorical columns through ``one_hot_dataframe``.
    5. Replace zero/missing fares with the median fare of the passenger
       class, fill ``Cabin``/``Embarked`` gaps and add fare-bucket columns.
    6. Save the processed frames under ``data/``, fit a
       ``RandomForestClassifier``, print 10-fold cross-validation scores,
       predict the test set and write
       ``results/random_forest_solution_est_<YYYYMMDD>.csv``.

    Takes no arguments and returns nothing; all results go to stdout and
    to the written files.
    """
    current = strftime("%Y%m%d", gmtime())
    base_dir = os.path.dirname(__file__)
    trainfilename = os.path.join(base_dir, 'data', 'train.csv')
    testfilename = os.path.join(base_dir, 'data', 'test.csv')

    train_X = pd.read_csv(trainfilename)
    print("Basic info on training data")
    print(len(train_X))
    print(len(train_X.columns))
    print(train_X.columns)
    # Labels are set aside here; the column itself stays in train_X until
    # the processed data has been saved further down.
    train_Y = train_X["Survived"]

    test_X = pd.read_csv(testfilename)
    # Keep the ids once, up front. The previous code aliased the test frame
    # (``test_X = orig_test_X``) and rebuilt ``orig_test_X.values`` for every
    # single output row.
    passenger_ids = test_X["PassengerId"].values

    # Binaries
    for frame in (train_X, test_X):
        frame["has_family"] = frame.apply(family, axis=1)
        frame["child"] = frame.apply(child, axis=1)
        frame["smallchild"] = frame.apply(smallchild, axis=1)
        frame["familysize"] = frame.apply(familysize, axis=1)
        frame["Sex"] = frame["Sex"].apply(
            lambda sex: 0 if sex == "male" else 1)

    # Computations
    for frame in (train_X, test_X):
        frame["Embarked"] = frame["Embarked"].apply(selectembarkment)
        frame["fare2"] = frame["Fare"].apply(fare2)
        # nameinfo needs the raw name, so it runs before "Name" is replaced.
        frame["nameinfo"] = frame["Name"].apply(nameinfo)
        frame["Ticket"] = frame["Ticket"].apply(DeptCode)
        frame["Name"] = frame["Name"].apply(nameparser)
        frame["Cabin"] = frame["Cabin"].apply(cabinparser)

    # Fill missing ages with the mean age per parsed name, computed over
    # train and test together. The earlier ``Imputer(axis=1)`` loop fitted
    # on the combined ages but, with axis=1, the statistics are recomputed
    # from whatever is passed to ``transform``; it also skipped name values
    # that occur only in the test set.
    known_ages = pd.concat([train_X[["Name", "Age"]],
                            test_X[["Name", "Age"]]])
    mean_age_by_name = known_ages.groupby("Name")["Age"].mean()
    for frame in (train_X, test_X):
        missing_age = frame["Age"].isnull()
        frame.loc[missing_age, "Age"] = (
            frame.loc[missing_age, "Name"].map(mean_age_by_name).values)

    # Per-name diagnostics: spread and mean of the ages, train then test.
    for name in train_X["Name"].unique():
        print("For name " + str(name))
        for frame in (train_X, test_X):
            group_ages = frame.loc[frame["Name"] == name, "Age"]
            print(np.std(group_ages))
            print(np.mean(group_ages))

    # Needs the imputed ages, so it runs after the block above.
    train_X["woman_child_man"] = train_X.apply(woman_child_or_man, axis=1)
    test_X["woman_child_man"] = test_X.apply(woman_child_or_man, axis=1)

    newcolumns = ["Embarked", "fare2", "nameinfo", "Ticket", "Name", "Cabin"]

    # Discretized features
    no_bins = 6
    if no_bins > 1:
        for source, binned in (("Fare", "BinnedFare"), ("Age", "BinnedAge")):
            train_bins, edges = pd.qcut(train_X[source], no_bins, retbins=True)
            train_X[binned] = train_bins
            # include_lowest keeps test values equal to the lowest training
            # edge inside the first bin instead of leaving them unbinned.
            test_X[binned] = pd.cut(test_X[source], edges, include_lowest=True)
            # Only encode the binned columns when they were really created.
            newcolumns.append(binned)
    newcolumns.append("woman_child_man")

    train_X, test_X = one_hot_dataframe(
        train_X, test_X, newcolumns, replace=True)
    print(train_X.columns)
    print(test_X.columns)
    train_X = train_X.drop('PassengerId', axis=1)
    test_X = test_X.drop('PassengerId', axis=1)

    # http://triangleinequality.wordpress.com/2013/05/19/machine-learning-with-python-first-steps-munging/
    # Age: imputation performed already (see above).
    # Fare: imputation may not help, see
    # http://nbviewer.ipython.org/gist/mwaskom/8224591
    # A fare of 0 is treated as missing.
    train_X.Fare = train_X.Fare.map(lambda x: np.nan if x == 0 else x)
    test_X.Fare = test_X.Fare.map(lambda x: np.nan if x == 0 else x)
    # Median fare per passenger class over train and test. groupby is used
    # rather than ``pivot_table(rows=...)`` because the ``rows`` keyword no
    # longer exists in current pandas.
    class_median_fare = (
        pd.concat([train_X, test_X]).groupby('Pclass')['Fare'].median())
    for frame in (train_X, test_X):
        missing_fare = frame["Fare"].isnull()
        frame.loc[missing_fare, "Fare"] = (
            frame.loc[missing_fare, "Pclass"].map(class_median_fare).values)

    # Cabin / Embarked
    # NOTE(review): both columns are also listed in ``newcolumns``; whether
    # they still exist after ``one_hot_dataframe(..., replace=True)`` depends
    # on that helper - confirm.
    train_X.Cabin = train_X.Cabin.fillna('Unknown')
    test_X.Cabin = test_X.Cabin.fillna('Unknown')
    modeEmbarked = mode(pd.concat([train_X, test_X]).Embarked)[0][0]
    train_X.Embarked = train_X.Embarked.fillna(modeEmbarked)
    test_X.Embarked = test_X.Embarked.fillna(modeEmbarked)

    # Fare bracket feature array, see
    # http://www.markhneedham.com/blog/2013/10/30/kaggle-titanic-python-pandas-attempt/
    fare_ceiling = 40
    fare_bracket_size = 10
    # Floor division keeps this an int for range() under true division too.
    number_of_price_brackets = fare_ceiling // fare_bracket_size
    for bucket in range(number_of_price_brackets):
        column = "fare_bucket_{0}".format(bucket)
        train_X[column] = train_X["Fare"].apply(
            lambda fare: fare_in_bucket(fare, fare_bracket_size, bucket))
        test_X[column] = test_X["Fare"].apply(
            lambda fare: fare_in_bucket(fare, fare_bracket_size, bucket))

    # Keep a copy of the processed data; the training copy still carries
    # the Survived column at this point.
    train_X.to_csv(os.path.join(
        base_dir, 'data',
        "processed_train_data_est_{0}.csv".format(current)))
    test_X.to_csv(os.path.join(
        base_dir, 'data',
        "processed_test_data_est_{0}.csv".format(current)))

    train_X = train_X.drop('Survived', axis=1)
    print("Finished reading data")
    # Entry 152 of the test data has no known fare.

    dump_columns = False  # flip on to inspect every column while debugging
    if dump_columns:
        for column in train_X.columns:
            print("{0} {1}".format(column, train_X[column]))

    train = train_X.values.astype(float)
    target = train_Y.values.astype(float)

    # random forest code
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train, target)

    run_cross_validation = True
    if run_cross_validation:
        from sklearn import cross_validation
        scores = cross_validation.cross_val_score(forest, train, target, cv=10)
        print(scores)

    run_grid_search = False
    if run_grid_search:
        from sklearn.grid_search import GridSearchCV
        forest = ExtraTreesClassifier(
            bootstrap=True, oob_score=True, random_state=42)
        max_features_choices = [n * 0.1 for n in range(1, 10)]
        n_ests = [100, 200, 500, 1000]
        gs = GridSearchCV(
            forest,
            {'max_features': max_features_choices, 'n_estimators': n_ests},
            scoring='accuracy',
            cv=10,
            n_jobs=-1)
        gs.fit(train, target)
        print("Score {0} with params {1}".format(
            gs.best_score_, gs.best_params_))
        print('fitting the model')
        forest = ExtraTreesClassifier(**gs.best_params_)
        forest.fit(train, target)

    # run model against test data
    predicted_class = forest.predict(test_X.values.astype(float))
    submission_rows = ["%d,%d" % (passenger_id, label)
                       for passenger_id, label
                       in zip(passenger_ids, predicted_class)]
    csv_io.write_delimited_file(
        os.path.join(base_dir, 'results',
                     "random_forest_solution_est_{0}.csv".format(current)),
        submission_rows,
        header=['PassengerId', 'Survived'])
    print('Finished. Exiting.')