def main():
    """Grid-search a LightGBM classifier and write test-set predictions.

    Steps, in order:

    1. Build the combined frame through the ``prepro`` helpers and Box-Cox
       transform the ``Fare`` column.
    2. Fit a random forest, print the ten most important features and keep
       only the features chosen by ``SelectFromModel``.
    3. Print each selected feature name next to the matching item produced by
       ``calculate_vif_`` (multicollinearity diagnostic).
    4. Grid-search an ``LGBMClassifier`` and print the best score/parameters.
    5. Predict on the test split, write ``../data/output.csv`` and print the
       predictions.
    """
    combined = prepro.get_combined_data()
    print(combined.shape)

    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    # Box-Cox only accepts strictly positive data; the +2 shift presumably
    # protects against zero fares -- TODO confirm against the raw data.
    combined['Fare'] = stats.boxcox(combined['Fare'] + 2)[0]
    combined = prepro.process_embarked(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)

    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with a random forest and keep the ones it selects.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    print(features.sort_values('importance', ascending=False).head(10))

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    # As suspected, many features are causing multicollinearity. Here the
    # variance inflation factor is calculated for the top selected features so
    # the problem can be resolved in the data processing/engineering code
    # (dataPrePro).
    selected_names = features['feature'][model.get_support(indices=True)]
    vif_values = calculate_vif_(pd.DataFrame(train_new))
    for name, vif in zip(selected_names, vif_values):
        print(name, " ", vif)

    param_grid = {
        'boosting_type': ['gbdt', 'dart'],
        'num_leaves': [20, 30, 50, 100],
        'learning_rate': [0.01, 0.1, 1],
        'n_estimators': [20, 40],
        'is_unbalance': [True, False],
    }
    bst = lgb.LGBMClassifier(objective='binary', silent=False)
    gbm = GridSearchCV(bst, param_grid, scoring=accuracy_scorer)
    gbm.fit(train_new, targets)

    print('Best score: {}'.format(gbm.best_score_))
    print('Best parameters: {}'.format(gbm.best_params_))

    output = gbm.predict(test_new).astype(int)

    # NOTE(review): assumes `test` still carries a 'PassengerId' column after
    # preprocessing -- confirm against prepro.recover_train_test_target.
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv', index=False)
    print(output)
def main():
    """Grid-search a random forest and write test-set predictions.

    Steps, in order:

    1. Build the combined frame through the ``prepro`` helpers and Box-Cox
       transform the ``Fare`` column.
    2. Fit an extra-trees model, print the ten most important features and
       keep only the features chosen by ``SelectFromModel``.
    3. Grid-search a ``RandomForestClassifier`` with 5-fold stratified
       cross-validation and print the best score/parameters.
    4. Predict on the test split and write ``../data/output.csv``.
    """
    combined = prepro.get_combined_data()
    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    # Box-Cox only accepts strictly positive data; the +2 shift presumably
    # protects against zero fares -- TODO confirm against the raw data.
    combined['Fare'] = stats.boxcox(combined['Fare'] + 2)[0]
    combined = prepro.process_embarked(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)

    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with extra-trees (seeded for repeatability) and keep the
    # ones SelectFromModel picks.
    clf = ExtraTreesClassifier(n_estimators=250, random_state=0)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    print(features.sort_values('importance', ascending=False).head(10))

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)
    print("train_new shape : ", train_new.shape)

    forest = RandomForestClassifier(max_features='sqrt')
    parameter_grid = {
        'max_depth': [4, 5, 6, 7, 8],
        'n_estimators': [150, 200, 210, 240],
        'criterion': ['gini', 'entropy'],
    }

    # NOTE(review): this is the legacy call form (labels passed to the
    # constructor, `n_folds`). sklearn.model_selection.StratifiedKFold takes
    # `n_splits` instead -- check which module the file imports it from.
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(forest,
                               param_grid=parameter_grid,
                               cv=cross_validation,
                               scoring=accuracy_scorer)
    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)

    # NOTE(review): assumes `test` still carries a 'PassengerId' column after
    # preprocessing -- confirm against prepro.recover_train_test_target.
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv', index=False)
def main():
    """Grid-search linear discriminant analysis and write test predictions.

    Steps, in order:

    1. Build the combined frame through the ``prepro`` helpers, finishing
       with ``prepro.scale_all_features``.
    2. Fit a random forest, print the ten most important features and keep
       only the features chosen by ``SelectFromModel``.
    3. Grid-search the ``LinearDiscriminantAnalysis`` solver with 5-fold
       stratified cross-validation and print the best score/parameters.
    4. Predict on the test split and write ``../data/output.csv``.
    """
    combined = prepro.get_combined_data()
    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    combined = prepro.process_embarked(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    combined = prepro.scale_all_features(combined)

    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with a random forest and keep the ones it selects.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    print(features.sort_values(['importance'], ascending=False).head(10))

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    lda = LinearDiscriminantAnalysis()
    # Only the solver is searched; 'tol' is not part of the grid (values
    # previously listed for it: 0.0001, 0.0005, 0.001).
    parameter_grid = {
        'solver': ['svd', 'lsqr', 'eigen'],
    }

    # NOTE(review): this is the legacy call form (labels passed to the
    # constructor, `n_folds`). sklearn.model_selection.StratifiedKFold takes
    # `n_splits` instead -- check which module the file imports it from.
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(lda,
                               param_grid=parameter_grid,
                               cv=cross_validation,
                               scoring=accuracy_scorer)
    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)

    # NOTE(review): assumes `test` still carries a 'PassengerId' column after
    # preprocessing and scaling -- confirm against the prepro helpers.
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv', index=False)
def main():
    """Compare several classifiers, then fit and apply a stacked ensemble.

    Steps, in order:

    1. Build the combined frame through the ``prepro`` helpers, Box-Cox
       transform the ``Fare`` column and plot its distribution.
    2. Fit a random forest, print the ten most important features and keep
       only the features chosen by ``SelectFromModel``.
    3. Print each selected feature name next to the matching item produced by
       ``calculate_vif_`` (multicollinearity diagnostic).
    4. Score a list of classifiers with ``prepro.compute_score``, plot the
       accuracies as a bar chart and print the top three.
    5. Cross-validate LDA, a random forest and a stacking classifier built
       from both, then fit the stack and write ``../data/output.csv``.
    """
    combined = prepro.get_combined_data()
    print(combined.shape)

    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    # Box-Cox only accepts strictly positive data; the +2 shift presumably
    # protects against zero fares -- TODO confirm against the raw data.
    combined['Fare'] = stats.boxcox(combined['Fare'] + 2)[0]
    combined = prepro.process_embarked(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)

    train, test, targets = prepro.recover_train_test_target(combined)

    # Visual check of the fare distribution in the training split.
    sns.distplot(train['Fare'])
    plt.show()

    # Rank features with a random forest and keep the ones it selects.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    print(features.sort_values('importance', ascending=False).head(10))

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    # As suspected, many features are causing multicollinearity. Here the
    # variance inflation factor is calculated for the top selected features so
    # the problem can be resolved in the data processing/engineering code
    # (dataPrePro).
    selected_names = features['feature'][model.get_support(indices=True)]
    vif_values = calculate_vif_(pd.DataFrame(train_new))
    for name, vif in zip(selected_names, vif_values):
        print(name, " ", vif)

    classifiers = [
        # XGBClassifier() is left out of the comparison.
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
    ]

    # Build the score table in one go: DataFrame.append() was removed from
    # pandas and re-copies the whole frame on every iteration anyway.
    log_cols = ["Classifier", "Accuracy"]
    score_rows = [
        [candidate.__class__.__name__,
         prepro.compute_score(candidate, train_new, targets, "accuracy")]
        for candidate in classifiers
    ]
    log = pd.DataFrame(score_rows, columns=log_cols)

    plt.xlabel('Accuracy')
    plt.title('Classifier Accuracy')

    sns.set_color_codes("muted")
    sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
    plt.show()

    print(log.sort_values('Accuracy', ascending=False).head(3))

    estimators = [
        ('LDA', LinearDiscriminantAnalysis()),
        ('RF', RandomForestClassifier(max_depth=8,
                                      n_estimators=150,
                                      criterion='entropy')),
    ]
    # A VotingClassifier over these estimators is left disabled; stacking is
    # used instead.

    # The stack is built from the two base estimators only, so it must be
    # created before it is appended to the list itself.
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[est for _, est in estimators],
                              meta_classifier=lr)

    print('5-fold cross validation:\n')
    estimators.append(('Stack', sclf))

    for label, estimator in estimators:
        scores = cross_val_score(estimator, train_new, targets,
                                 cv=5, scoring='accuracy')
        print("Accuracy: %0.4f (+/- %0.4f) [%s]"
              % (scores.mean(), scores.std(), label))

    candidate_classifier = sclf
    candidate_classifier.fit(train_new, targets)
    output = candidate_classifier.predict(test_new).astype(int)

    # NOTE(review): assumes `test` still carries a 'PassengerId' column after
    # preprocessing -- confirm against prepro.recover_train_test_target.
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv', index=False)
def main():
    """Grid-search a multi-layer perceptron and write test-set predictions.

    Steps, in order:

    1. Build the combined frame through the ``prepro`` helpers; missing ages
       are filled with the median age of the passenger's
       (Sex, Pclass, Title) group, and all features are scaled at the end.
    2. Fit a random forest and keep only the features chosen by
       ``SelectFromModel``.
    3. Grid-search an ``MLPClassifier`` (initial learning rate and solver)
       with 5-fold stratified cross-validation and print the best
       score/parameters.
    4. Predict on the test split and write ``../data/output.csv``.
    """
    combined = prepro.get_combined_data()
    print(combined.shape)

    combined = prepro.get_titles(combined)

    # Fill missing ages with the median of the matching Sex/Pclass/Title
    # group. A group whose ages are all missing has a NaN median, so its rows
    # stay NaN.
    age_by_group = combined.groupby(['Sex', 'Pclass', 'Title'])['Age']
    combined["Age"] = age_by_group.transform(
        lambda ages: ages.fillna(ages.median()))
    prepro.status("Age")

    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    combined = prepro.process_embarked(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    combined = prepro.scale_all_features(combined)

    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with a random forest and keep the ones it selects.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)

    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    # 'activation' is fixed to 'tanh' below rather than searched over
    # 'identity', 'logistic', 'tanh', 'relu'.
    par_grid = {
        'learning_rate_init': [0.05, 0.01, 0.005, 0.001],
        'solver': ['adam', 'lbfgs', 'sgd'],
    }
    mlp = MLPClassifier(activation='tanh',
                        early_stopping=True,
                        hidden_layer_sizes=(50, 50),
                        learning_rate='constant',
                        max_iter=200,
                        validation_fraction=0.1,
                        warm_start=False)

    # NOTE(review): this is the legacy call form (labels passed to the
    # constructor, `n_folds`). sklearn.model_selection.StratifiedKFold takes
    # `n_splits` instead -- check which module the file imports it from.
    cross_validation = StratifiedKFold(targets, n_folds=5)

    grid_search = GridSearchCV(mlp,
                               param_grid=par_grid,
                               cv=cross_validation,
                               scoring=accuracy_scorer)
    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)

    # NOTE(review): assumes `test` still carries a 'PassengerId' column after
    # preprocessing and scaling -- confirm against the prepro helpers.
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv', index=False)