def main():
    """Grid-search a LightGBM classifier and write its predictions to CSV.

    Steps, in order:

    1. Build the combined feature frame through the ``prepro`` helpers and
       Box-Cox transform the ``Fare`` column.
    2. Fit a 200-tree random forest, print the ten most important features
       and keep only the columns that ``SelectFromModel`` selects.
    3. Print the value ``calculate_vif_`` reports next to each selected
       feature name.
    4. Grid-search an ``LGBMClassifier`` on the reduced training matrix.
    5. Predict on the reduced test matrix, write ``../data/output.csv`` and
       print the predictions.

    Takes no arguments and returns ``None``.
    """
    combined = prepro.get_combined_data()
    print(combined.shape)
    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    # Box-Cox only accepts strictly positive data; the +2 shift presumably
    # guards against zero fares (TODO confirm against the raw data).
    shifted_fare = combined['Fare'] + 2
    # boxcox returns (transformed_data, fitted_lambda); keep the data only.
    combined['Fare'] = stats.boxcox(shifted_fare)[0]
    combined = prepro.process_embarked(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with a random forest, then reuse the fitted forest for
    # importance-based feature selection.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)
    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    print(features.sort_values('importance', ascending=False).head(10))
    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    # As suspected, many features cause multicollinearity. Compute the
    # variance inflation factor of the selected features here so the problem
    # can be addressed in the data processing/engineering code (dataPrePro).
    selected_names = features['feature'][model.get_support(indices=True)]
    vif_values = calculate_vif_(pd.DataFrame(train_new))
    for name, vif in zip(selected_names, vif_values):
        print(name, " ", vif)

    param_grid = {
        'boosting_type': ['gbdt', 'dart'],
        'num_leaves': [20, 30, 50, 100],
        'learning_rate': [0.01, 0.1, 1],
        'n_estimators': [20, 40],
        'is_unbalance': [True, False],
    }

    bst = lgb.LGBMClassifier(objective='binary', silent=False)
    gbm = GridSearchCV(bst, param_grid, scoring=accuracy_scorer)
    gbm.fit(train_new, targets)

    print('Best score: {}'.format(gbm.best_score_))
    print('Best parameters: {}'.format(gbm.best_params_))

    output = gbm.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv',
                                                  index=False)
    print(output)
# Example no. 2
# 0
def main():
    """Grid-search a random forest and write its predictions to CSV.

    Steps, in order:

    1. Build the combined feature frame through the ``prepro`` helpers and
       Box-Cox transform the ``Fare`` column.
    2. Fit a 250-tree ``ExtraTreesClassifier``, print the ten most important
       features and keep only the columns that ``SelectFromModel`` selects.
    3. Grid-search a ``RandomForestClassifier`` over depth, tree count and
       split criterion with 5-fold cross-validation.
    4. Predict on the reduced test matrix and write ``../data/output.csv``.

    Takes no arguments and returns ``None``.
    """
    combined = prepro.get_combined_data()
    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    # Box-Cox only accepts strictly positive data; the +2 shift presumably
    # guards against zero fares (TODO confirm against the raw data).
    shifted_fare = combined['Fare'] + 2
    # boxcox returns (transformed_data, fitted_lambda); keep the data only.
    combined['Fare'] = stats.boxcox(shifted_fare)[0]
    combined = prepro.process_embarked(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with extra-trees, then reuse the fitted model for
    # importance-based feature selection.
    clf = ExtraTreesClassifier(n_estimators=250, random_state=0)
    clf = clf.fit(train, targets)
    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    print(features.sort_values('importance', ascending=False).head(10))

    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)
    print("train_new shape : ", train_new.shape)

    forest = RandomForestClassifier(max_features='sqrt')
    parameter_grid = {
        'max_depth': [4, 5, 6, 7, 8],
        'n_estimators': [150, 200, 210, 240],
        'criterion': ['gini', 'entropy'],
    }

    # An integer cv makes GridSearchCV use stratified k-fold for classifiers.
    # Unlike StratifiedKFold(targets, n_folds=5), this does not depend on the
    # removed sklearn.cross_validation constructor signature.
    grid_search = GridSearchCV(forest,
                               param_grid=parameter_grid,
                               cv=5,
                               scoring=accuracy_scorer)

    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv',
                                                  index=False)
# Example no. 3
# 0
def main():
    """Grid-search linear discriminant analysis and write predictions to CSV.

    Steps, in order:

    1. Build the combined feature frame through the ``prepro`` helpers,
       ending with ``prepro.scale_all_features``.
    2. Fit a 200-tree random forest, print the ten most important features
       and keep only the columns that ``SelectFromModel`` selects.
    3. Grid-search ``LinearDiscriminantAnalysis`` over its ``solver`` option
       with 5-fold cross-validation.
    4. Predict on the reduced test matrix and write ``../data/output.csv``.

    Takes no arguments and returns ``None``.
    """
    combined = prepro.get_combined_data()
    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    combined = prepro.process_embarked(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    combined = prepro.scale_all_features(combined)
    train, test, targets = prepro.recover_train_test_target(combined)

    # Rank features with a random forest, then reuse the fitted forest for
    # importance-based feature selection.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)
    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    print(features.sort_values(['importance'], ascending=False).head(10))
    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    lda = LinearDiscriminantAnalysis()
    parameter_grid = {
        # 'tol': [0.0001, 0.0005, 0.001],
        'solver': ['svd', 'lsqr', 'eigen'],
    }

    # An integer cv makes GridSearchCV use stratified k-fold for classifiers.
    # Unlike StratifiedKFold(targets, n_folds=5), this does not depend on the
    # removed sklearn.cross_validation constructor signature.
    grid_search = GridSearchCV(lda,
                               param_grid=parameter_grid,
                               cv=5,
                               scoring=accuracy_scorer)

    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)
    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv',
                                                  index=False)
# Example no. 4
# 0
def main():
    """Compare several classifiers, then predict with a stacked ensemble.

    Steps, in order:

    1. Build the combined feature frame through the ``prepro`` helpers,
       Box-Cox transform the ``Fare`` column and plot its distribution.
    2. Fit a 200-tree random forest, print the ten most important features
       and keep only the columns that ``SelectFromModel`` selects.
    3. Print the value ``calculate_vif_`` reports next to each selected
       feature name.
    4. Score a list of off-the-shelf classifiers with
       ``prepro.compute_score``, plot the scores and print the top three.
    5. Cross-validate LDA, a random forest and a ``StackingClassifier`` built
       from both (logistic regression as meta-classifier).
    6. Fit the stacked classifier, predict on the reduced test matrix and
       write ``../data/output.csv``.

    Takes no arguments and returns ``None``.
    """
    combined = prepro.get_combined_data()
    print(combined.shape)
    combined = prepro.get_titles(combined)
    combined = prepro.process_age(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    # Box-Cox only accepts strictly positive data; the +2 shift presumably
    # guards against zero fares (TODO confirm against the raw data).
    shifted_fare = combined['Fare'] + 2
    # boxcox returns (transformed_data, fitted_lambda); keep the data only.
    combined['Fare'] = stats.boxcox(shifted_fare)[0]
    combined = prepro.process_embarked(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    train, test, targets = prepro.recover_train_test_target(combined)

    sns.distplot(train['Fare'])
    plt.show()

    # Rank features with a random forest, then reuse the fitted forest for
    # importance-based feature selection.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)
    features = pd.DataFrame()
    features['feature'] = train.columns
    features['importance'] = clf.feature_importances_
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    print(features.sort_values('importance', ascending=False).head(10))
    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    # As suspected, many features cause multicollinearity. Compute the
    # variance inflation factor of the selected features here so the problem
    # can be addressed in the data processing/engineering code (dataPrePro).
    selected_names = features['feature'][model.get_support(indices=True)]
    vif_values = calculate_vif_(pd.DataFrame(train_new))
    for name, vif in zip(selected_names, vif_values):
        print(name, " ", vif)

    classifiers = [
        # XGBClassifier(),
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
    ]

    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0, copied the frame on every iteration and left
    # the Accuracy column with object dtype.
    log_cols = ["Classifier", "Accuracy"]
    log_rows = [
        [candidate.__class__.__name__,
         prepro.compute_score(candidate, train_new, targets, "accuracy")]
        for candidate in classifiers
    ]
    log = pd.DataFrame(log_rows, columns=log_cols)

    plt.xlabel('Accuracy')
    plt.title('Classifier Accuracy')

    sns.set_color_codes("muted")
    sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")
    plt.show()

    print(log.sort_values('Accuracy', ascending=False).head(3))

    estimators = [
        ('LDA', LinearDiscriminantAnalysis()),
        ('RF', RandomForestClassifier(max_depth=8, n_estimators=150,
                                      criterion='entropy')),
    ]

    # ensemble = VotingClassifier(estimators)
    # results = prepro.compute_score(ensemble, train_new, targets, "accuracy")
    # print("Ensemble Accuracy : ", results.mean())

    # Stack the base estimators defined above under a logistic-regression
    # meta-classifier.
    lr = LogisticRegression()
    sclf = StackingClassifier(
        classifiers=[estimator for _, estimator in estimators],
        meta_classifier=lr)

    print('5-fold cross validation:\n')
    estimators.append(('Stack', sclf))
    for label, estimator in estimators:
        scores = cross_val_score(estimator, train_new, targets, cv=5,
                                 scoring='accuracy')
        print("Accuracy: %0.4f (+/- %0.4f) [%s]"
              % (scores.mean(), scores.std(), label))

    candidate_classifier = sclf
    candidate_classifier.fit(train_new, targets)
    output = candidate_classifier.predict(test_new).astype(int)

    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv',
                                                  index=False)
# Example no. 5
# 0
def main():
    """Grid-search a multi-layer perceptron and write predictions to CSV.

    Steps, in order:

    1. Build the combined feature frame through the ``prepro`` helpers.
       Missing ``Age`` values are filled with the median age of the row's
       (``Sex``, ``Pclass``, ``Title``) group, and all features are scaled
       with ``prepro.scale_all_features``.
    2. Fit a 200-tree random forest and keep only the columns that
       ``SelectFromModel`` selects.
    3. Grid-search an ``MLPClassifier`` over initial learning rate and solver
       with 5-fold cross-validation.
    4. Predict on the reduced test matrix and write ``../data/output.csv``.

    Takes no arguments and returns ``None``.
    """
    combined = prepro.get_combined_data()
    print(combined.shape)
    combined = prepro.get_titles(combined)
    # Impute Age per group rather than globally. The former
    # `grouped.median()` call was dropped: its result was discarded, and it
    # raises on non-numeric columns in recent pandas.
    combined["Age"] = combined.groupby(
        ['Sex', 'Pclass', 'Title'])['Age'].transform(
            lambda ages: ages.fillna(ages.median()))
    prepro.status("Age")
    combined = prepro.process_names(combined)
    combined = prepro.process_fares(combined)
    combined = prepro.process_embarked(combined)
    combined = prepro.process_cabin(combined)
    combined = prepro.process_sex(combined)
    combined = prepro.process_pclass(combined)
    combined = prepro.process_family(combined)
    combined = prepro.process_ticket(combined)
    combined = prepro.scale_all_features(combined)
    train, test, targets = prepro.recover_train_test_target(combined)

    # Fit a random forest only to drive importance-based feature selection.
    clf = RandomForestClassifier(n_estimators=200)
    clf = clf.fit(train, targets)
    model = SelectFromModel(clf, prefit=True)
    train_new = model.transform(train)
    test_new = model.transform(test)

    # 'activation': ['identity', 'logistic', 'tanh', 'relu']
    par_grid = {
        'learning_rate_init': [0.05, 0.01, 0.005, 0.001],
        'solver': ['adam', 'lbfgs', 'sgd'],
    }

    mlp = MLPClassifier(activation='tanh',
                        early_stopping=True,
                        hidden_layer_sizes=(50, 50),
                        learning_rate='constant',
                        max_iter=200,
                        validation_fraction=0.1,
                        warm_start=False)

    # An integer cv makes GridSearchCV use stratified k-fold for classifiers.
    # Unlike StratifiedKFold(targets, n_folds=5), this does not depend on the
    # removed sklearn.cross_validation constructor signature.
    grid_search = GridSearchCV(mlp,
                               param_grid=par_grid,
                               cv=5,
                               scoring=accuracy_scorer)

    grid_search.fit(train_new, targets)

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

    output = grid_search.predict(test_new).astype(int)

    df_output = pd.DataFrame()
    df_output['PassengerId'] = test['PassengerId']
    df_output['Survived'] = output
    df_output[['PassengerId', 'Survived']].to_csv('../data/output.csv',
                                                  index=False)