Example #1
# NOTE: DataPreProcessor, prepare_data, and Evaluator are project-local helpers
# and must be imported from this repository's own modules.
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


def main():
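    """Run the full Titanic pipeline: load and clean the data, train a set of
    classifiers, ensemble their predictions, and write the results to CSV."""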
    data_train_file_path = 'Data/train.csv'
    data_test_file_path = 'Data/test_with_label.csv'
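    # Per-column handling hints consumed by DataPreProcessor.clean_data below.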
    data_types = {
        'Survived': 'Categorical',
        'Pclass': 'Categorical',
        'Name_Affiliation': 'Categorical',
        'Last_Name': 'Text',
        'First_Name': 'Text',
        'Sex': 'Categorical',
        'Age': 'Numerical',
        'SibSp': 'Numerical',
        'Parch': 'Numerical',
        'Ticket_Code': 'Categorical',
        'Ticket_Number': 'Numerical',
        'Fare': 'Numerical',
        'Cabin_Floor': 'Categorical',
        'Cabin_Rooms': 'Text',
        'Embarked': 'Categorical',
        'Family_Members': 'Numerical',
        'is_Traveling_Alone': 'Categorical',
        'Fare_Per_Passenger': 'Categorical',
        'is_Female_with_Children': 'Categorical'
    }
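    # The test file already contains the Survived label, so the ensemble can be
    # scored against it at the end of the pipeline.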
    print('Loading train data...')
    train_data_df = DataPreProcessor.load_data(data_train_file_path)
    print('Loading test data...')
    test_data_df = DataPreProcessor.load_data(data_test_file_path)

    print('Cleaning and transforming train data...')
    cleaned_train_data_df = DataPreProcessor.clean_data(
        train_data_df, data_types)
    print('Cleaning and transforming test data...')
    cleaned_test_data_df = DataPreProcessor.clean_data(test_data_df,
                                                       data_types)

    print('Saving cleaned train data to file...')
    cleaned_train_data_df.to_csv("clean_train.csv", index=False)
    print('Saving cleaned test data to file...')
    cleaned_test_data_df.to_csv("clean_test.csv", index=False)

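    # Candidate classifiers with hand-picked defaults; the Evaluator combines
    # their individual predictions into a single ensemble prediction.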
    eval_classifiers = {
        'TreeClassifier':
        DecisionTreeClassifier(criterion="gini",
                               splitter="best",
                               max_depth=4,
                               random_state=42),
        'AdaBoost':
        AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                           random_state=42),
        'LogisticRegression':
        # liblinear supports the L1 penalty (the default lbfgs solver does not).
        LogisticRegression(penalty='l1',
                           solver='liblinear',
                           max_iter=10000,
                           random_state=42),
        'RandomForestClassifier':
        RandomForestClassifier(n_estimators=2000,
                               max_depth=4,
                               min_samples_split=10,
                               random_state=42,
                               class_weight={
                                   0: 0.80,
                                   1: 0.20
                               }),
        'GBTrees':
        GradientBoostingClassifier(max_depth=4,
                                   learning_rate=0.1,
                                   n_estimators=2000,
                                   random_state=42,
                                   min_samples_split=10),
        'xgboost':
        XGBClassifier(max_depth=4,
                      n_estimators=1000,
                      random_state=42,
                      learning_rate=0.1),
        'KNN':
        KNeighborsClassifier(n_neighbors=3, p=2),
        'SVM':
        SVC(gamma='auto', tol=1e-3, C=1.5, random_state=42),
        'GBC':
        GradientBoostingClassifier(n_estimators=1000,
                                   max_depth=4,
                                   learning_rate=0.1)
    }

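    # Per-classifier hyperparameter grids, used only when build_models is
    # called with grid_search=True (it is False below).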
    eval_classifiers_params_grid = {
        'TreeClassifier': {
            'max_depth': [4, 5, 6]
        },
        'AdaBoost': {
            'n_estimators': [100, 200, 500, 1000, 2000],
            'learning_rate': [0.2, 0.1, 0.05, 0.01]
        },
        'LogisticRegression': {
            'penalty': ['l1', 'l2']
        },
        'RandomForestClassifier': {
            'n_estimators': [100, 200, 500, 1000, 2000],
            'max_depth': [4, 5, 6],
            'max_features': [0.8, 0.5, 0.2, 0.1]
        },
        'GBTrees': {
            'n_estimators': [100, 500, 1000, 2000],
            'max_depth': [4, 5, 6],
            'max_features': [0.8, 0.5, 0.2, 0.1],
            'learning_rate': [0.2, 0.1, 0.05, 0.01]
        },
        'xgboost': {
            'n_estimators': [100, 500, 1000, 2000],
            'max_depth': [4, 5, 6],
            'learning_rate': [0.2, 0.1, 0.05, 0.01]
        },
        'KNN': {
            'n_neighbors': [2, 3, 4, 5]
        },
        'SVM': {
            'gamma': [0.001, 0.01, 0.1, 1],
            'C': [1, 10, 50, 100, 200]
        },
        'GBC': {
            'n_estimators': [100, 500, 1000, 2000],
            'max_depth': [4, 5, 6, 8],
            'max_features': [0.8, 0.5, 0.2, 0.1],
            'learning_rate': [0.2, 0.1, 0.05, 0.01]
        }
    }

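    # Earlier feature-set experiments are kept commented out for reference;
    # only the last list is active.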
    # features_cols = ['Pclass', 'Sex', 'Age_Intervals', 'Family_Members', 'Fare_Per_Passenger', 'Embarked', 'Name_Affiliation', 'Ticket_Code', 'Cabin_Floor']
    # features_cols = ['Pclass', 'Sex', 'Age_Intervals', 'is_Traveling_Alone', 'Fare_Per_Passenger', 'Embarked', 'Name_Affiliation', 'Cabin_Floor']
    features_cols = [
        'Pclass', 'Sex', 'Age_Intervals', 'Name_Affiliation', 'Cabin_Floor',
        'is_Female_with_Children'
    ]
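    # Multi-valued categorical features are one-hot encoded inside prepare_data.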
    one_hot_encoding_features = ['Name_Affiliation', 'Cabin_Floor', 'Pclass']
    train_X, train_y = prepare_data(
        cleaned_train_data_df,
        class_col='Survived',
        features_cols=features_cols,
        one_hot_encoding_features=one_hot_encoding_features)
    test_X, test_y = prepare_data(
        cleaned_test_data_df,
        class_col='Survived',
        features_cols=features_cols,
        one_hot_encoding_features=one_hot_encoding_features)
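    # The Evaluator wraps training, optional grid search, prediction, and
    # scoring for every classifier defined above.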
    evaluator = Evaluator(train_X, train_y, test_X, test_y, eval_classifiers,
                          eval_classifiers_params_grid)
    # evaluator.select_features(selection_clf=ExtraTreesClassifier(n_estimators=1000, max_depth=4, random_state=42))

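    # Fit every classifier and collect both the per-model predictions and the
    # combined (ensemble) prediction for the test set.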
    all_predictions, final_prediction = evaluator.build_models(
        grid_search=False)
    evaluation_df = evaluator.save_predictions_to_df(all_predictions,
                                                     final_prediction)
    submission_df = evaluator.save_predictions_for_submission(evaluation_df)
    evaluation_df.to_csv("test_evaluation_results.csv", index=False)
    submission_df.to_csv("test_submission.csv", index=False)
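    # Score the ensemble prediction against the true test labels.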
    accuracy = evaluator.evaluate_performance(test_y,
                                              final_prediction,
                                              performance_metric='accuracy')
    print('Accuracy for the ensemble of models {} is: {}'.format(
        list(eval_classifiers.keys()), accuracy))
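

if __name__ == '__main__':
    # Run the pipeline when the script is executed directly.
    main()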