def data_prep(size=5000, train_file_path='data/train.csv', split=True, remove=None):
    """
    Data preprocessing helper function for local running of the ensemble.
    INPUTS:
    size (int) - number of rows of the train data to use
    train_file_path (string) - filepath to location of train data (as csv)
    split (bool) - whether to split the data into train and test components or
                    leave as one unit.
    """
    # prepare data for modeling
    print("Loading data...")
    train = pd.read_csv(train_file_path)
    # cap the sample at the full data set if the requested size exceeds it
    if size > len(train):
        df = train
    else:
        df = train[:size]

    print("Preprocessing...")
    P = PreProcessor()

    if remove:
        remove_most = remove
    else:
        remove_most = ['Unnamed: 0', 'annotations', 'archived', 'author',
                       'date', 'distinguished', 'edited', 'gilded',
                       'in_reply_to', 'is_first_post', 'link_id',
                       'link_id_ann', 'majority_link', 'name', 'parent_id',
                       'replies', 'retrieved_on', 'saved', 'score_hidden',
                       'subreddit', 'title', 'user_reports', 'ann_1', 'ann_2',
                       'ann_3']

    if split:
        # make splits
        print("Splitting...")
        df_train, df_test = train_test_split(df, test_size=0.25)
        df_train = P.run(df_train, 'body', cols_to_drop=remove_most, direct_to_model=False)
        df_test = P.run(df_test, 'body', cols_to_drop=remove_most, direct_to_model=False)
        return df_train, df_test
    else:
        df_train = P.run(df, 'body', cols_to_drop=remove_most, direct_to_model=False)
        return df_train
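

# Example usage of data_prep (illustrative sketch; assumes pandas, sklearn's
# train_test_split, and this project's PreProcessor are imported at module
# level, and that data/train.csv exists locally):
#
#     df_train, df_test = data_prep(size=1000)        # 75/25 split
#     df_all = data_prep(size=1000, split=False)      # single DataFrame
#     df_small = data_prep(remove=['Unnamed: 0'])     # custom drop list

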
def main(size=5000, grid=False):
    """
    Composite function designed for running tests.
    INPUTS:
    size (int) - number of rows of the data set to use
    grid (bool) - whether or not to grid search
    OUTPUTS:
    None
    """
    # prepare data for modeling
    print("Loading data...")
    train = pd.read_csv('data/train.csv')
    # cap the sample at the full data set if the requested size exceeds it
    if size > len(train):
        df = train
    else:
        df = train[:size]

    # make splits
    print("Splitting...")
    df_train, df_test = train_test_split(df, test_size=0.20)

    print("Preprocessing...")
    P = PreProcessor()
    remove_all_but_text = None  # currently unused

    remove_most = ['Unnamed: 0', 'annotations', 'archived', 'author', 'date',
                   'distinguished', 'edited', 'gilded', 'in_reply_to',
                   'is_first_post', 'link_id', 'link_id_ann', 'majority_link',
                   'name', 'parent_id', 'replies', 'retrieved_on', 'saved',
                   'score_hidden', 'subreddit', 'title', 'user_reports',
                   'ann_1', 'ann_2', 'ann_3']

    X_train, y_train = P.run(df_train,
                             'body',
                             cols_to_drop=remove_most,
                             direct_to_model=True)
    X_test, y_test = P.run(df_test,
                           'body',
                           cols_to_drop=remove_most,
                           direct_to_model=True)

    # establish baseline models
    baseline_scores = run_baseline_modeling(X_train, y_train, X_test, y_test)
    # look at basic NB model results (reduced to NB)
    nb_models, NB_base_scores = run_basic_nb_models(X_train, y_train, X_test,
                                                    y_test)

    if grid:
        # run grid search
        run_alt_model_tests(X_train, y_train, X_test, y_test)
    else:
        # look at basic model scores
        alt_models, alt_scores = run_alt_models(X_train, y_train, X_test,
                                                y_test)
        print("\n\nBaseline Scores: ")
        for n, s in zip(['Weighted Guess', 'Guess Most Frequent'],
                        baseline_scores):
            print("{}: {}".format(n, s))
        print("Naive Bayes Scores")
        for n, s in zip(['Naive Bayes', 'Multinomial Bayes'], NB_base_scores):
            print("{}: {}".format(n, s))
        print("Other model Scores: ")
        for n, s in zip([
                'Logistic Regression', 'Random Forest', 'Gradient Boosting',
                'Adaboost'
        ], alt_scores):
            print("{}: {}".format(n, s))