import pandas as pd
from sklearn.model_selection import train_test_split

# NOTE: PreProcessor and the run_* modeling helpers used below are assumed to be
# imported from this project's own modules (their import paths are not shown in
# this excerpt).


def data_prep(size=5000, train_file_path='data/train.csv', split=True, remove=None):
    """
    Data preprocessing helper function for local running of the ensemble.

    INPUTS:
    size (int) - number of rows of the train data to use
    train_file_path (string) - filepath to location of train data (as csv)
    split (bool) - whether to split the data into train and test components
        or leave as one unit
    remove (list or None) - columns to drop; if None, a default column set is used

    OUTPUTS:
    df_train, df_test (DataFrames) if split is True, otherwise a single DataFrame
    """
    # prepare data for modeling
    print("Loading data...")
    train = pd.read_csv(train_file_path)
    if size > len(train):
        df = train
    else:
        df = train[:size]

    print("Preprocessing...")
    P = PreProcessor()
    if remove:
        remove_most = remove
    else:
        remove_most = ['Unnamed: 0', 'annotations', 'archived', 'author', 'date',
                       'distinguished', 'edited', 'gilded', 'in_reply_to',
                       'is_first_post', 'link_id', 'link_id_ann', 'majority_link',
                       'name', 'parent_id', 'replies', 'retrieved_on', 'saved',
                       'score_hidden', 'subreddit', 'title', 'user_reports',
                       'ann_1', 'ann_2', 'ann_3']

    if split:
        # make splits
        print("Splitting...")
        df_train, df_test = train_test_split(df, test_size=0.25)
        df_train = P.run(df_train, 'body', cols_to_drop=remove_most, direct_to_model=False)
        df_test = P.run(df_test, 'body', cols_to_drop=remove_most, direct_to_model=False)
        return df_train, df_test
    else:
        df_train = P.run(df, 'body', cols_to_drop=remove_most, direct_to_model=False)
        return df_train
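
# Usage sketch for data_prep (illustrative assumption, not part of the original
# module; the 10000-row size is arbitrary):
#
#   df_train, df_test = data_prep(size=10000, split=True)    # preprocessed train/test frames
#   df_full = data_prep(size=10000, split=False)              # one preprocessed frame
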
def main(size=5000, grid=False):
    """
    Composite function designed for running tests.

    INPUTS:
    size (int) - number of rows of the data set to use
    grid (bool) - whether or not to grid search

    OUTPUTS:
    None
    """
    # prepare data for modeling
    print("Loading data...")
    train = pd.read_csv('data/train.csv')
    if size > len(train):
        df = train
    else:
        df = train[:size]

    # make splits
    print("Splitting...")
    df_train, df_test = train_test_split(df, test_size=0.20)

    print("Preprocessing...")
    P = PreProcessor()
    remove_all_but_text = None  # unused alternate column-drop option
    remove_most = ['Unnamed: 0', 'annotations', 'archived', 'author', 'date',
                   'distinguished', 'edited', 'gilded', 'in_reply_to',
                   'is_first_post', 'link_id', 'link_id_ann', 'majority_link',
                   'name', 'parent_id', 'replies', 'retrieved_on', 'saved',
                   'score_hidden', 'subreddit', 'title', 'user_reports',
                   'ann_1', 'ann_2', 'ann_3']
    X_train, y_train = P.run(df_train, 'body', cols_to_drop=remove_most, direct_to_model=True)
    X_test, y_test = P.run(df_test, 'body', cols_to_drop=remove_most, direct_to_model=True)

    # establish baseline models
    baseline_scores = run_baseline_modeling(X_train, y_train, X_test, y_test)

    # look at basic NB model results (reduced to NB)
    nb_models, NB_base_scores = run_basic_nb_models(X_train, y_train, X_test, y_test)

    if grid:
        # run grid search
        run_alt_model_tests(X_train, y_train, X_test, y_test)
    else:
        # look at basic model scores
        alt_models, alt_scores = run_alt_models(X_train, y_train, X_test, y_test)

        print("\n\nBaseline Scores:")
        for n, s in zip(['Weighted Guess', 'Guess Most Frequent'], baseline_scores):
            print("{}: {}".format(n, s))
        print("Naive Bayes Scores:")
        for n, s in zip(['Naive Bayes', 'Multinomial Bayes'], NB_base_scores):
            print("{}: {}".format(n, s))
        print("Other Model Scores:")
        for n, s in zip(['Logistic Regression', 'Random Forest',
                         'Gradient Boosting', 'Adaboost'], alt_scores):
            print("{}: {}".format(n, s))
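
# Entry-point sketch (an assumption; the original file may wire this up
# differently). It lets the composite test run from the command line, e.g.
# `python run_models.py 5000 --grid` (the module name is hypothetical).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run the baseline and ensemble model tests.')
    parser.add_argument('size', nargs='?', type=int, default=5000,
                        help='number of rows of the data set to use')
    parser.add_argument('--grid', action='store_true',
                        help='run the grid search instead of the basic model comparison')
    args = parser.parse_args()
    main(size=args.size, grid=args.grid)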