def prep_us_nfl_pred(input_df, use_csv=False,
                     data_csv='data/output_dont_commit/reg_output.csv'):
    """Prepare regression-output data for K-Means clustering.

    Adds a per-row 'Squared Error' column (actual vs. predicted total
    points), drops identifying and raw-stat columns, and converts the
    remaining features to a float NumPy array.

    Args:
        input_df: DataFrame holding regression output; must contain
            'Total points' and 'Total points predicted' columns.
        use_csv: If True, reload the data from `data_csv` instead of
            using `input_df`.
        data_csv: Path to the regression-output CSV used when
            `use_csv` is True.

    Returns:
        Tuple of (input_df with 'Squared Error' added, float ndarray of
        the clustering feature space).
    """
    if use_csv:
        input_df = sf.load_model_data(data_csv)

    # Model Run - K-Means Clustering - Data Preparation
    # Squared error per row. Vectorized column arithmetic replaces the
    # original row-wise .apply(lambda ...), which makes one Python call
    # per row for the same result.
    input_df['Squared Error'] = (
        input_df['Total points'] - input_df['Total points predicted']) ** 2

    # Dropping Unwanted Columns (identifiers, raw stats, averages,
    # polynomial terms, and targets — none belong in the cluster space).
    us_columns_to_drop = [
        'name', 'pos', 'year', 'home', 'against', 'week', 'score',
        'opp_score', 'month', 'team', 'rush_att', 'rush_td', 'rush_yd',
        'rec_td', 'rec_yd', 'fumble', 'pass_yd', 'pass_td', 'pass_int',
        'rush_att_av', 'rush_td_av', 'rush_yd_av', 'rec_td_av',
        'rec_yd_av', 'fumble_av', 'pass_yd_av', 'pass_td_av',
        'pass_int_av', 'rush_att_av^2', 'rush_td_av^2', 'rush_yd_av^2',
        'rec_td_av^2', 'rec_yd_av^2', 'fumble_av^2', 'pass_yd_av^2',
        'pass_td_av^2', 'pass_int_av^2', 'points',
        'Total points predicted'
    ]
    us_input_df = input_df.drop(us_columns_to_drop, axis=1)

    # NOTE(review): `log` is assumed to be a module-level logger — it is
    # not defined in this function; confirm against the module header.
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Clustering Input Data:" + sf.Color.END)
    log.info(us_input_df.head(3))

    # DataFrame.as_matrix() was removed in pandas 1.0 and the np.float
    # alias was removed in NumPy 1.24; to_numpy(dtype=float) is the
    # supported equivalent.
    input_npa = us_input_df.to_numpy(dtype=float)
    return input_df, input_npa
def prep_us_nfl_pred(input_df, use_csv=False,
                     data_csv='data/output_dont_commit/reg_output.csv'):
    """Prepare regression-output data for K-Means clustering.

    Computes a per-row 'Squared Error' (actual vs. predicted total
    points), removes non-feature columns, and returns the remaining
    feature space as a float NumPy array.

    Args:
        input_df: DataFrame of regression output containing
            'Total points' and 'Total points predicted'.
        use_csv: If True, reload data from `data_csv` instead of using
            `input_df`.
        data_csv: Path to the regression-output CSV.

    Returns:
        Tuple of (augmented input_df, float ndarray of clustering
        features).
    """
    if use_csv:
        input_df = sf.load_model_data(data_csv)

    # Model Run - K-Means Clustering - Data Preparation
    # Vectorized squared error; the original per-row .apply(lambda ...)
    # did the same computation one Python call per row.
    residual = input_df['Total points'] - input_df['Total points predicted']
    input_df['Squared Error'] = residual ** 2

    # Columns that are identifiers, raw stats, averages, polynomial
    # terms, or targets — excluded from the clustering feature space.
    us_columns_to_drop = [
        'name', 'pos', 'year', 'home', 'against', 'week', 'score',
        'opp_score', 'month', 'team', 'rush_att', 'rush_td', 'rush_yd',
        'rec_td', 'rec_yd', 'fumble', 'pass_yd', 'pass_td', 'pass_int',
        'rush_att_av', 'rush_td_av', 'rush_yd_av', 'rec_td_av',
        'rec_yd_av', 'fumble_av', 'pass_yd_av', 'pass_td_av',
        'pass_int_av', 'rush_att_av^2', 'rush_td_av^2', 'rush_yd_av^2',
        'rec_td_av^2', 'rec_yd_av^2', 'fumble_av^2', 'pass_yd_av^2',
        'pass_td_av^2', 'pass_int_av^2', 'points',
        'Total points predicted'
    ]
    us_input_df = input_df.drop(us_columns_to_drop, axis=1)

    # NOTE(review): `log` is not defined here — presumably a
    # module-level logger; verify against the module header.
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Clustering Input Data:" + sf.Color.END)
    log.info(us_input_df.head(3))

    # to_numpy(dtype=float) replaces as_matrix().astype(np.float):
    # as_matrix() was removed in pandas 1.0 and np.float in NumPy 1.24.
    input_npa = us_input_df.to_numpy(dtype=float)
    return input_df, input_npa
def prep_reg_nfl_pred(feature_scaling=False,
                      data_csv='dataproc/nfl_pred_data.csv'):
    """Prepare NFL player data for the points-regression model.

    Loads the CSV, isolates the 'points' target, label-encodes the
    categorical columns, derives squared ('^2') polynomial features
    from the per-game averages, optionally standardizes both feature
    groups, and returns the merged feature matrix.

    Args:
        feature_scaling: If True, apply StandardScaler to the base and
            polynomial feature groups (each scaled independently).
        data_csv: Path to the source data CSV.

    Returns:
        List of [x, y, nfl_df]: float feature matrix, target array, and
        the original loaded DataFrame.
    """
    log = logging.getLogger('debug')

    nfl_df = sf.load_model_data(data_csv)

    # Previewing Column Names
    col_names = nfl_df.columns.tolist()
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Column Names:" + sf.Color.END)
    log.info(col_names)

    # Previewing Source Data
    to_show = col_names[:]
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Loaded Data:" + sf.Color.END)
    log.info(nfl_df[to_show].head(3))

    # Isolate Output Data
    y = np.array(nfl_df['points'])

    # Columns to drop from the features: the target plus the raw
    # per-game stats (only their averages are used as features).
    to_drop = [
        'points', 'rush_att', 'rush_td', 'rush_yd', 'rec_td', 'rec_yd',
        'fumble', 'pass_yd', 'pass_td', 'pass_int'
    ]
    base_feat_space = nfl_df.drop(to_drop, axis=1)

    # Previewing Feature Names
    feature_names = base_feat_space.columns.tolist()
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Feature Names:" + sf.Color.END)
    log.debug(feature_names)

    # Label-encode the categorical columns so they become numeric.
    base_feat_space = MultiColumnLabelEncoder(
        columns=['name', 'pos', 'year', 'against', 'team']).fit_transform(
            base_feat_space)
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Data Post Label Encoding:" + sf.Color.END)
    log.info(base_feat_space.head(3))

    # Build squared polynomial features from the per-game averages.
    features_with_polynomials = [
        'rush_att_av', 'rush_td_av', 'rush_yd_av', 'rec_td_av',
        'rec_yd_av', 'fumble_av', 'pass_yd_av', 'pass_td_av',
        'pass_int_av'
    ]
    polynomial_feat_space = base_feat_space[features_with_polynomials].copy(
        deep=True)
    for feature in features_with_polynomials:
        new_feature = feature + '^2'
        log.debug('New Feature %s Added Based on %s', new_feature, feature)
        polynomial_feat_space[new_feature] = polynomial_feat_space[feature]**2

    # Keep only the derived '^2' columns; the originals stay in the
    # base feature space.
    polynomial_feat_space = polynomial_feat_space.drop(
        features_with_polynomials, axis=1)
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Data Polynomial Features:" + sf.Color.END)
    log.info(polynomial_feat_space.head(3))

    # Scaling & Normalization — each feature group is standardized
    # independently before merging.
    if feature_scaling:
        base_feat_space = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(base_feat_space),
            columns=base_feat_space.columns)
        log.info(sf.Color.BOLD + sf.Color.GREEN +
                 "Sample Data Post Base Features Scaling:" + sf.Color.END)
        log.info(base_feat_space.head(3))

        polynomial_feat_space = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(
                polynomial_feat_space),
            columns=polynomial_feat_space.columns)
        log.info(sf.Color.BOLD + sf.Color.GREEN +
                 "Sample Data Post Polynomial Features Scaling:" +
                 sf.Color.END)
        log.info(polynomial_feat_space.head(3))

    # Merge Base & Polynomial Features; Convert to NumPy Array.
    # to_numpy(dtype=float) replaces as_matrix().astype(np.float):
    # as_matrix() was removed in pandas 1.0 and np.float in NumPy 1.24.
    x_df = pd.concat([base_feat_space, polynomial_feat_space], axis=1)
    x = x_df.to_numpy(dtype=float)

    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Transformed Data:" + sf.Color.END)
    log.info(x[0:3])
    log.info("Feature Space holds %d Observations and %d Features" % x.shape)

    return [x, y, nfl_df]
def prep_reg_nfl_pred(feature_scaling=False,
                      data_csv='dataproc/nfl_pred_data.csv'):
    """Prepare NFL player data for the points-regression model.

    Loads the CSV, extracts the 'points' target, label-encodes
    categorical columns, adds squared ('^2') features derived from the
    per-game averages, optionally standardizes each feature group, and
    returns the merged feature matrix.

    Args:
        feature_scaling: If True, standardize the base and polynomial
            feature groups with StandardScaler (each independently).
        data_csv: Path to the source data CSV.

    Returns:
        List of [x, y, nfl_df]: float feature matrix, target array, and
        the original loaded DataFrame.
    """
    log = logging.getLogger('debug')

    nfl_df = sf.load_model_data(data_csv)

    # Preview column names.
    col_names = nfl_df.columns.tolist()
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Column Names:" + sf.Color.END)
    log.info(col_names)

    # Preview source data.
    to_show = col_names[:]
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Loaded Data:" + sf.Color.END)
    log.info(nfl_df[to_show].head(3))

    # Isolate the regression target.
    y = np.array(nfl_df['points'])

    # Drop the target and the raw per-game stats from the features —
    # only the averaged versions are used as predictors.
    to_drop = [
        'points', 'rush_att', 'rush_td', 'rush_yd', 'rec_td', 'rec_yd',
        'fumble', 'pass_yd', 'pass_td', 'pass_int'
    ]
    base_feat_space = nfl_df.drop(to_drop, axis=1)

    # Preview feature names.
    feature_names = base_feat_space.columns.tolist()
    log.info(sf.Color.BOLD + sf.Color.GREEN + "Feature Names:" + sf.Color.END)
    log.debug(feature_names)

    # Rebase the categorical columns to numeric codes.
    base_feat_space = MultiColumnLabelEncoder(
        columns=['name', 'pos', 'year', 'against', 'team']).fit_transform(
            base_feat_space)
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Data Post Label Encoding:" + sf.Color.END)
    log.info(base_feat_space.head(3))

    # Derive squared polynomial features from the per-game averages.
    features_with_polynomials = [
        'rush_att_av', 'rush_td_av', 'rush_yd_av', 'rec_td_av',
        'rec_yd_av', 'fumble_av', 'pass_yd_av', 'pass_td_av',
        'pass_int_av'
    ]
    polynomial_feat_space = base_feat_space[features_with_polynomials].copy(
        deep=True)
    for feature in features_with_polynomials:
        new_feature = feature + '^2'
        log.debug('New Feature %s Added Based on %s', new_feature, feature)
        polynomial_feat_space[new_feature] = polynomial_feat_space[feature]**2

    # Keep only the '^2' columns here; the originals remain in the base
    # feature space.
    polynomial_feat_space = polynomial_feat_space.drop(
        features_with_polynomials, axis=1)
    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Data Polynomial Features:" + sf.Color.END)
    log.info(polynomial_feat_space.head(3))

    # Optional scaling & normalization, applied per feature group
    # before the merge.
    if feature_scaling:
        base_feat_space = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(base_feat_space),
            columns=base_feat_space.columns)
        log.info(sf.Color.BOLD + sf.Color.GREEN +
                 "Sample Data Post Base Features Scaling:" + sf.Color.END)
        log.info(base_feat_space.head(3))

        polynomial_feat_space = pd.DataFrame(
            preprocessing.StandardScaler().fit_transform(
                polynomial_feat_space),
            columns=polynomial_feat_space.columns)
        log.info(sf.Color.BOLD + sf.Color.GREEN +
                 "Sample Data Post Polynomial Features Scaling:" +
                 sf.Color.END)
        log.info(polynomial_feat_space.head(3))

    # Merge base & polynomial features and convert to a float ndarray.
    # to_numpy(dtype=float) replaces as_matrix().astype(np.float):
    # as_matrix() was removed in pandas 1.0 and np.float in NumPy 1.24.
    x_df = pd.concat([base_feat_space, polynomial_feat_space], axis=1)
    x = x_df.to_numpy(dtype=float)

    log.info(sf.Color.BOLD + sf.Color.GREEN +
             "Sample Transformed Data:" + sf.Color.END)
    log.info(x[0:3])
    log.info("Feature Space holds %d Observations and %d Features" % x.shape)

    return [x, y, nfl_df]