import logging
import time

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Project-local helpers; the module paths below are assumptions, adjust them
# to the repo layout. `sf` supplies load_model_data() and Color, while
# MultiColumnLabelEncoder and run_clustering are the project's own utilities.
import support_functions as sf
from multi_column_label_encoder import MultiColumnLabelEncoder
from clustering import run_clustering

logger = logging.getLogger(__name__)


def prep_us_nfl_pred(input_df, use_csv=False,
                     data_csv='data/output_dont_commit/reg_output.csv'):
    if use_csv:
        input_df = sf.load_model_data(data_csv)

    # Model Run - K-Means Clustering - Data Preparation
    # Adding Squared Error Column (per-row squared error, not its mean)
    input_df['Squared Error'] = input_df.apply(
        lambda row: (row['Total points'] - row['Total points predicted']) ** 2,
        axis=1)

    # Dropping Unwanted Columns
    us_columns_to_drop = ['Total points', 'Total points predicted']
    us_input_df = input_df.drop(us_columns_to_drop, axis=1)

    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Clustering Input Data:" + sf.Color.END)
    logger.info(us_input_df.head(3))

    # Using Label Encoding to Rebase the Values in these Columns
    us_input_df = MultiColumnLabelEncoder(
        columns=['name', 'year', 'team']).fit_transform(us_input_df)

    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Data Post Label Encoding:" + sf.Color.END)
    logger.info(us_input_df.head(3))

    # Converting to NumPy Array (DataFrame.as_matrix() and np.float are
    # removed in current pandas/NumPy; .values with float is the replacement)
    us_input_npa = us_input_df.values.astype(float)

    return input_df, us_input_npa
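# For reference, a minimal sketch of what MultiColumnLabelEncoder is assumed
# to do (the common per-column sklearn LabelEncoder pattern); the project's
# actual implementation may differ, so this class is named with a "Sketch"
# suffix to avoid shadowing the real import above.
from sklearn.preprocessing import LabelEncoder


class MultiColumnLabelEncoderSketch:
    """Label-encode a fixed set of DataFrame columns in place."""

    def __init__(self, columns=None):
        self.columns = columns  # columns to encode; None means all columns

    def fit(self, x, y=None):
        return self

    def transform(self, x):
        output = x.copy()
        columns = self.columns if self.columns is not None else output.columns
        for col in columns:
            # Each column gets its own encoder, so codes are independent
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, x, y=None):
        return self.fit(x, y).transform(x)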
def prep_reg_nfl_pred(feature_scaling=False, data_csv='data/nfl_pred_data.csv'):
    nfl_df = sf.load_model_data(data_csv)

    col_names = nfl_df.columns.tolist()
    logger.info(sf.Color.BOLD + sf.Color.GREEN + "Column Names:" + sf.Color.END)
    logger.info(col_names)

    to_show = col_names[:]
    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Loaded Data:" + sf.Color.END)
    logger.info(nfl_df[to_show].head(3))

    # Isolate Target Data
    y = np.array(nfl_df['Total points'])

    # Columns to Drop (For Features Data Frame)
    to_drop = ['Total points']
    nfl_feat_space = nfl_df.drop(to_drop, axis=1)

    # Capturing Feature Names
    feature_names = nfl_feat_space.columns.tolist()
    logger.info(sf.Color.BOLD + sf.Color.GREEN + "Feature Names:" + sf.Color.END)
    logger.debug(feature_names)

    # Using Label Encoding to Rebase the Values in these Columns
    nfl_feat_space = MultiColumnLabelEncoder(
        columns=['name', 'year', 'team']).fit_transform(nfl_feat_space)
    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Data Post Label Encoding:" + sf.Color.END)
    logger.info(nfl_feat_space.head(3))

    # Make NumPy Array (.values replaces the removed DataFrame.as_matrix())
    x = nfl_feat_space.values.astype(float)

    # Handle Feature Scaling and Normalization
    if feature_scaling:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)

    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Transformed Data:" + sf.Color.END)
    logger.info(x[0:3])

    logger.info("Feature Space holds %d Observations and %d Features" % x.shape)

    return [x, y, nfl_df]
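# run_clustering is a project-local function not shown in this file; as an
# assumption, a minimal sketch of its expected behaviour follows: fit the
# estimator once per k in [min_cluster, max_cluster] and return the cluster
# labels as DataFrame columns, one per k.
def run_clustering_sketch(x, make_plots=False, clf_class=KMeans,
                          min_cluster=3, max_cluster=3, **kwargs):
    # make_plots is accepted for signature parity; plotting is omitted here
    labels = {}
    for k in range(min_cluster, max_cluster + 1):
        model = clf_class(n_clusters=k, **kwargs)
        # fit_predict returns one cluster label per observation in x
        labels['Cluster %d Labels' % k] = model.fit_predict(x)
    return pd.DataFrame(labels)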
##################################################################################################################
if __name__ == "__main__":
    start_time = time.time()

    # Machine Learning Chosen Models
    estimator = KMeans

    # Model Run - Clustering - K-Means - Estimator Keywords
    estimator_keywords = dict(init='k-means++', n_init=10, verbose=0)

    # Model Run - K-Means Clustering - Data Preparation
    # Load Input Data Frame
    input_df = sf.load_model_data('data/output_dont_commit/reg_output.csv')

    # Dropping Unwanted Columns
    columns_to_drop = ['Total points', 'Total points predicted']
    us_input_df = input_df.drop(columns_to_drop, axis=1)

    # Converting to NumPy Array
    us_input_npa = us_input_df.values.astype(float)

    # Model Run - K-Means Clustering
    us_kcluster_df = run_clustering(us_input_npa, make_plots=False,
                                    clf_class=estimator, min_cluster=3,
                                    max_cluster=3, **estimator_keywords)

    # Model Run - K-Means Clustering - Output Processing
    # Combine Input & Output Data Frames
    us_result_df = pd.concat([input_df, us_kcluster_df], axis=1)
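    # start_time is captured above but never reported in the original script;
    # a likely intended closing line (assumption, not from the source) is:
    logger.info("Total Run Time: %.2f seconds" % (time.time() - start_time))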