def prep_us_nfl_pred(input_df,
                     use_csv=False,
                     data_csv='data/output_dont_commit/reg_output.csv'):
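    """Prepare regression output for unsupervised K-Means clustering.

    Adds a per-row squared-error column, drops the actual and predicted
    target columns, and label-encodes the categorical columns.

    Returns a tuple of (input_df with the new 'Squared Error' column,
    NumPy float array ready for clustering).
    """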

    if use_csv:
        input_df = sf.load_model_data(data_csv)

    # Model Run - K-Means Clustering - Data Preparation
    # Adding Per-Row Squared Error Column (vectorized; no apply() needed)
    input_df['Squared Error'] = (input_df['Total points'] -
                                 input_df['Total points predicted']) ** 2

    # Dropping Unwanted Columns
    us_columns_to_drop = ['Total points', 'Total points predicted']
    us_input_df = input_df.drop(us_columns_to_drop, axis=1)

    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Clustering Input Data:" + sf.Color.END)
    logger.info(us_input_df.head(3))

    # Using Label Encoding to Rebase the Values in these Columns
    us_input_df = MultiColumnLabelEncoder(
        columns=['name', 'year', 'team']).fit_transform(us_input_df)
    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Data Post Label Encoding:" + sf.Color.END)
    logger.info(us_input_df.head(3))

    # Converting to NumPy Array (as_matrix() was removed in pandas 1.0)
    us_input_npa = us_input_df.to_numpy().astype(float)

    return input_df, us_input_npa
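
# MultiColumnLabelEncoder is assumed to be imported above; it is a custom
# helper, not part of scikit-learn. A minimal sketch of the expected
# interface, kept commented out so it does not shadow the real import:
#
#     class MultiColumnLabelEncoder:
#         def __init__(self, columns=None):
#             self.columns = columns  # columns to encode; None means all
#
#         def fit_transform(self, df):
#             out = df.copy()
#             for col in (self.columns if self.columns is not None
#                         else out.columns):
#                 out[col] = LabelEncoder().fit_transform(out[col])
#             return out
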
def prep_reg_nfl_pred(feature_scaling=False,
                      data_csv='data/nfl_pred_data.csv'):
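    """Prepare NFL data for regression on 'Total points'.

    Loads the CSV, splits the target vector y from the feature matrix x,
    label-encodes the categorical columns, and optionally standardizes
    the features.

    Returns [x, y, nfl_df].
    """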
    nfl_df = sf.load_model_data(data_csv)

    col_names = nfl_df.columns.tolist()

    logger.info(sf.Color.BOLD + sf.Color.GREEN + "Column Names:" +
                sf.Color.END)
    logger.info(col_names)

    logger.info(sf.Color.BOLD + sf.Color.GREEN + "Sample Loaded Data:" +
                sf.Color.END)
    logger.info(nfl_df.head(3))

    # Isolate Target Data
    y = np.array(nfl_df['Total points'])

    # Columns to Drop (For Features Data Frame)
    to_drop = ['Total points']
    nfl_feat_space = nfl_df.drop(to_drop, axis=1)

    # Capturing Feature Names
    feature_names = nfl_feat_space.columns.tolist()
    logger.info(sf.Color.BOLD + sf.Color.GREEN + "Feature Names:" +
                sf.Color.END)
    logger.debug(feature_names)

    # Using Label Encoding to Rebase the Values in these Columns
    nfl_feat_space = MultiColumnLabelEncoder(
        columns=['name', 'year', 'team']).fit_transform(nfl_feat_space)
    logger.info(sf.Color.BOLD + sf.Color.GREEN +
                "Sample Data Post Label Encoding:" + sf.Color.END)
    logger.info(nfl_feat_space.head(3))

    # Make NumPy Array (as_matrix() was removed in pandas 1.0)
    x = nfl_feat_space.to_numpy().astype(float)

    # Optionally Standardize Features to Zero Mean and Unit Variance
    if feature_scaling:
        scaler = StandardScaler()
        x = scaler.fit_transform(x)

    logger.info(sf.Color.BOLD + sf.Color.GREEN + "Sample Transformed Data:" +
                sf.Color.END)
    logger.info(x[0:3])

    logger.info("Feature Space holds %d Observations and %d Features" %
                x.shape)

    return [x, y, nfl_df]
##################################################################################################################

if __name__ == "__main__":

    start_time = time.time()

    # Chosen Machine Learning Model
    estimator = KMeans

    # Model Run - K-Means Clustering - Estimator Keywords
    estimator_keywords = dict(init='k-means++', n_init=10, verbose=0)
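    # init='k-means++' spreads out the initial centroids; n_init=10 repeats
    # the clustering with 10 different seeds and keeps the best result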

    # Model Run - K-Means Clustering - Data Preparation
    # Load Input Data Frame
    input_df = sf.load_model_data('data/output_dont_commit/reg_output.csv')

    # Dropping Unwanted Columns
    columns_to_drop = ['Total points', 'Total points predicted']
    us_input_df = input_df.drop(columns_to_drop, axis=1)

    # Converting to NumPy Array (as_matrix() was removed in pandas 1.0)
    us_input_npa = us_input_df.to_numpy().astype(float)

    # Model Run - K-Means Clustering
    us_kcluster_df = run_clustering(us_input_npa,
                                    make_plots=False,
                                    clf_class=estimator,
                                    min_cluster=3,
                                    max_cluster=3,
                                    **estimator_keywords)

    # Model Run - K-Means Clustering - Output Processing
    # Combine Input & Output Data Frames
    us_result_df = pd.concat([input_df, us_kcluster_df], axis=1)
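
    # Log Total Run Time (start_time is captured above; the message format
    # here is an assumption)
    logger.info("Total Run Time: %.2f seconds" % (time.time() - start_time))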