def extract(self, year):
    """Load one year's budget workbook and store its aid totals in SQL.

    Reads ``<config.data_path>/budgets/<self.orig_table_s_d[year]>.xlsx``,
    keeps only the rows whose 'Aid Category' equals
    'Sum of Above Aid Categories', keeps the 'BEDS Code' column plus the
    column named by ``self.column_s[year]``, renames them to
    ``district_<year>`` and ``<self.new_table_s>_<year>``, indexes the frame
    by the district column and hands it to ``utilities.write_to_sql_table``
    under the table name ``temp<year>_final``.

    Args:
        year: integer year; used as a key into ``self.orig_table_s_d`` and
            ``self.column_s`` and formatted with ``:d``.
    """

    print('Creating {0} for year {1:d}'.format(self.new_table_s, year))

    workbook_path = os.path.join(config.data_path, 'budgets',
                                 self.orig_table_s_d[year] + '.xlsx')
    raw_df = pd.read_excel(workbook_path)

    # Only the summary rows are kept.
    is_total_row = raw_df['Aid Category'] == 'Sum of Above Aid Categories'
    totals_df = raw_df[is_total_row].loc[:, ['BEDS Code',
                                             self.column_s[year]]]

    index_name_s = 'district_{0}'.format(year)
    value_name_s = '{1}_{0:d}'.format(year, self.new_table_s)
    totals_df.columns = [index_name_s, value_name_s]

    # The third argument is passed through as-is; presumably it names the
    # target database/schema -- confirm against utilities.write_to_sql_table.
    utilities.write_to_sql_table(totals_df.set_index(index_name_s),
                                 'temp{0:d}_final'.format(year),
                                 'temp')
Esempio n. 2
0
    def extract(self, year):
        """Extract the per-district aid totals for ``year`` into a temp table.

        The workbook ``<self.orig_table_s_d[year]>.xlsx`` under
        ``<config.data_path>/budgets`` is read with pandas. Rows are filtered
        to those where 'Aid Category' is 'Sum of Above Aid Categories', and
        only 'BEDS Code' and the column ``self.column_s[year]`` are retained.
        The two columns are renamed to ``district_<year>`` and
        ``<self.new_table_s>_<year>``; the result, indexed by the district
        column, is written through ``utilities.write_to_sql_table`` as
        ``temp<year>_final``.

        Args:
            year: integer year selecting the source workbook and column.
        """

        print('Creating {0} for year {1:d}'.format(self.new_table_s, year))

        source_file_s = self.orig_table_s_d[year] + '.xlsx'
        budget_df = pd.read_excel(
            os.path.join(config.data_path, 'budgets', source_file_s))

        # Restrict to the summary rows, then to the two columns of interest.
        summary_df = budget_df[
            budget_df['Aid Category'] == 'Sum of Above Aid Categories']
        wanted_column_l = ['BEDS Code', self.column_s[year]]
        selected_df = summary_df.loc[:, wanted_column_l]

        district_column_s = 'district_{0}'.format(year)
        selected_df.columns = [
            district_column_s,
            '{1}_{0:d}'.format(year, self.new_table_s),
        ]

        indexed_df = selected_df.set_index(district_column_s)
        output_table_s = 'temp{0:d}_final'.format(year)
        # 'temp' is forwarded unchanged; its meaning is defined by
        # utilities.write_to_sql_table (not visible here).
        utilities.write_to_sql_table(indexed_df, output_table_s, 'temp')
Esempio n. 3
0
def predict_a_feature(input_data_a_d,
                      primary_feature_s,
                      aux_features=True,
                      save_data=False,
                      **kwargs):
    """Run regression models and controls on one school statistic.

    Wraps around fit_and_predict: fits AutoRegression with lags 1-4 plus
    three control models to ``primary_feature_s``, appends the
    cross-validated train/test RMS errors to ``RMSE_list.txt`` under
    ``config.plot_path``, saves a bar chart of all error measures to
    ``<config.plot_path>/create_predictions``, and optionally writes the
    'raw_lag1' predictions to the SQL database.

    Args:
        input_data_a_d: dict mapping feature name to a 2-D array. Column 0
            of each array is treated as the ENTITY_CD index and stripped
            before modelling. Only a shallow copy is modified, so the
            caller's dict is left as passed in.
        primary_feature_s: key in ``input_data_a_d`` of the feature to
            predict.
        aux_features: when False, the autoregression models receive no
            auxiliary features (empty dict / empty feature list).
        save_data: when True, the 'raw_lag1' predictions are written to the
            table ``<primary_feature_s>_prediction``.
        **kwargs: forwarded unchanged to every ``fit_and_predict`` call.

    Returns:
        None. Results are emitted as files, a plot and (optionally) a table.
    """

    start_msg_s = '\n\nStarting prediction for {0}.\n'.format(
        primary_feature_s)
    print(start_msg_s)
    with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
        f.write(start_msg_s)

    data_a_d = input_data_a_d.copy()
    index_a = data_a_d[primary_feature_s][:, 0]

    ## Drop the ENTITY_CD column
    # Iterate over a snapshot of the keys: dict.iterkeys() no longer exists
    # on Python 3, and list(d) behaves identically on Python 2.
    for feature_s in list(data_a_d):
        data_a_d[feature_s] = data_a_d[feature_s][:, 1:]

    ## Split data
    main_data_a = data_a_d.pop(primary_feature_s)
    if aux_features:
        feature_s_l = sorted(data_a_d)
    else:
        data_a_d = {}
        feature_s_l = []

    ## Run regression models, validate and predict future scores, and run controls
    all_results_d = {}

    # Run autoregression with different lags on raw test scores
    for lag in range(1, 5):
        model_s = 'raw_lag{:d}'.format(lag)
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a,
                                                 AutoRegression,
                                                 primary_feature_s,
                                                 model_s,
                                                 aux_data_a_d=data_a_d,
                                                 diff=False,
                                                 feature_s_l=feature_s_l,
                                                 lag=lag,
                                                 **kwargs)

    # NOTE: two further model families used to be run here and are currently
    # disabled: 'diff_lag{1..3}' (AutoRegression with diff=True on the
    # year-over-year difference) and 'ind_raw_lag{1..4}'
    # (IndependentAutoRegression).

    # Run controls. Per their class names: prediction is the mean over the
    # training years, the same as the previous year's data, and the previous
    # year's data plus the same change as last year, respectively. The 'z_'
    # prefix makes the controls sort after the 'raw_lag*' models.
    control_l = [
        ('z_mean_over_years_score_control', MeanOverYears),
        ('z_same_as_last_year_score_control', SameAsLastYear),
        ('z_same_change_as_last_year_score_control', SameChangeAsLastYear),
    ]
    for model_s, control_class in control_l:
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a, control_class,
                                                 primary_feature_s, model_s,
                                                 **kwargs)

    chosen_baseline_s_l = [
        'z_mean_over_years_score_control', 'z_same_as_last_year_score_control'
    ]
    # These are RMS errors (not MSEs). .items() replaces the Python-2-only
    # .iteritems() and works on both major versions.
    cv_train_rmse_d = {
        key: value['cross_val_train_rms_error']
        for (key, value) in all_results_d.items()
    }
    cv_test_rmse_d = {
        key: value['cross_val_test_rms_error']
        for (key, value) in all_results_d.items()
    }
    model_s_l = sorted(all_results_d)
    with open(os.path.join(config.plot_path, 'RMSE_list.txt'), 'a') as f:
        f.write('\n\n{0}:\n'.format(primary_feature_s))
        for model_s in model_s_l:
            f.write('\n{0}:\n'.format(model_s))
            f.write('\tTrain RMSE:\n\t\t{:1.5f}\n'.format(
                cv_train_rmse_d[model_s]))
            f.write('\tTest RMSE:\n\t\t{:1.5f}\n'.format(
                cv_test_rmse_d[model_s]))
            # Test RMSE relative to each baseline control (< 1 beats it).
            for chosen_baseline_s in chosen_baseline_s_l:
                f.write('\t{0}: \n\t\t{1:1.5g}\n'.format(
                    chosen_baseline_s, cv_test_rmse_d[model_s] /
                    cv_test_rmse_d[chosen_baseline_s]))

    ## Plot RMS errors of all regression models
    fig = plt.figure(figsize=(1.5 * len(model_s_l), 12))
    ax = fig.add_axes([0.10, 0.40, 0.80, 0.50])

    value_s_l = [
        'train_rms_error', 'test_rms_error', 'cross_val_train_rms_error',
        'cross_val_test_rms_error', 'three_year_train_rms_error',
        'three_year_test_rms_error'
    ]
    value_color_l = ['r', 'y', 'g', 'c', 'b', 'm']
    bar_width = 0.12
    model_position_a = np.arange(len(model_s_l))

    # One bar series per error measure. The (i_value - 3) offset places the
    # six bars symmetrically around each tick only when x is the bar's left
    # edge, so the alignment is stated explicitly instead of relying on the
    # matplotlib default (which changed from 'edge' to 'center' in 2.0).
    for i_value, value_s in enumerate(value_s_l):
        value_l = [all_results_d[iter_model_s][value_s]
                   for iter_model_s in model_s_l]
        ax.bar(model_position_a + (i_value - 3) * bar_width,
               value_l,
               bar_width,
               align='edge',
               color=value_color_l[i_value],
               label=value_s)

    # Formatting
    ax.set_title(
        'Comparison of RMS error of autoregression algorithms vs. controls')
    ax.set_xticks(model_position_a)
    ax.set_xticklabels(model_s_l, rotation=90)
    ax.set_ylabel('Root mean squared error')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.50))
    # Grey reference lines at the cross-validated test error of the baselines
    for chosen_baseline_s in chosen_baseline_s_l:
        ax.axhline(y=cv_test_rmse_d[chosen_baseline_s], color=(0.5, 0.5, 0.5))
    ax.set_ylim([0, 1.5 * cv_test_rmse_d['z_mean_over_years_score_control']])
    save_path = os.path.join(config.plot_path, 'create_predictions')
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    fig.savefig(
        os.path.join(
            save_path,
            'rms_error_all_models__{0}.png'.format(primary_feature_s)))

    ## Save data to the SQL database
    if save_data:
        model_to_save_s = 'raw_lag1'
        new_column_s_l = ['ENTITY_CD'] + \
            ['{0}_prediction_{1:d}'.format(primary_feature_s, year)
             for year in config.prediction_year_l]
        # Re-attach the ENTITY_CD column that was stripped before modelling
        prediction_a = np.concatenate(
            (index_a.reshape(-1, 1),
             all_results_d[model_to_save_s]['prediction_a']),
            axis=1)
        prediction_df = pd.DataFrame(prediction_a, columns=new_column_s_l)
        utilities.write_to_sql_table(
            prediction_df, '{0}_prediction'.format(primary_feature_s),
            'joined')
def predict_a_feature(input_data_a_d, primary_feature_s,
                      aux_features=True, save_data=False,
                      **kwargs):
    """Run regression models and controls on one school statistic.

    Wraps around fit_and_predict: fits AutoRegression with lags 1-4 plus
    three control models to ``primary_feature_s``, appends the
    cross-validated train/test RMS errors to ``RMSE_list.txt`` under
    ``config.plot_path``, saves a bar chart of all error measures to
    ``<config.plot_path>/create_predictions``, and optionally writes the
    'raw_lag1' predictions to the SQL database.

    Args:
        input_data_a_d: dict mapping feature name to a 2-D array. Column 0
            of each array is treated as the ENTITY_CD index and stripped
            before modelling. Only a shallow copy is modified, so the
            caller's dict is left as passed in.
        primary_feature_s: key in ``input_data_a_d`` of the feature to
            predict.
        aux_features: when False, the autoregression models receive no
            auxiliary features (empty dict / empty feature list).
        save_data: when True, the 'raw_lag1' predictions are written to the
            table ``<primary_feature_s>_prediction``.
        **kwargs: forwarded unchanged to every ``fit_and_predict`` call.

    Returns:
        None. Results are emitted as files, a plot and (optionally) a table.
    """

    start_msg_s = '\n\nStarting prediction for {0}.\n'.format(primary_feature_s)
    print(start_msg_s)
    with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
        f.write(start_msg_s)

    data_a_d = input_data_a_d.copy()
    index_a = data_a_d[primary_feature_s][:, 0]

    ## Drop the ENTITY_CD column
    # Iterate over a snapshot of the keys: dict.iterkeys() no longer exists
    # on Python 3, and list(d) behaves identically on Python 2.
    for feature_s in list(data_a_d):
        data_a_d[feature_s] = data_a_d[feature_s][:, 1:]

    ## Split data
    main_data_a = data_a_d.pop(primary_feature_s)
    if aux_features:
        feature_s_l = sorted(data_a_d)
    else:
        data_a_d = {}
        feature_s_l = []

    ## Run regression models, validate and predict future scores, and run controls
    all_results_d = {}

    # Run autoregression with different lags on raw test scores
    for lag in range(1, 5):
        model_s = 'raw_lag{:d}'.format(lag)
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a, AutoRegression,
                                                 primary_feature_s,
                                                 model_s,
                                                 aux_data_a_d=data_a_d,
                                                 diff=False,
                                                 feature_s_l=feature_s_l,
                                                 lag=lag,
                                                 **kwargs)

    # NOTE: two further model families used to be run here and are currently
    # disabled: 'diff_lag{1..3}' (AutoRegression with diff=True on the
    # year-over-year difference) and 'ind_raw_lag{1..4}'
    # (IndependentAutoRegression).

    # Run controls. Per their class names: prediction is the mean over the
    # training years, the same as the previous year's data, and the previous
    # year's data plus the same change as last year, respectively. The 'z_'
    # prefix makes the controls sort after the 'raw_lag*' models.
    control_l = [('z_mean_over_years_score_control', MeanOverYears),
                 ('z_same_as_last_year_score_control', SameAsLastYear),
                 ('z_same_change_as_last_year_score_control', SameChangeAsLastYear)]
    for model_s, control_class in control_l:
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a, control_class,
                                                 primary_feature_s, model_s,
                                                 **kwargs)

    chosen_baseline_s_l = ['z_mean_over_years_score_control',
                           'z_same_as_last_year_score_control']
    # These are RMS errors (not MSEs). .items() replaces the Python-2-only
    # .iteritems() and works on both major versions.
    cv_train_rmse_d = {key: value['cross_val_train_rms_error']
                       for (key, value) in all_results_d.items()}
    cv_test_rmse_d = {key: value['cross_val_test_rms_error']
                      for (key, value) in all_results_d.items()}
    model_s_l = sorted(all_results_d)
    with open(os.path.join(config.plot_path, 'RMSE_list.txt'), 'a') as f:
        f.write('\n\n{0}:\n'.format(primary_feature_s))
        for model_s in model_s_l:
            f.write('\n{0}:\n'.format(model_s))
            f.write('\tTrain RMSE:\n\t\t{:1.5f}\n'.format(cv_train_rmse_d[model_s]))
            f.write('\tTest RMSE:\n\t\t{:1.5f}\n'.format(cv_test_rmse_d[model_s]))
            # Test RMSE relative to each baseline control (< 1 beats it).
            for chosen_baseline_s in chosen_baseline_s_l:
                f.write('\t{0}: \n\t\t{1:1.5g}\n'.format(chosen_baseline_s,
                      cv_test_rmse_d[model_s]/cv_test_rmse_d[chosen_baseline_s]))

    ## Plot RMS errors of all regression models
    fig = plt.figure(figsize=(1.5*len(model_s_l), 12))
    ax = fig.add_axes([0.10, 0.40, 0.80, 0.50])

    value_s_l = ['train_rms_error', 'test_rms_error',
                 'cross_val_train_rms_error', 'cross_val_test_rms_error',
                 'three_year_train_rms_error', 'three_year_test_rms_error']
    value_color_l = ['r', 'y', 'g', 'c', 'b', 'm']
    bar_width = 0.12
    model_position_a = np.arange(len(model_s_l))

    # One bar series per error measure. The (i_value - 3) offset places the
    # six bars symmetrically around each tick only when x is the bar's left
    # edge, so the alignment is stated explicitly instead of relying on the
    # matplotlib default (which changed from 'edge' to 'center' in 2.0).
    for i_value, value_s in enumerate(value_s_l):
        value_l = [all_results_d[iter_model_s][value_s]
                   for iter_model_s in model_s_l]
        ax.bar(model_position_a + (i_value-3)*bar_width, value_l, bar_width,
               align='edge', color=value_color_l[i_value], label=value_s)

    # Formatting
    ax.set_title('Comparison of RMS error of autoregression algorithms vs. controls')
    ax.set_xticks(model_position_a)
    ax.set_xticklabels(model_s_l, rotation=90)
    ax.set_ylabel('Root mean squared error')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.50))
    # Grey reference lines at the cross-validated test error of the baselines
    for chosen_baseline_s in chosen_baseline_s_l:
        ax.axhline(y=cv_test_rmse_d[chosen_baseline_s], color=(0.5, 0.5, 0.5))
    ax.set_ylim([0, 1.5*cv_test_rmse_d['z_mean_over_years_score_control']])
    save_path = os.path.join(config.plot_path, 'create_predictions')
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    fig.savefig(os.path.join(save_path,
                             'rms_error_all_models__{0}.png'.format(primary_feature_s)))

    ## Save data to the SQL database
    if save_data:
        model_to_save_s = 'raw_lag1'
        new_column_s_l = ['ENTITY_CD'] + \
            ['{0}_prediction_{1:d}'.format(primary_feature_s, year)
             for year in config.prediction_year_l]
        # Re-attach the ENTITY_CD column that was stripped before modelling
        prediction_a = np.concatenate((index_a.reshape(-1, 1),
                                       all_results_d[model_to_save_s]['prediction_a']),
                                      axis=1)
        prediction_df = pd.DataFrame(prediction_a, columns=new_column_s_l)
        utilities.write_to_sql_table(prediction_df,
                                     '{0}_prediction'.format(primary_feature_s), 'joined')