def extract(self, year):
    """Load one year's budget spreadsheet and store its totals in SQL.

    Reads ``<self.orig_table_s_d[year]>.xlsx`` from the ``budgets`` folder
    under ``config.data_path``, keeps the rows whose 'Aid Category' is
    'Sum of Above Aid Categories', keeps the 'BEDS Code' column and the
    column named by ``self.column_s[year]``, renames them to
    ``district_<year>`` and ``<self.new_table_s>_<year>``, indexes the
    result by the district column and passes it to
    ``utilities.write_to_sql_table``.

    Args:
        year: Year to extract. Must be an integer (it is formatted with
            ``:d``) and a key of ``self.orig_table_s_d`` and
            ``self.column_s``.
    """
    print('Creating {0} for year {1:d}'.format(self.new_table_s, year))

    workbook_path = os.path.join(config.data_path, 'budgets',
                                 self.orig_table_s_d[year] + '.xlsx')
    budget_df = pd.read_excel(workbook_path)

    # Only the summary rows (total over all aid categories) are wanted.
    is_total_row = budget_df['Aid Category'] == 'Sum of Above Aid Categories'
    total_row_df = budget_df[is_total_row]
    selected_df = total_row_df.loc[:, ['BEDS Code', self.column_s[year]]]

    # Year-specific column names; the district column becomes the index.
    district_column_s = 'district_{0}'.format(year)
    value_column_s = '{1}_{0:d}'.format(year, self.new_table_s)
    selected_df.columns = [district_column_s, value_column_s]
    indexed_df = selected_df.set_index(district_column_s)

    utilities.write_to_sql_table(indexed_df,
                                 'temp{0:d}_final'.format(year),
                                 'temp')
def extract(self, year):
    """Load one year's budget spreadsheet and store its totals in SQL.

    Reads ``<self.orig_table_s_d[year]>.xlsx`` from the ``budgets`` folder
    under ``config.data_path``, keeps the rows whose 'Aid Category' is
    'Sum of Above Aid Categories', keeps the 'BEDS Code' column and the
    column named by ``self.column_s[year]``, renames them to
    ``district_<year>`` and ``<self.new_table_s>_<year>``, indexes the
    result by the district column and passes it to
    ``utilities.write_to_sql_table``.

    Args:
        year: Year to extract. Must be an integer (it is formatted with
            ``:d``) and a key of ``self.orig_table_s_d`` and
            ``self.column_s``.
    """
    print('Creating {0} for year {1:d}'.format(self.new_table_s, year))

    workbook_path = os.path.join(config.data_path, 'budgets',
                                 self.orig_table_s_d[year] + '.xlsx')
    budget_df = pd.read_excel(workbook_path)

    # Only the summary rows (total over all aid categories) are wanted.
    is_total_row = budget_df['Aid Category'] == 'Sum of Above Aid Categories'
    total_row_df = budget_df[is_total_row]
    selected_df = total_row_df.loc[:, ['BEDS Code', self.column_s[year]]]

    # Year-specific column names; the district column becomes the index.
    district_column_s = 'district_{0}'.format(year)
    value_column_s = '{1}_{0:d}'.format(year, self.new_table_s)
    selected_df.columns = [district_column_s, value_column_s]
    indexed_df = selected_df.set_index(district_column_s)

    utilities.write_to_sql_table(indexed_df,
                                 'temp{0:d}_final'.format(year),
                                 'temp')
def predict_a_feature(input_data_a_d,
                      primary_feature_s,
                      aux_features=True,
                      save_data=False,
                      **kwargs):
    """Run regression models and controls on one school statistic.

    Wraps around fit_and_predict: fits AutoRegression with lags 1-4 plus
    three control models on the primary feature, appends the
    cross-validated RMS errors to ``RMSE_list.txt``, saves a bar chart
    comparing all error measures, and optionally writes the 'raw_lag1'
    predictions to the SQL database.

    Args:
        input_data_a_d: Dict mapping feature name to a 2-D array whose
            first column is the ENTITY_CD identifier. The dict itself is
            not modified (a shallow copy is used).
        primary_feature_s: Key of ``input_data_a_d`` naming the feature to
            predict.
        aux_features: If False, no auxiliary features are passed to the
            autoregression models.
        save_data: If True, write the 'raw_lag1' predictions to the table
            ``<primary_feature_s>_prediction``.
        **kwargs: Forwarded unchanged to every ``fit_and_predict`` call.

    Side effects:
        Appends to ``coeff_list.txt`` and ``RMSE_list.txt`` under
        ``config.plot_path`` and writes
        ``create_predictions/rms_error_all_models__<feature>.png`` there.
    """
    start_message_s = '\n\nStarting prediction for {0}.\n'.format(
        primary_feature_s)
    print(start_message_s)
    with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
        f.write(start_message_s)

    data_a_d = input_data_a_d.copy()
    index_a = data_a_d[primary_feature_s][:, 0]

    ## Drop the ENTITY_CD column
    # Iterate over a snapshot of the keys: works on Python 2 and 3
    # (dict.iterkeys() exists only on Python 2).
    for feature_s in list(data_a_d):
        data_a_d[feature_s] = data_a_d[feature_s][:, 1:]

    ## Split data
    main_data_a = data_a_d.pop(primary_feature_s)
    if aux_features:
        feature_s_l = sorted(data_a_d)
    else:
        data_a_d = {}
        feature_s_l = []

    ## Run regression models, validate and predict future scores, and run
    ## controls
    all_results_d = {}

    # Run autoregression with different lags on raw test scores
    for lag in range(1, 5):
        model_s = 'raw_lag{:d}'.format(lag)
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a,
                                                 AutoRegression,
                                                 primary_feature_s,
                                                 model_s,
                                                 aux_data_a_d=data_a_d,
                                                 diff=False,
                                                 feature_s_l=feature_s_l,
                                                 lag=lag,
                                                 **kwargs)

    # # Run autoregression with different lags on diff of test scores
    # # w.r.t. year
    # lag_l = range(1, 4)
    # for lag in lag_l:
    #     model_s = 'diff_lag{:d}'.format(lag)
    #     print(model_s + ':')
    #     all_results_d[model_s] = fit_and_predict(main_data_a, AutoRegression,
    #                                              primary_feature_s,
    #                                              model_s,
    #                                              aux_data_a_d=data_a_d,
    #                                              diff=True,
    #                                              feature_s_l=feature_s_l,
    #                                              lag=lag,
    #                                              **kwargs)

    # lag_l = range(1, 5)
    # for lag in lag_l:
    #     model_s = 'ind_raw_lag{:d}'.format(lag)
    #     print(model_s + ':')
    #     all_results_d[model_s] = fit_and_predict(main_data_a, IndependentAutoRegression,
    #                                              primary_feature_s,
    #                                              model_s,
    #                                              lag=lag,
    #                                              **kwargs)

    # Run controls, in this order:
    #   - prediction is same as mean over years in training set
    #   - prediction is same as previous year's data
    #   - presumably: prediction repeats the previous year's change
    #     (see SameChangeAsLastYear)
    # The 'z_' prefix makes the controls sort after the regression models.
    control_l = [
        ('z_mean_over_years_score_control', MeanOverYears),
        ('z_same_as_last_year_score_control', SameAsLastYear),
        ('z_same_change_as_last_year_score_control', SameChangeAsLastYear),
    ]
    for model_s, control_class in control_l:
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a,
                                                 control_class,
                                                 primary_feature_s,
                                                 model_s,
                                                 **kwargs)

    # Baselines every model's test error is compared against
    chosen_baseline_s_l = [
        'z_mean_over_years_score_control',
        'z_same_as_last_year_score_control'
    ]

    # .items() works on Python 2 and 3 (dict.iteritems() is Python 2 only)
    all_train_mses_d = {
        key: value['cross_val_train_rms_error']
        for (key, value) in all_results_d.items()
    }
    all_test_mses_d = {
        key: value['cross_val_test_rms_error']
        for (key, value) in all_results_d.items()
    }
    model_s_l = sorted(all_results_d)

    with open(os.path.join(config.plot_path, 'RMSE_list.txt'), 'a') as f:
        f.write('\n\n{0}:\n'.format(primary_feature_s))
        for model_s in model_s_l:
            f.write('\n{0}:\n'.format(model_s))
            f.write('\tTrain RMSE:\n\t\t{:1.5f}\n'.format(
                all_train_mses_d[model_s]))
            f.write('\tTest RMSE:\n\t\t{:1.5f}\n'.format(
                all_test_mses_d[model_s]))
            # Test error relative to each baseline's test error
            for chosen_baseline_s in chosen_baseline_s_l:
                f.write('\t{0}: \n\t\t{1:1.5g}\n'.format(
                    chosen_baseline_s,
                    all_test_mses_d[model_s] /
                    all_test_mses_d[chosen_baseline_s]))

    ## Plot MSEs of all regression models
    fig = plt.figure(figsize=(1.5 * len(model_s_l), 12))
    ax = fig.add_axes([0.10, 0.40, 0.80, 0.50])

    # One bar per error measure within each model's group
    value_s_l = [
        'train_rms_error', 'test_rms_error', 'cross_val_train_rms_error',
        'cross_val_test_rms_error', 'three_year_train_rms_error',
        'three_year_test_rms_error'
    ]
    value_color_l = ['r', 'y', 'g', 'c', 'b', 'm']
    bar_width = 0.12
    model_position_a = np.arange(len(model_s_l))

    # Plot bars: measure i is offset by (i - 3) bar widths from the tick
    for i_value, value_s in enumerate(value_s_l):
        bar_height_l = [
            all_results_d[iter_model_s][value_s] for iter_model_s in model_s_l
        ]
        ax.bar(model_position_a + (i_value - 3) * bar_width,
               bar_height_l,
               bar_width,
               color=value_color_l[i_value],
               label=value_s)

    # Formatting
    ax.set_title(
        'Comparison of RMS error of autoregression algorithms vs. controls')
    ax.set_xticks(model_position_a)
    ax.set_xticklabels(model_s_l, rotation=90)
    ax.set_ylabel('Root mean squared error')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.50))
    for chosen_baseline_s in chosen_baseline_s_l:
        ax.axhline(y=all_test_mses_d[chosen_baseline_s],
                   color=(0.5, 0.5, 0.5))
    ax.set_ylim([0, 1.5 * all_test_mses_d['z_mean_over_years_score_control']])

    save_path = os.path.join(config.plot_path, 'create_predictions')
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    fig.savefig(
        os.path.join(
            save_path,
            'rms_error_all_models__{0}.png'.format(primary_feature_s)))
    # pyplot keeps every figure alive until it is closed; release this one
    # so repeated calls (one per feature) do not accumulate open figures.
    plt.close(fig)

    ## Save data to the SQL database
    if save_data:
        model_to_save_s = 'raw_lag1'
        new_column_s_l = ['ENTITY_CD'] + [
            '{0}_prediction_{1:d}'.format(primary_feature_s, year)
            for year in config.prediction_year_l
        ]
        # Put the ENTITY_CD identifiers back in front of the predictions
        prediction_a = np.concatenate(
            (index_a.reshape(-1, 1),
             all_results_d[model_to_save_s]['prediction_a']),
            axis=1)
        prediction_df = pd.DataFrame(prediction_a, columns=new_column_s_l)
        utilities.write_to_sql_table(
            prediction_df, '{0}_prediction'.format(primary_feature_s),
            'joined')
def predict_a_feature(input_data_a_d,
                      primary_feature_s,
                      aux_features=True,
                      save_data=False,
                      **kwargs):
    """Run regression models and controls on one school statistic.

    Wraps around fit_and_predict: fits AutoRegression with lags 1-4 plus
    three control models on the primary feature, appends the
    cross-validated RMS errors to ``RMSE_list.txt``, saves a bar chart
    comparing all error measures, and optionally writes the 'raw_lag1'
    predictions to the SQL database.

    Args:
        input_data_a_d: Dict mapping feature name to a 2-D array whose
            first column is the ENTITY_CD identifier. The dict itself is
            not modified (a shallow copy is used).
        primary_feature_s: Key of ``input_data_a_d`` naming the feature to
            predict.
        aux_features: If False, no auxiliary features are passed to the
            autoregression models.
        save_data: If True, write the 'raw_lag1' predictions to the table
            ``<primary_feature_s>_prediction``.
        **kwargs: Forwarded unchanged to every ``fit_and_predict`` call.

    Side effects:
        Appends to ``coeff_list.txt`` and ``RMSE_list.txt`` under
        ``config.plot_path`` and writes
        ``create_predictions/rms_error_all_models__<feature>.png`` there.
    """
    start_message_s = '\n\nStarting prediction for {0}.\n'.format(
        primary_feature_s)
    print(start_message_s)
    with open(os.path.join(config.plot_path, 'coeff_list.txt'), 'a') as f:
        f.write(start_message_s)

    data_a_d = input_data_a_d.copy()
    index_a = data_a_d[primary_feature_s][:, 0]

    ## Drop the ENTITY_CD column
    # Iterate over a snapshot of the keys: works on Python 2 and 3
    # (dict.iterkeys() exists only on Python 2).
    for feature_s in list(data_a_d):
        data_a_d[feature_s] = data_a_d[feature_s][:, 1:]

    ## Split data
    main_data_a = data_a_d.pop(primary_feature_s)
    if aux_features:
        feature_s_l = sorted(data_a_d)
    else:
        data_a_d = {}
        feature_s_l = []

    ## Run regression models, validate and predict future scores, and run
    ## controls
    all_results_d = {}

    # Run autoregression with different lags on raw test scores
    for lag in range(1, 5):
        model_s = 'raw_lag{:d}'.format(lag)
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a,
                                                 AutoRegression,
                                                 primary_feature_s,
                                                 model_s,
                                                 aux_data_a_d=data_a_d,
                                                 diff=False,
                                                 feature_s_l=feature_s_l,
                                                 lag=lag,
                                                 **kwargs)

    # # Run autoregression with different lags on diff of test scores
    # # w.r.t. year
    # lag_l = range(1, 4)
    # for lag in lag_l:
    #     model_s = 'diff_lag{:d}'.format(lag)
    #     print(model_s + ':')
    #     all_results_d[model_s] = fit_and_predict(main_data_a, AutoRegression,
    #                                              primary_feature_s,
    #                                              model_s,
    #                                              aux_data_a_d=data_a_d,
    #                                              diff=True,
    #                                              feature_s_l=feature_s_l,
    #                                              lag=lag,
    #                                              **kwargs)

    # lag_l = range(1, 5)
    # for lag in lag_l:
    #     model_s = 'ind_raw_lag{:d}'.format(lag)
    #     print(model_s + ':')
    #     all_results_d[model_s] = fit_and_predict(main_data_a, IndependentAutoRegression,
    #                                              primary_feature_s,
    #                                              model_s,
    #                                              lag=lag,
    #                                              **kwargs)

    # Run controls, in this order:
    #   - prediction is same as mean over years in training set
    #   - prediction is same as previous year's data
    #   - presumably: prediction repeats the previous year's change
    #     (see SameChangeAsLastYear)
    # The 'z_' prefix makes the controls sort after the regression models.
    control_l = [
        ('z_mean_over_years_score_control', MeanOverYears),
        ('z_same_as_last_year_score_control', SameAsLastYear),
        ('z_same_change_as_last_year_score_control', SameChangeAsLastYear),
    ]
    for model_s, control_class in control_l:
        print(model_s + ':')
        all_results_d[model_s] = fit_and_predict(main_data_a,
                                                 control_class,
                                                 primary_feature_s,
                                                 model_s,
                                                 **kwargs)

    # Baselines every model's test error is compared against
    chosen_baseline_s_l = [
        'z_mean_over_years_score_control',
        'z_same_as_last_year_score_control'
    ]

    # .items() works on Python 2 and 3 (dict.iteritems() is Python 2 only)
    all_train_mses_d = {
        key: value['cross_val_train_rms_error']
        for (key, value) in all_results_d.items()
    }
    all_test_mses_d = {
        key: value['cross_val_test_rms_error']
        for (key, value) in all_results_d.items()
    }
    model_s_l = sorted(all_results_d)

    with open(os.path.join(config.plot_path, 'RMSE_list.txt'), 'a') as f:
        f.write('\n\n{0}:\n'.format(primary_feature_s))
        for model_s in model_s_l:
            f.write('\n{0}:\n'.format(model_s))
            f.write('\tTrain RMSE:\n\t\t{:1.5f}\n'.format(
                all_train_mses_d[model_s]))
            f.write('\tTest RMSE:\n\t\t{:1.5f}\n'.format(
                all_test_mses_d[model_s]))
            # Test error relative to each baseline's test error
            for chosen_baseline_s in chosen_baseline_s_l:
                f.write('\t{0}: \n\t\t{1:1.5g}\n'.format(
                    chosen_baseline_s,
                    all_test_mses_d[model_s] /
                    all_test_mses_d[chosen_baseline_s]))

    ## Plot MSEs of all regression models
    fig = plt.figure(figsize=(1.5 * len(model_s_l), 12))
    ax = fig.add_axes([0.10, 0.40, 0.80, 0.50])

    # One bar per error measure within each model's group
    value_s_l = [
        'train_rms_error', 'test_rms_error', 'cross_val_train_rms_error',
        'cross_val_test_rms_error', 'three_year_train_rms_error',
        'three_year_test_rms_error'
    ]
    value_color_l = ['r', 'y', 'g', 'c', 'b', 'm']
    bar_width = 0.12
    model_position_a = np.arange(len(model_s_l))

    # Plot bars: measure i is offset by (i - 3) bar widths from the tick
    for i_value, value_s in enumerate(value_s_l):
        bar_height_l = [
            all_results_d[iter_model_s][value_s] for iter_model_s in model_s_l
        ]
        ax.bar(model_position_a + (i_value - 3) * bar_width,
               bar_height_l,
               bar_width,
               color=value_color_l[i_value],
               label=value_s)

    # Formatting
    ax.set_title(
        'Comparison of RMS error of autoregression algorithms vs. controls')
    ax.set_xticks(model_position_a)
    ax.set_xticklabels(model_s_l, rotation=90)
    ax.set_ylabel('Root mean squared error')
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.50))
    for chosen_baseline_s in chosen_baseline_s_l:
        ax.axhline(y=all_test_mses_d[chosen_baseline_s],
                   color=(0.5, 0.5, 0.5))
    ax.set_ylim([0, 1.5 * all_test_mses_d['z_mean_over_years_score_control']])

    save_path = os.path.join(config.plot_path, 'create_predictions')
    if not os.path.isdir(save_path):
        os.mkdir(save_path)
    fig.savefig(
        os.path.join(
            save_path,
            'rms_error_all_models__{0}.png'.format(primary_feature_s)))
    # pyplot keeps every figure alive until it is closed; release this one
    # so repeated calls (one per feature) do not accumulate open figures.
    plt.close(fig)

    ## Save data to the SQL database
    if save_data:
        model_to_save_s = 'raw_lag1'
        new_column_s_l = ['ENTITY_CD'] + [
            '{0}_prediction_{1:d}'.format(primary_feature_s, year)
            for year in config.prediction_year_l
        ]
        # Put the ENTITY_CD identifiers back in front of the predictions
        prediction_a = np.concatenate(
            (index_a.reshape(-1, 1),
             all_results_d[model_to_save_s]['prediction_a']),
            axis=1)
        prediction_df = pd.DataFrame(prediction_a, columns=new_column_s_l)
        utilities.write_to_sql_table(
            prediction_df, '{0}_prediction'.format(primary_feature_s),
            'joined')