def explore_01_box_years_marks(df_to_explore):
    """Write a box plot of the 'TOTAL MARK' rows of ``df_to_explore`` by year.

    Only the rows whose ``criteria_tidy`` equals ``'TOTAL MARK'`` are kept,
    and their ``year`` column is cast to ``int32`` before plotting.
    The plotting and the .png output are delegated to ``boxplot_df_to_png``.

    Args:
        df_to_explore: DataFrame that has at least the ``criteria_tidy``,
            ``year`` and ``mark`` columns.
    """
    # Select the total-mark rows, then make the year an int32 for the x axis
    is_total_mark = df_to_explore['criteria_tidy'].eq('TOTAL MARK')
    total_marks_df = df_to_explore.loc[is_total_mark]
    total_marks_df = total_marks_df.astype({'year': 'int32'})

    # Describe the plot: which columns to use, and how to label them
    plot_spec = dict(
        df_x='year',
        df_y='mark',
        title='\'TOTAL MARK\' by years',
        x_label='Year',
        y_label='Marks',
    )
    png_path = 'output/cleaner_marks_df_2014_01_box_years_marks.png'

    # The last argument is presumably the figure size -- confirm in the helper
    boxplot_df_to_png(total_marks_df, png_path, plot_spec, (3.2, 2))
def explore_03_box_days_marks(df_to_explore):
    """Write a box plot of the 2019 'TOTAL MARK' rows by adjudication week day.

    Only the rows whose ``criteria_tidy`` equals ``'TOTAL MARK'`` and whose
    ``year`` equals 2019 are kept; they are sorted by ``date_dnn`` and then
    plotted with ``date_dn`` on the x axis. The plotting and the .png output
    are delegated to ``boxplot_df_to_png``.

    Args:
        df_to_explore: DataFrame that has at least the ``criteria_tidy``,
            ``year``, ``date_dnn``, ``date_dn`` and ``mark`` columns.
    """
    # Select the total-mark rows for 2019
    is_total_mark = df_to_explore['criteria_tidy'].eq('TOTAL MARK')
    is_year_2019 = df_to_explore['year'].eq(2019)
    marks_2019_df = df_to_explore.loc[is_total_mark & is_year_2019]

    # NOTE(review): rows are ordered by 'date_dnn' but plotted by 'date_dn';
    # presumably 'date_dnn' is a sortable form of the week day -- confirm.
    marks_2019_df = marks_2019_df.sort_values(by=['date_dnn'])

    # Describe the plot: which columns to use, and how to label them
    plot_spec = dict(
        df_x='date_dn',
        df_y='mark',
        title='\'TOTAL MARK\' by week days for 2019',
        x_label='Adjudication Week Day',
        y_label='Marks',
    )
    png_path = 'output/cleaner_marks_df_2014_03_box_days_marks.png'

    # The last argument is presumably the figure size -- confirm in the helper
    boxplot_df_to_png(marks_2019_df, png_path, plot_spec, (3.2, 2))
def explore_02_box_counties_marks(df_to_explore):
    """Write a box plot of the 2019 'TOTAL MARK' rows by county.

    Only the rows whose ``criteria_tidy`` equals ``'TOTAL MARK'`` and whose
    ``year`` equals 2019 are kept; they are sorted by ``county_l1``, which is
    also the column used for the x axis. The plotting and the .png output are
    delegated to ``boxplot_df_to_png``.

    Args:
        df_to_explore: DataFrame that has at least the ``criteria_tidy``,
            ``year``, ``county_l1`` and ``mark`` columns.
    """
    # Select the total-mark rows for 2019, ordered by county
    is_total_mark = df_to_explore['criteria_tidy'].eq('TOTAL MARK')
    is_year_2019 = df_to_explore['year'].eq(2019)
    marks_2019_df = df_to_explore.loc[is_total_mark & is_year_2019]
    marks_2019_df = marks_2019_df.sort_values(by=['county_l1'])

    # Describe the plot: which columns to use, and how to label them
    plot_spec = dict(
        df_x='county_l1',
        df_y='mark',
        title='\'TOTAL MARK\' by counties for 2019',
        x_label='County',
        y_label='Marks',
    )
    png_path = 'output/cleaner_marks_df_2014_02_box_counties_marks.png'

    # The last argument is presumably the figure size -- confirm in the helper
    boxplot_df_to_png(marks_2019_df, png_path, plot_spec, (6.6, 4))
# Print the table built by the preceding step (its code is outside this part)
print('\n4. Sort years by pdf_success\n')
print(print_pretty_table(output_df))

# 4. Show distribution of pdf_success per county by years
# > Step 1: Create a dataframe with the filtered rows / cols
#   (one row per year / county pair, with the summed pdf counters)
choose_cols = ['year', 'county', 'pdf_found', 'pdf_success']
groupby_cols = ['year', 'county']
output_df = df[choose_cols].groupby(by=groupby_cols).sum().reset_index()

# > Step 2: Make a plot to show the distribution, and write it to a .png file
output_file_name = 'output/crawler_pdfs_df_03_box_years.png'
output_plot = {
    'df_x': 'year',
    'df_y': 'pdf_success',
    'title': 'Distribution of downloaded pdfs per county by years',
    'x_label': 'Year',
    'y_label': 'Downloaded pdfs per county',
}
boxplot_df_to_png(output_df, output_file_name, output_plot, (6.7, 3))

# 5. Pivot by county vs year, and show the outliers
# > Step 1: Create a dataframe using table pivoting
#   The aggregation is named by the string 'sum' rather than by passing
#   np.sum: the result is the same, but recent pandas versions emit a
#   FutureWarning when a numpy callable is passed as aggfunc.
output_df = df.pivot_table(
    index=['county'],
    columns=['year'],
    values='pdf_success',
    aggfunc='sum',
    fill_value=0,
    margins=True,
    margins_name='(total)',
)

# > Step 2: Add columns with 0 values for the years without any pdfs found
for year in year_range:
    # `in` on a DataFrame tests the column labels
    if year not in output_df:
        output_df[year] = 0

# > Step 3: Sort the columns
#   The '(total)' label is swapped for -1 while sorting, so that every column
#   label is a number during the sort; it is restored right after.
output_df = output_df.rename(columns={'(total)': -1}).sort_index(axis=1)
output_df = output_df.rename(columns={-1: '(total)'})