コード例 #1
0
def explore_01_box_years_marks(df_to_explore):
    """Box-plot the 'TOTAL MARK' marks grouped by year.

    Selects the rows of ``df_to_explore`` whose ``criteria_tidy`` column
    equals 'TOTAL MARK', casts the ``year`` column to int32, and hands the
    result to ``boxplot_df_to_png`` together with the target .png path,
    the plot settings and the figure size.
    """
    # Keep the 'TOTAL MARK' rows only, with 'year' stored as int32
    is_total_mark = df_to_explore['criteria_tidy'] == 'TOTAL MARK'
    total_marks_df = df_to_explore[is_total_mark]
    total_marks_df = total_marks_df.astype({'year': 'int32'})

    # Describe the plot: which columns to use and how to label them
    png_path = 'output/cleaner_marks_df_2014_01_box_years_marks.png'
    plot_settings = dict(
        df_x='year',
        df_y='mark',
        title='\'TOTAL MARK\' by years',
        x_label='Year',
        y_label='Marks',
    )
    figure_size = (3.2, 2)

    boxplot_df_to_png(total_marks_df, png_path, plot_settings, figure_size)
コード例 #2
0
def explore_03_box_days_marks(df_to_explore):
    """Box-plot the 2019 'TOTAL MARK' marks grouped by week day.

    Selects the rows of ``df_to_explore`` where ``criteria_tidy`` is
    'TOTAL MARK' and ``year`` is 2019, orders them by ``date_dnn``, and
    hands the result to ``boxplot_df_to_png`` together with the target
    .png path, the plot settings and the figure size.
    """
    # Keep the 2019 'TOTAL MARK' rows only
    is_total_mark = df_to_explore['criteria_tidy'] == 'TOTAL MARK'
    is_2019 = df_to_explore['year'] == 2019
    marks_2019_df = df_to_explore[is_total_mark & is_2019]

    # NOTE(review): rows are ordered by 'date_dnn' while the x axis uses
    # 'date_dn' -- presumably a sortable companion of the day name; confirm.
    marks_2019_df = marks_2019_df.sort_values(by=['date_dnn'])

    # Describe the plot: which columns to use and how to label them
    png_path = 'output/cleaner_marks_df_2014_03_box_days_marks.png'
    plot_settings = dict(
        df_x='date_dn',
        df_y='mark',
        title='\'TOTAL MARK\' by week days for 2019',
        x_label='Adjudication Week Day',
        y_label='Marks',
    )
    figure_size = (3.2, 2)

    boxplot_df_to_png(marks_2019_df, png_path, plot_settings, figure_size)
コード例 #3
0
def explore_02_box_counties_marks(df_to_explore):
    """Box-plot the 2019 'TOTAL MARK' marks grouped by county.

    Selects the rows of ``df_to_explore`` where ``criteria_tidy`` is
    'TOTAL MARK' and ``year`` is 2019, orders them by ``county_l1``, and
    hands the result to ``boxplot_df_to_png`` together with the target
    .png path, the plot settings and the figure size.
    """
    # Keep the 2019 'TOTAL MARK' rows only, ordered by county
    is_total_mark = df_to_explore['criteria_tidy'] == 'TOTAL MARK'
    is_2019 = df_to_explore['year'] == 2019
    marks_2019_df = df_to_explore[is_total_mark & is_2019]
    marks_2019_df = marks_2019_df.sort_values(by=['county_l1'])

    # Describe the plot: which columns to use and how to label them
    png_path = 'output/cleaner_marks_df_2014_02_box_counties_marks.png'
    plot_settings = dict(
        df_x='county_l1',
        df_y='mark',
        title='\'TOTAL MARK\' by counties for 2019',
        x_label='County',
        y_label='Marks',
    )
    figure_size = (6.6, 4)

    boxplot_df_to_png(marks_2019_df, png_path, plot_settings, figure_size)
コード例 #4
0
    print('\n4. Sort years by pdf_success\n')
    print(print_pretty_table(output_df))

    # 4. Show distribution of pdf_success per county by years

    # > Step 1: Create a dataframe with the filtered rows / cols
    choose_cols = ['year', 'county', 'pdf_found', 'pdf_success']
    groupby_cols = ['year', 'county']
    output_df = df[choose_cols].groupby(by=groupby_cols).sum().reset_index()

    # > Step 2: Make a plot to show the distribution, and write it to a .png file
    output_file_name = 'output/crawler_pdfs_df_03_box_years.png'
    output_plot = {'df_x': 'year', 'df_y': 'pdf_success',
                   'title': 'Distribution of downloaded pdfs per county by years',
                   'x_label': 'Year', 'y_label': 'Downloaded pdfs per county'}
    boxplot_df_to_png(output_df, output_file_name, output_plot, (6.7, 3))

    # 5. Pivot by county vs year, and show the outliers

    # > Step 1: Create a dataframe using table pivoting
    output_df = df.pivot_table(index=['county'], columns=['year'], values='pdf_success',
                               aggfunc=np.sum, fill_value=0, margins=True, margins_name='(total)')

    # > Step 2: Add columns with 0 values for the years without any pdfs found
    for year in year_range:
        if year not in output_df:
            output_df[year] = 0

    # > Step 3: Sort the columns
    output_df = output_df.rename(columns={'(total)': -1}).sort_index(axis=1)
    output_df = output_df.rename(columns={-1: '(total)'})