def explore_06_heat_weeks_days(df_to_explore):
    """Explore adjudication counts per (week of year, week day) for 2019.

    Pivots the 'TOTAL MARK' rows of 2019 into a week-of-year x week-day
    grid of non-zero mark counts, prints the first rows, and writes a
    heatmap to a .png file.

    :param df_to_explore: tidy marks DataFrame with 'criteria_tidy',
        'year', 'date_week', 'date_dnn' and 'mark' columns.
    """
    # Create a dictionary for mapping week day numbers into week day names
    # (Monday == 0, matching pandas dt.weekday)
    wd_names = {0: 'Mo', 1: 'Tu', 2: 'We', 3: 'Th', 4: 'Fr', 5: 'Sa', 6: 'Su'}

    # Transform the dataset for exploration
    find_criteria_total = df_to_explore['criteria_tidy'] == 'TOTAL MARK'
    find_year_2019 = df_to_explore['year'] == 2019
    choose_rows = find_criteria_total & find_year_2019
    # fill_value=np.nan was a no-op (NaN is already the default for
    # missing pivot cells), so it has been dropped.
    output_df = df_to_explore[choose_rows].pivot_table(
        index=['date_week'],
        columns=['date_dnn'],
        values='mark',
        aggfunc=np.count_nonzero)
    output_df = output_df.rename(columns=wd_names)

    # View the first 5 rows after transformation
    print('\n6. View the first 5 rows after transformation\n')
    print(print_pretty_table(output_df.head()))

    # Make a plot, and write it to a .png file
    output_file_name = 'output/cleaner_marks_df_2014_06_heatmap_weeks_days.png'
    output_plot = {
        'title': 'Adjudicated towns by weeks / week days for 2019',
        'x_label': 'Week Day',
        'y_label': 'Week of Year'
    }
    heatmap_df_to_png(output_df, output_file_name, output_plot, (6.6, 3))
def explore_07_heat_criteria_categories(df_to_explore):
    """Explore mean marks per (criteria, category) for 2019.

    Pivots the non-'TOTAL MARK' rows of 2019 into a criteria x category
    grid of mean marks (missing cells filled with 0), prints the first
    rows, and writes a heatmap to a .png file.

    :param df_to_explore: tidy marks DataFrame with 'criteria_tidy',
        'year', 'category' and 'mark' columns.
    """
    # Transform the dataset for exploration
    find_criteria_total = df_to_explore['criteria_tidy'] == 'TOTAL MARK'
    find_year_2019 = df_to_explore['year'] == 2019
    choose_rows = ~find_criteria_total & find_year_2019
    # 'mean' instead of np.mean: pandas 2.x deprecates passing the numpy
    # callable here (FutureWarning); the computed result is identical.
    output_df = df_to_explore[choose_rows].pivot_table(
        index=['criteria_tidy'],
        columns=['category'],
        values='mark',
        aggfunc='mean',
        fill_value=0)

    # View the first 5 rows after transformation
    print('\n7. View the first 5 rows after transformation\n')
    print(print_pretty_table(output_df.head()))

    # Make a plot, and write it to a .png file
    output_file_name = 'output/cleaner_marks_df_2014_07_heat_criteria_categories.png'
    output_plot = {
        'title': 'Mean marks by criteria / categories / 2019',
        'x_label': 'Category',
        'y_label': 'Criteria'
    }
    heatmap_df_to_png(output_df, output_file_name, output_plot, (6.6, 3))
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

if __name__ == '__main__':
    # NOTE(review): print_pretty_table is called below but is not
    # imported in this chunk — presumably `from functions import
    # print_pretty_table` sits at the top of the full file; confirm.

    # Read the clean dataset from a .csv file
    df = pd.read_csv('../cleaner/output/cleaner_marks_df_2014.csv')

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # Transform the dataset for predicting
    # Model 1: Predict 'TOTAL MARK' using only 'TOTAL MARK' dynamics by
    # years for non-nan rows
    find_total_mark = df['criteria_tidy'] == 'TOTAL MARK'
    choose_cols = ['town_tidy', 'county_l1', 'criteria_tidy']
    # 'sum' instead of np.sum: pandas 2.x deprecates the numpy callable
    # (FutureWarning); fill_value=np.nan was a no-op (NaN is already the
    # default for missing pivot cells) and has been dropped.
    df = df[find_total_mark].pivot_table(index=choose_cols,
                                         columns=['year'],
                                         values='mark',
                                         aggfunc='sum').reset_index()
    df = df.dropna()

    # View the first 5 rows after transformation
    print('\nView the first 5 rows after transformation\n')
    print(print_pretty_table(df.head()))
from functions import write_df_to_csv, print_pretty_table, show_extended_info
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Read the 'dirty' dataset from a .csv file
    df = pd.read_csv('../parser/output/parser_marks_df.csv',
                     parse_dates=['date'],
                     dtype={'mark': np.float64,
                            'max_mark': np.float64,
                            'year': np.float64})

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # Clean 'category' column
    # > Step 1: Read the mapping of population categories to National
    #   Awards categories from a .csv file
    categories_df = pd.read_csv('input/categories.csv')
    # > Step 2: Merge the dataframes - enrich the dataset with a
    #   'category_tidy' column
    df = df.merge(categories_df, how='left', on=['category'])

    # Clean 'county' column: keep only the text before the first space
    df['county_l1'] = df['county'].apply(lambda c: c.partition(' ')[0])

    # Check & clean 'town' column
    # > Step 1: Add a column 'pdf_path_county' with a county extracted
    #   from 'pdf_path' (its last path component)
    df['pdf_path_county'] = df['pdf_path'].apply(lambda p: p.rsplit('/', 1)[-1])
    # > Step 2: Process 'Ballingarry' to differentiate Ballingarry
    #   (North Tipperary) -- a civil parish
from functions import write_df_to_csv, print_pretty_table, show_extended_info, boxplot_df_to_png
import pandas as pd
import numpy as np

if __name__ == '__main__':
    # Year span covered by the crawl
    year_start = 1996
    year_end = 2019
    year_range = range(year_start, year_end + 1)

    # Read the report with the results of crawling from a .csv file
    df = pd.read_csv('output/crawler_pdfs_df.csv')

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # 1. Check crawler stats
    # > Step 1: Check the number of pdfs crawled
    output_pt = [['dataframe', 'shape'],
                 ['pdfs crawled', df.shape]]
    # > Step 2: Check the number of pdfs downloaded / available for download
    find_pdf_success = df['pdf_success'] == 1
    output_df = df[find_pdf_success]
    output_pt.append(['pdfs downloaded', output_df.shape])
    # > Step 3: Check the number of pdfs to parse
# NOTE(review): pd, np and print_pretty_table are used below but were
# not imported in this chunk. numpy/pandas are imported here to be safe
# (harmless if they are also imported above this chunk); confirm that
# `from functions import print_pretty_table` exists at the file top.
import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

if __name__ == '__main__':
    SEED = 42
    predictor_name = 'predictor_02'

    # Read the clean dataset from a .csv file
    df = pd.read_csv('../cleaner/output/cleaner_marks_df_2014.csv')

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # Approach 2: Predict non-'TOTAL MARK' using non-'TOTAL MARK'
    # dynamics by years
    # Transform the dataset for predicting
    find_criteria_total = df['criteria_tidy'] == 'TOTAL MARK'
    choose_rows = ~find_criteria_total
    choose_cols = ['category_tidy', 'county_l1', 'town_tidy', 'criteria_tidy']
    # 'sum' instead of np.sum: pandas 2.x deprecates the numpy callable
    # (FutureWarning); fill_value=np.nan was a no-op (NaN is already the
    # default for missing pivot cells) and has been dropped.
    df_Xy = df[choose_rows].pivot_table(index=choose_cols,
                                        columns=['year'],
                                        values='mark',
                                        aggfunc='sum')
    df_Xy = df_Xy.reset_index().dropna()

    # Show dataframe info after transformation
import pandas as pd
import numpy as np

# NOTE(review): print_pretty_table, show_extended_info and
# print_pretty_list are used below but not imported in this chunk —
# presumably a `from functions import ...` line sits elsewhere; confirm.

if __name__ == '__main__':
    # Read the 'dirty' dataset from a .csv file
    df = pd.read_csv('output/parser_marks_df.csv',
                     parse_dates=['date'],
                     dtype={'mark': np.float64,
                            'max_mark': np.float64,
                            'year': np.float64})

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # View the first 5 rows (two column slices keep the tables narrow)
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.iloc[:, 0:8].head()))
    print(print_pretty_table(df.iloc[:, 8:].head()))

    # Check unique values in 'category', 'county', 'criteria'
    choose_cols = ['category', 'county', 'criteria']
    output_pt = [['column', 'unique values']]
    for col in choose_cols:
        uniques = sorted(df[col].unique())
        output_pt.append([col, print_pretty_list(uniques)])
# NOTE(review): the call below is the tail of a plotting helper whose
# `def` line lies outside this chunk — output_df, output_file_name and
# output_plot are defined in that missing body; confirm its placement
# against the full file before reindenting.
heatmap_df_to_png(output_df, output_file_name, output_plot, (6.6, 3))

if __name__ == '__main__':
    # Read the clean dataset from a .csv file
    df = pd.read_csv('../cleaner/output/cleaner_marks_df_2014.csv',
                     parse_dates=['date'],
                     dtype={
                         'mark': np.float64,
                         'max_mark': np.float64,
                         'year': np.float64
                     })

    # Show dataframe info
    print('\nShow dataframe info\n')
    print(print_pretty_table(show_extended_info(df)))

    # View the first 5 rows
    print('\nView the first 5 rows\n')
    print(print_pretty_table(df.head()))

    # 1. Explore 'TOTAL MARK' distribution dynamics by years
    explore_01_box_years_marks(df)

    # 2. Explore 'TOTAL MARK' distribution for 2019 by counties
    explore_02_box_counties_marks(df)
    explore_02_bar_counties_count(df)

    # Add a column 'date_dnn' with a week day number extracted from
    # 'date' (Monday == 0 per pandas dt.weekday)
    df['date_dnn'] = df['date'].dt.weekday