# (as of 2016)
# See: https://en.wikipedia.org/wiki/Ballingarry,_North_Tipperary
# See: https://en.wikipedia.org/wiki/Ballingarry,_South_Tipperary
find_ballingarry = df['town'] == 'Ballingarry'
find_tipperary_north = df['pdf_path_county'] == 'tipperary-north'
df.loc[find_ballingarry & find_tipperary_north, 'town'] = 'Ballingarry (North)'
find_tipperary_south = df['pdf_path_county'] == 'tipperary-south'
df.loc[find_ballingarry & find_tipperary_south, 'town'] = 'Ballingarry (South)'

# > Step 3: Make a .csv file with a list of counties / towns for further post-processing of town names
choose_cols = ['county_l1', 'town']
output_df = df[choose_cols].drop_duplicates().sort_values(by=choose_cols)
write_df_to_csv(output_df, 'output/cleaner_towns_df.csv')

# > Step 4: Read the mapping of town names from a .csv file
towns_df = pd.read_csv('input/towns.csv')

# > Step 5: Merge the dataframes – apply the results of manual post-processing of town names
df = df.merge(towns_df, how='left', on=['county_l1', 'town'])

# Check & clean 'date' column
# > Step 1: Add a column 'date_month' with the month extracted from 'date'
df['date_month'] = df['date'].dt.month

# > Step 2: Check the min / max / pd.NaT dates by year
find_criteria_total = df['criteria'] == 'TOTAL MARK'
output_df = df[find_criteria_total].groupby('year').agg(
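# For reference, a minimal sketch of the write_df_to_csv helper called
# throughout these scripts. This is an assumed implementation (the actual
# helper is not shown in this excerpt) and may differ, e.g. in index handling.
import os
import pandas as pd

def write_df_to_csv(df: pd.DataFrame, file_path: str) -> None:
    """Write a dataframe to a .csv file, creating the output folder if needed."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    df.to_csv(file_path, index=False)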
            scenario = 0 if years_seq is None else 2
        else:
            # scenario = 1; criteria + 3 digit columns: total, previous, current
            scenario = 1
    else:
        # Try to find strings with criteria + 2 digit columns
        criteria_re = r'^\s*([^0-9\n]*?)\s{3,}(\d{1,3})\s{3,}(\d{1,3})$'
        criteria = re.findall(criteria_re, pdf_page, re.MULTILINE)
        # scenario = 3; criteria + 2 digit columns: total, current
        scenario = 0 if len(criteria) == 0 else 3
    return scenario, criteria


if __name__ == '__main__':
    # Generate a list of file paths to pdfs produced by the crawler
    file_paths = generate_file_paths('../crawler/output/pdfs')

    parser_marks_df = pd.DataFrame()
    # Iterate over the paths to get the information from pdfs
    for pdf_path in tqdm(file_paths):
        parser_marks = parse_pdf_to_marks(pdf_path)
        parser_marks['pdf_path'] = os.path.dirname(pdf_path)
        parser_marks['pdf_name'] = os.path.basename(pdf_path)
        parser_marks_df = parser_marks_df.append(parser_marks, ignore_index=True)

    # Write the 'dirty' dataset to a .csv file
    write_df_to_csv(parser_marks_df, 'output/parser_marks_df.csv')
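# A minimal sketch of the generate_file_paths helper used in the parser's
# __main__ block above (assumed implementation; the real helper is not shown
# here): walk the pdf directory and collect paths to .pdf files in a stable order.
import os

def generate_file_paths(root_dir: str) -> list:
    file_paths = []
    for dir_path, _dir_names, file_names in os.walk(root_dir):
        for file_name in sorted(file_names):
            if file_name.lower().endswith('.pdf'):
                file_paths.append(os.path.join(dir_path, file_name))
    return file_paths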
print('\nShow dataframe info after transformation\n')
print(print_pretty_table(show_extended_info(df_Xy)))

# Calculate and save the metrics for the transformed dataset
for inx in model_scores_df.index:
    r2_total = r2_score(df_Xy[2019], df_Xy['pred_' + inx])
    model_scores_df.at[inx, 'r2_total'] = r2_total
    rmse_total = mean_squared_error(df_Xy[2019], df_Xy['pred_' + inx])**(1 / 2)
    model_scores_df.at[inx, 'rmse_total'] = rmse_total

# Print the tuned parameters and metrics
print('\nPrint the tuned parameters and metrics\n')
print(print_pretty_table(model_scores_df.iloc[:, 0:5], '.4f'))
print(print_pretty_table(model_scores_df.iloc[:, 5:], '.4f'))

# Transform the dataframe with the found regression coefficients / feature importances
model_features_df = model_features_df.T
model_features_df.columns = X.columns.values
model_features_df = model_features_df.reset_index()

# Write the results to .csv files
write_df_to_csv(model_scores_df, 'output/' + predictor_name + '_model_scores.csv')
write_df_to_csv(model_features_df, 'output/' + predictor_name + '_model_features.csv')
write_df_to_csv(model_results_df, 'output/' + predictor_name + '_model_results.csv')
write_df_to_csv(df_Xy.sort_values(by=2019, ascending=False), 'output/' + predictor_name + '.csv')
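# A minimal sketch of the print_pretty_table helper used above, assuming it
# wraps the tabulate package (an assumption — the project's actual formatting
# helper may work differently). The second argument matches the '.4f' float
# format passed in the calls above.
from tabulate import tabulate

def print_pretty_table(df, float_fmt='g'):
    """Render a dataframe as a fixed-width text table with the given float format."""
    return tabulate(df, headers='keys', tablefmt='psql', floatfmt=float_fmt)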
# 2. Check year mismatches
# > Step 1: Add a column 'pdf_year' with the year extracted from 'pdf_name'
df['pdf_year'] = df['pdf_name'].apply(lambda x: int(x[0:4]))

# > Step 2: Create a dataframe with the filtered rows / cols
find_year_mismatch = df['pdf_year'] != df['year']
choose_cols = ['pdf_name', 'pdf_year', 'year', 'county', 'pdf_found', 'pdf_success']
output_df = df[find_year_mismatch][choose_cols].reset_index(drop=True)

# > Step 3: Print the results to a console
print('\n2. Check year mismatches\n')
print(print_pretty_table(output_df))

# > Step 4: Write a report on year mismatches to a .csv file
write_df_to_csv(output_df, 'output/crawler_pdfs_df_02_year_mismatch.csv')

# 3. Sort years by pdf_success
# > Step 1: Create a dataframe with the filtered rows / cols
choose_cols = ['year', 'pdf_found', 'pdf_success']
groupby_cols = ['year']
output_df = df[choose_cols].groupby(by=groupby_cols).sum().reset_index()
output_df = output_df.sort_values(by=['pdf_success'])

# > Step 2: Print the results to a console
print('\n3. Sort years by pdf_success\n')
print(print_pretty_table(output_df))

# 4. Show distribution of pdf_success per county by years
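# A tiny self-contained illustration of the year-mismatch check above, using
# made-up file names and values (hypothetical data, for illustration only):
import pandas as pd

toy_df = pd.DataFrame({
    'pdf_name': ['2018-clare-ennis.pdf', '2019-clare-ennis.pdf'],
    'year': [2018, 2018],  # suppose the second pdf states 2018 inside the report
})
toy_df['pdf_year'] = toy_df['pdf_name'].apply(lambda x: int(x[0:4]))
print(toy_df[toy_df['pdf_year'] != toy_df['year']])
# -> flags the 2019 pdf whose parsed 'year' disagrees with its filename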
    time.sleep(3)
    if response.status_code == 200:
        write_pdf(response.content, file_path)
    else:
        return 0
    return 1


if __name__ == '__main__':
    year_start = 1996
    year_end = 2019
    year_range = range(year_start, year_end + 1)

    # Read the list of counties (as used on tidytowns.ie) from a .txt file
    counties_list = read_txt_splitlines('input/counties.txt')

    crawler_pdfs_df = pd.DataFrame()
    # Iterate over years and counties to download the pdfs
    for year in year_range:
        for county in counties_list:
            crawler_pdfs = pd.DataFrame(
                get_pdfs_by_year_county(str(year), county))
            crawler_pdfs['year'] = year
            crawler_pdfs['county'] = county
            crawler_pdfs_df = crawler_pdfs_df.append(crawler_pdfs)

    # Write a report with the results of crawling to a .csv file
    write_df_to_csv(crawler_pdfs_df, 'output/crawler_pdfs_df.csv')
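# Minimal sketches of the small I/O helpers used by the crawler above
# (assumed implementations; the actual helpers are not shown in this excerpt):
import os

def read_txt_splitlines(file_path: str) -> list:
    """Read a .txt file and return its lines without trailing newlines."""
    with open(file_path, encoding='utf-8') as f:
        return f.read().splitlines()

def write_pdf(content: bytes, file_path: str) -> None:
    """Write raw response bytes to a .pdf file, creating the folder if needed."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(content)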