Example no. 1
    # (as of 2016)
    # See: https://en.wikipedia.org/wiki/Ballingarry,_North_Tipperary
    # See: https://en.wikipedia.org/wiki/Ballingarry,_South_Tipperary
    find_ballingarry = df['town'] == 'Ballingarry'

    find_tipperary_north = df['pdf_path_county'] == 'tipperary-north'
    df.loc[find_ballingarry & find_tipperary_north, 'town'] = 'Ballingarry (North)'

    find_tipperary_south = df['pdf_path_county'] == 'tipperary-south'
    df.loc[find_ballingarry & find_tipperary_south, 'town'] = 'Ballingarry (South)'

    # > Step 3: Write a .csv file with the list of counties / towns for manual post-processing of town names
    choose_cols = ['county_l1', 'town']
    output_df = df[choose_cols].drop_duplicates().sort_values(by=choose_cols)

    write_df_to_csv(output_df, 'output/cleaner_towns_df.csv')

    # > Step 4: Read the mapping of town names from a .csv file
    towns_df = pd.read_csv('input/towns.csv')

    # > Step 5: Merge the dataframes – apply the results of manual post-processing of town names
    df = df.merge(towns_df, how='left', on=['county_l1', 'town'])
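    # Note on the left merge: rows whose (county_l1, town) pair has no match in
    # towns.csv keep NaN in the newly merged columns, so towns that still lack a
    # manual mapping remain easy to spot. (Which columns are added depends on
    # what towns.csv contains.)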

    # Check & clean 'date' column

    # > Step 1: Add a column 'date_month' with a month extracted from 'date'
    df['date_month'] = df['date'].dt.month

    # > Step 2: Check the min / max / pd.NaT dates by years
    find_criteria_total = df['criteria'] == 'TOTAL MARK'
    output_df = df[find_criteria_total].groupby('year').agg(
Example no. 2
            scenario = 0 if years_seq is None else 2
        else:
            # scenario = 1; criteria + three numeric columns: total, previous, current
            scenario = 1
    else:
        # Try to find lines with a criterion name followed by two numeric columns
        criteria_re = r'^\s*([^0-9\n]*?)\s{3,}(\d{1,3})\s{3,}(\d{1,3})$'
        criteria = re.findall(criteria_re, pdf_page, re.MULTILINE)
        # scenario = 3; criteria + two numeric columns: total, current
        scenario = 0 if len(criteria) == 0 else 3

    return scenario, criteria
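
# A minimal illustration of the two-column pattern above, run on a made-up page
# fragment (criterion names and marks are hypothetical, not from a real report):
#
#   >>> page = 'Litter Control              66     61\nTidiness              58     55'
#   >>> re.findall(r'^\s*([^0-9\n]*?)\s{3,}(\d{1,3})\s{3,}(\d{1,3})$', page, re.MULTILINE)
#   [('Litter Control', '66', '61'), ('Tidiness', '58', '55')]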


if __name__ == '__main__':
    # Generate a list of file paths to pdfs produced by the crawler
    file_paths = generate_file_paths('../crawler/output/pdfs')

    parser_marks_df = pd.DataFrame()

    # Iterate over the paths to get the information from pdfs
    for pdf_path in tqdm(file_paths):
        parser_marks = parse_pdf_to_marks(pdf_path)
        parser_marks['pdf_path'] = os.path.dirname(pdf_path)
        parser_marks['pdf_name'] = os.path.basename(pdf_path)
        # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
        # row-wise append (parse_pdf_to_marks is assumed to return a dataframe)
        parser_marks_df = pd.concat([parser_marks_df, parser_marks],
                                    ignore_index=True)

    # Write the 'dirty' dataset to a .csv file
    write_df_to_csv(parser_marks_df, 'output/parser_marks_df.csv')
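
    # For a large number of PDFs, growing the dataframe inside the loop gets slow,
    # because each concat copies every row accumulated so far. A sketch of the usual
    # collect-then-concat alternative (assuming parse_pdf_to_marks returns a dataframe):
    #
    #   frames = []
    #   for pdf_path in tqdm(file_paths):
    #       frame = parse_pdf_to_marks(pdf_path)
    #       frame['pdf_path'] = os.path.dirname(pdf_path)
    #       frame['pdf_name'] = os.path.basename(pdf_path)
    #       frames.append(frame)
    #   parser_marks_df = pd.concat(frames, ignore_index=True)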
Example no. 3
    print('\nShow dataframe info after transformation\n')
    print(print_pretty_table(show_extended_info(df_Xy)))

    # Calculate and save the metrics for the transformed dataset
    for inx in model_scores_df.index:
        r2_total = r2_score(df_Xy[2019], df_Xy['pred_' + inx])
        model_scores_df.at[inx, 'r2_total'] = r2_total
        rmse_total = mean_squared_error(df_Xy[2019],
                                        df_Xy['pred_' + inx])**(1 / 2)
        model_scores_df.at[inx, 'rmse_total'] = rmse_total
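        # Note: scikit-learn >= 0.22 can also return the RMSE directly via
        # mean_squared_error(..., squared=False); the explicit square root above
        # keeps the code independent of that parameter.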

    # Print the tuned parameters and metrics
    print('\nPrint the tuned parameters and metrics\n')
    print(print_pretty_table(model_scores_df.iloc[:, 0:5], '.4f'))
    print(print_pretty_table(model_scores_df.iloc[:, 5:], '.4f'))

    # Reshape the dataframe of regression coefficients / feature importances found by the models
    model_features_df = model_features_df.T
    model_features_df.columns = X.columns.values
    model_features_df = model_features_df.reset_index()

    # Write the results to .csv files
    write_df_to_csv(model_scores_df,
                    'output/' + predictor_name + '_model_scores.csv')
    write_df_to_csv(model_features_df,
                    'output/' + predictor_name + '_model_features.csv')
    write_df_to_csv(model_results_df,
                    'output/' + predictor_name + '_model_results.csv')
    write_df_to_csv(df_Xy.sort_values(by=2019, ascending=False),
                    'output/' + predictor_name + '.csv')
Example no. 4
    # 2. Check year mismatches

    # > Step 1: Add a column 'pdf_year' with a year extracted from 'pdf_name'
    df['pdf_year'] = df['pdf_name'].apply(lambda x: int(x[0:4]))
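    # e.g. a hypothetical file name '2018_carlow_report.pdf' -> '2018' -> 2018
    # (this assumes every pdf_name starts with a four-digit year)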

    # > Step 2: Create a dataframe with the filtered rows / cols
    find_year_mismatch = df['pdf_year'] != df['year']
    choose_cols = ['pdf_name', 'pdf_year', 'year', 'county', 'pdf_found', 'pdf_success']
    output_df = df[find_year_mismatch][choose_cols].reset_index(drop=True)

    # > Step 3: Print the results to the console
    print('\n2. Check year mismatches\n')
    print(print_pretty_table(output_df))

    # > Step 4: Write a report on year mismatches to a .csv file
    write_df_to_csv(output_df, 'output/crawler_pdfs_df_02_year_mismatch.csv')

    # 3. Sort years by pdf_success

    # > Step 1: Create a dataframe with the filtered rows / cols
    choose_cols = ['year', 'pdf_found', 'pdf_success']
    groupby_cols = ['year']
    output_df = df[choose_cols].groupby(by=groupby_cols).sum().reset_index()
    output_df = output_df.sort_values(by=['pdf_success'])

    # > Step 2: Print the results to the console
    print('\n3. Sort years by pdf_success\n')
    print(print_pretty_table(output_df))

    # 4. Show distribution of pdf_success per county by years
Example no. 5
        time.sleep(3)
        if response.status_code == 200:
            write_pdf(response.content, file_path)
        else:
            return 0

    return 1


if __name__ == '__main__':
    year_start = 1996
    year_end = 2019
    year_range = range(year_start, year_end + 1)
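    # i.e. range(1996, 2020): the + 1 makes the range cover 1996 through 2019 inclusive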

    # Read the list of counties (as used on tidytowns.ie) from a .txt file
    counties_list = read_txt_splitlines('input/counties.txt')

    crawler_pdfs_df = pd.DataFrame()

    # Iterate over years and counties to download the pdfs
    for year in year_range:
        for county in counties_list:
            crawler_pdfs = pd.DataFrame(
                get_pdfs_by_year_county(str(year), county))
            crawler_pdfs['year'] = year
            crawler_pdfs['county'] = county
            # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same behaviour
            crawler_pdfs_df = pd.concat([crawler_pdfs_df, crawler_pdfs])

    # Write a report with the results of crawling to a .csv file
    write_df_to_csv(crawler_pdfs_df, 'output/crawler_pdfs_df.csv')