candidate = None while candidate != 'Write-in': candidate = self._process_candidate(candidate) if candidate in VALID_HEADERS: yield candidate candidate = None if candidate == 'Write-in': yield candidate def _process_candidate(self, candidate): s = next(self._string_iterator) if not candidate: return s return candidate + ' ' + s def pdf_to_csv(pdf, csv_writer): csv_writer.writeheader() for page in pdf: print(f'processing page {page.get_page_number()}') pdf_page_parser = LycomingPDFPageParser(page) if not pdf_page_parser.skip_page(): for row in pdf_page_parser: csv_writer.writerow(row) if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(LYCOMING_FILE), csv.DictWriter(f, OUTPUT_HEADER))
self._table_body_parser = BerksPDFTableBodyParser( self._string_iterator, candidates) yield from iter(self._table_body_parser) def _parse_header(self): header = [ next(self._string_iterator) for _ in range(len(BERKS_HEADER)) ] assert (header == BERKS_HEADER) def page_is_done(self): if self._string_iterator.peek().startswith(FIRST_FOOTER_SUBSTRING): return True return self._table_body_parser and self._table_body_parser.page_is_done( ) def pdf_to_csv(pdf, csv_writer): csv_writer.writeheader() for page in pdf: print(f'processing page {page.get_page_number()}') pdf_page_parser = BerksPDFPageParser(page) for row in pdf_page_parser: csv_writer.writerow(row._asdict()) if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(BERKS_FILE), csv.DictWriter(f, OUTPUT_HEADER))
_county = COUNTY _expected_table_headers = EXPECTED_TABLE_HEADERS _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if 'Del ' in row['office']: return False if 'Comm' in row['office']: return False return super()._should_be_recorded(row) class BeaverPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = BeaverPDFStringIterator _pdf_table_parser_clazz = BeaverPDFTableParser _header = BEAVER_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(BEAVER_FILE), csv.DictWriter(f, OUTPUT_HEADER), BeaverPDFPageParser)
row = self._generate_row(candidate_data, category_votes) office_is_invalid = max(invalid_office in row.office for invalid_office in INVALID_OFFICES) if not office_is_invalid: yield row def _generate_row(self, candidate_data, category_votes): row_data = [COUNTY, self._jurisdiction] + list(candidate_data) if candidate_data.office == 'Registered Voters': assert (min(category_votes) == max(category_votes)) row_data += [''] * len(VOTE_CATEGORIES) + [category_votes[0]] else: row_data += category_votes + [sum(category_votes)] return ParsedRow(*row_data) def _parse_category_cell(self, category): new_category = next(self._string_iterator).strip() assert (category == new_category) class BradfordPDFPageParser(PDFPageParser): _standard_header = BRADFORD_HEADER _table_header_parser = BradfordTableHeaderParser _table_body_parser = BradfordTableBodyParser if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(BRADFORD_FILE), csv.DictWriter(f, OUTPUT_HEADER), BradfordPDFPageParser)
] assert header == INDIANA_HEADER def _process_table_headers(self): table_header_parser = IndianaPDFTableHeaderParser( self._string_iterator, self._previous_table_header) self._table_headers = table_header_parser.get_table_headers() return table_header_parser.get_candidates() def process_table_body(self, candidates): self._table_body_parser = IndianaPDFTableBodyParser( self._string_iterator, candidates) yield from self._table_body_parser def pdf_to_csv(pdf, csv_writer): previous_table_header = None csv_writer.writeheader() for page in pdf: print(f'processing page {page.get_page_number()}') pdf_page_parser = IndianaPDFPageParser(page, previous_table_header) for row in pdf_page_parser: csv_writer.writerow(row) previous_table_header = pdf_page_parser.get_continued_table_header() if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(INDIANA_FILE), csv.DictWriter(f, OUTPUT_HEADER))
def pdfs_to_csv(csv_writer):
    """Write the CSV header, then append the rows of each party's PDF.

    One PDF path is built per entry of CUMBERLAND_PARTIES by formatting
    CUMBERLAND_FILE_FORMAT with the party.
    """
    csv_writer.writeheader()
    for party in CUMBERLAND_PARTIES:
        party_pdf_path = CUMBERLAND_FILE_FORMAT.format(party)
        party_pages = PDFPageIterator(party_pdf_path)
        append_pdf_to_csv(party_pages, csv_writer, party)
@classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if 'Delegate' in row['office']: return False if 'Committee' in row['office']: return False if 'Liquor' in row['office']: return False if 'Council' in row['office']: return False return super()._should_be_recorded(row) class ChesterPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = ChesterPDFStringIterator _pdf_table_parser_clazz = ChesterPDFTableParser _header = CHESTER_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(CHESTER_FILE), csv.DictWriter(f, OUTPUT_HEADER), ChesterPDFPageParser)
class CambriaPDFStringIterator(ElectionwarePDFStringIterator):
    """Electionware string iterator bound to this module's two footer substrings."""

    _first_footer_substring = FIRST_FOOTER_SUBSTRING
    _second_footer_substring = SECOND_FOOTER_SUBSTRING


class CambriaPDFTableParser(ElectionwarePDFTableParser):
    """Electionware table parser bound to this module's county constants."""

    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _clean_row(cls, row):
        """Run the parent clean-up, then title-case office and candidate in place."""
        super()._clean_row(row)
        for field in ('office', 'candidate'):
            row[field] = row[field].title()


class CambriaPDFPageParser(ElectionwarePDFPageParser):
    """Electionware page parser bound to the Cambria iterator, table parser and header."""

    _pdf_string_iterator_clazz = CambriaPDFStringIterator
    _pdf_table_parser_clazz = CambriaPDFTableParser
    _header = CAMBRIA_HEADER


if __name__ == "__main__":
    # Script entry point: parse CAMBRIA_FILE and write the rows to OUTPUT_FILE.
    with open(OUTPUT_FILE, 'w', newline='') as output_file:
        page_iterator = PDFPageIterator(CAMBRIA_FILE)
        dict_writer = csv.DictWriter(output_file, OUTPUT_HEADER)
        pdf_to_csv(page_iterator, dict_writer, CambriaPDFPageParser)
@classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if row['candidate'].startswith('Write-In: '): # there's already a Write-In Totals field; this prevents double counting return False if 'Delegate' in row['office']: return False if 'Comm' in row['office']: return False return super()._should_be_recorded(row) class ClearfieldPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = ClearfieldPDFStringIterator _pdf_table_parser_clazz = ClearfieldPDFTableParser _header = CLEARFIELD_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(CLEARFIELD_FILE), csv.DictWriter(f, OUTPUT_HEADER), ClearfieldPDFPageParser)
@staticmethod def _row_is_valid(row): if 'Delegate' in row['office']: return False if 'COMMITTEE' in row['office']: return False if row['candidate'] == LAST_ROW_CANDIDATE: return False return True def pdf_to_csv(pdf, csv_writer): csv_writer.writeheader() previous_table_header = None previous_precinct = None for page in pdf: print(f'processing page {page.get_page_number()}') pdf_page_parser = PerryPDFPageParser(page, previous_table_header, previous_precinct) for row in pdf_page_parser: csv_writer.writerow(row) previous_table_header = pdf_page_parser.get_continued_table_header() previous_precinct = pdf_page_parser.get_continued_precinct() if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(PERRY_FILE), csv.DictWriter(f, OUTPUT_HEADER))
@classmethod def _clean_row(cls, row): row['candidate'] = row['candidate'].replace('REPUBLICIAN', 'REPUBLICAN') super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if row['candidate'].startswith('Write-In: '): # there's already a Write-In Totals field; this prevents double counting return False if 'Del ' in row['office']: return False if 'Cmte' in row['office']: return False return super()._should_be_recorded(row) class ClintonPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = ClintonPDFStringIterator _pdf_table_parser_clazz = ClintonPDFTableParser _header = CLINTON_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(CLINTON_FILE), csv.DictWriter(f, OUTPUT_HEADER), ClintonPDFPageParser)
@classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() @classmethod def _should_be_recorded(cls, row): if row['candidate'].startswith('Write-In: '): # there's already a Write-In Totals field; this prevents double counting return False return super()._should_be_recorded(row) class TiogaPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = TiogaPDFStringIterator _pdf_table_parser_clazz = TiogaPDFTableParser _header = TIOGA_HEADER def __init__(self, page): super().__init__(page) if page.get_page_number() < FIRST_PER_PRECINCT_PAGE: # skip these pages; these are the summary pages strings = [FIRST_FOOTER_SUBSTRING] self._string_iterator = TiogaPDFStringIterator(strings) if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(TIOGA_FILE), csv.DictWriter(f, OUTPUT_HEADER), TiogaPDFPageParser)
def _populate_votes(self, row): super()._populate_votes(row) if '%' in self._string_iterator.peek(): next(self._string_iterator) # vote % string, not always supplied @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if 'Delegate' in row['office']: return False return super()._should_be_recorded(row) class CentrePDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = CentrePDFStringIterator _pdf_table_parser_clazz = CentrePDFTableParser _header = CENTRE_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(CENTRE_FILE), csv.DictWriter(f, OUTPUT_HEADER), CentrePDFPageParser)
_county = COUNTY _expected_table_headers = EXPECTED_TABLE_HEADERS _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].replace('Write-In: ', '').title() @classmethod def _should_be_recorded(cls, row): if not super()._should_be_recorded(row): return False if row['office'] == 'Wheatland Home Rule': return False return True class MercerPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = MercerPDFStringIterator _pdf_table_parser_clazz = MercerPDFTableParser _header = MERCER_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(MERCER_FILE), csv.DictWriter(f, OUTPUT_HEADER), MercerPDFPageParser)
_raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if row['candidate'].startswith('Write-In: '): # there's already a Write-In Totals field; this prevents double counting return False if 'Delegate' in row['office']: return False if 'Comm' in row['office']: return False return super()._should_be_recorded(row) class MifflinPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = MifflinPDFStringIterator _pdf_table_parser_clazz = MifflinPDFTableParser _header = MIFFLIN_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(MIFFLIN_FILE), csv.DictWriter(f, OUTPUT_HEADER), MifflinPDFPageParser)
next(self._string_iterator) @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if row['candidate'].startswith('Write-In: '): # there's already a Write-In Totals field; this prevents double counting return False if 'Delegate' in row['office']: return False if 'Comm' in row['office']: return False return super()._should_be_recorded(row) class BlairPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = BlairPDFStringIterator _pdf_table_parser_clazz = BlairPDFTableParser _header = BLAIR_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(BLAIR_FILE), csv.DictWriter(f, OUTPUT_HEADER), BlairPDFPageParser)
if self._office != 'STATISTICS': vote_percent_string = next(self._string_iterator) assert '%' in vote_percent_string @classmethod def _clean_row(cls, row): print(row) super()._clean_row(row) row['office'] = row['office'].title() row['candidate'] = row['candidate'].title() @classmethod def _should_be_recorded(cls, row): if not super()._should_be_recorded(row): return False if 'Committee' in row['office']: return False return True class LebanonPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = LebanonPDFStringIterator _pdf_table_parser_clazz = LebanonPDFTableParser _header = LEBANON_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(LEBANON_FILE), csv.DictWriter(f, OUTPUT_HEADER), LebanonPDFPageParser)
def _populate_votes(self, row): super()._populate_votes(row) if self._office != 'STATISTICS' and row not in BUGGY_ROWS: vote_percent_string = next(self._string_iterator) assert '%' in vote_percent_string @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() @classmethod def _should_be_recorded(cls, row): if not super()._should_be_recorded(row): return False if row['office'] == 'Borough Of Mahanoy City Mahanoy City': return False return True class SchuylkillPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = SchuylkillPDFStringIterator _pdf_table_parser_clazz = SchuylkillPDFTableParser _header = SCHUYLKILL_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(SCHUYLKILL_FILE), csv.DictWriter(f, OUTPUT_HEADER), SchuylkillPDFPageParser)
} class LackawannaPDFStringIterator(ElectionwarePDFStringIterator): _first_footer_substring = FIRST_FOOTER_SUBSTRING _second_footer_substring = SECOND_FOOTER_SUBSTRING class LackawannaPDFTableParser(ElectionwarePDFTableParser): _county = COUNTY _expected_table_headers = EXPECTED_TABLE_HEADERS _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT @classmethod def _clean_row(cls, row): super()._clean_row(row) row['office'] = row['office'].title() class LackawannaPDFPageParser(ElectionwarePDFPageParser): _pdf_string_iterator_clazz = LackawannaPDFStringIterator _pdf_table_parser_clazz = LackawannaPDFTableParser _header = LACKAWANNA_HEADER if __name__ == "__main__": with open(OUTPUT_FILE, 'w', newline='') as f: pdf_to_csv(PDFPageIterator(LACKAWANNA_FILE), csv.DictWriter(f, OUTPUT_HEADER), LackawannaPDFPageParser)
class NorthamptonPDFTableParser(ElectionwarePDFTableParser):
    """Electionware table parser bound to this module's county constants."""

    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _should_be_recorded(cls, row):
        """Reject rows the parent rejects, County Committee offices, and the library tax question."""
        if not super()._should_be_recorded(row):
            return False
        office = row['office']
        is_excluded = ('County Committee' in office
                       or office == 'Library Tax Question')
        return not is_excluded


class NorthamptonPDFPageParser(ElectionwarePDFPageParser):
    """Electionware page parser bound to the Northampton iterator, table parser and header."""

    _pdf_string_iterator_clazz = NorthamptonPDFStringIterator
    _pdf_table_parser_clazz = NorthamptonPDFTableParser
    _header = NORTHAMPTON_HEADER


if __name__ == "__main__":
    # Script entry point: parse NORTHAMPTON_FILE and write the rows to OUTPUT_FILE.
    with open(OUTPUT_FILE, 'w', newline='') as output_file:
        page_iterator = PDFPageIterator(NORTHAMPTON_FILE)
        dict_writer = csv.DictWriter(output_file, OUTPUT_HEADER)
        pdf_to_csv(page_iterator, dict_writer, NorthamptonPDFPageParser)
class AdamsPDFTableParser(ElectionwarePDFTableParser):
    """Electionware table parser bound to this module's constants, with extra 'VOTE %' handling."""

    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    def _verify_table_header(self):
        """Consume the leading 'VOTE %' header (non-STATISTICS tables), then run the parent check."""
        is_statistics_table = self._office == 'STATISTICS'
        if not is_statistics_table:
            percent_header = next(self._string_iterator)
            assert percent_header == 'VOTE %'
        super()._verify_table_header()

    def _populate_votes(self, row):
        """Populate votes via the parent, then consume the trailing percentage string.

        Nothing further is consumed for the 'STATISTICS' office or for the
        'Contest Totals' row.
        """
        super()._populate_votes(row)
        if self._office == 'STATISTICS' or row['candidate'] == 'Contest Totals':
            return
        percent_text = next(self._string_iterator)
        assert '%' in percent_text


class AdamsPDFPageParser(ElectionwarePDFPageParser):
    """Electionware page parser bound to the Adams iterator, table parser and header."""

    _pdf_string_iterator_clazz = AdamsPDFStringIterator
    _pdf_table_parser_clazz = AdamsPDFTableParser
    _header = ADAMS_HEADER


if __name__ == "__main__":
    # Script entry point: parse ADAMS_FILE and write the rows to OUTPUT_FILE.
    with open(OUTPUT_FILE, 'w', newline='') as output_file:
        page_iterator = PDFPageIterator(ADAMS_FILE)
        dict_writer = csv.DictWriter(output_file, OUTPUT_HEADER)
        pdf_to_csv(page_iterator, dict_writer, AdamsPDFPageParser)