Python PDFPageIterator Beispiele, parsers.pa_pdf_parser.PDFPageIterator Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: pa_lycoming_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

        candidate = None
        while candidate != 'Write-in':
            candidate = self._process_candidate(candidate)
            if candidate in VALID_HEADERS:
                yield candidate
                candidate = None
            if candidate == 'Write-in':
                yield candidate

    def _process_candidate(self, candidate):
        s = next(self._string_iterator)
        if not candidate:
            return s
        return candidate + ' ' + s


def pdf_to_csv(pdf, csv_writer):
    """Write the CSV header, then every row parsed from each usable page.

    Args:
        pdf: Iterable of page objects exposing ``get_page_number()``.
        csv_writer: Writer exposing ``writeheader()`` and ``writerow(row)``.

    A progress line is printed for every page, including skipped ones.
    """
    csv_writer.writeheader()
    for page in pdf:
        print(f'processing page {page.get_page_number()}')
        page_parser = LycomingPDFPageParser(page)
        if page_parser.skip_page():
            # The parser itself decides which pages carry no results.
            continue
        for record in page_parser:
            csv_writer.writerow(record)


if __name__ == "__main__":
    # Script entry point: convert the Lycoming results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(LYCOMING_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer)

Beispiel #2

0

Datei anzeigen

Datei: pa_berks_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

            self._table_body_parser = BerksPDFTableBodyParser(
                self._string_iterator, candidates)
            yield from iter(self._table_body_parser)

    def _parse_header(self):
        """Consume one string per BERKS_HEADER entry and check they match.

        Raises:
            AssertionError: if the consumed strings differ from BERKS_HEADER.
        """
        consumed = [next(self._string_iterator) for _ in BERKS_HEADER]
        assert consumed == BERKS_HEADER

    def page_is_done(self):
        """Tell whether the current page has no more table content.

        Returns True when the next string starts with FIRST_FOOTER_SUBSTRING.
        Otherwise the answer is delegated to the table body parser; if that
        parser is falsy (e.g. not created yet) the falsy value itself is
        returned.
        """
        upcoming = self._string_iterator.peek()
        if upcoming.startswith(FIRST_FOOTER_SUBSTRING):
            return True
        body_parser = self._table_body_parser
        if not body_parser:
            return body_parser
        return body_parser.page_is_done()


def pdf_to_csv(pdf, csv_writer):
    """Write the CSV header, then one CSV row per row parsed from each page.

    Args:
        pdf: Iterable of page objects exposing ``get_page_number()``.
        csv_writer: Writer exposing ``writeheader()`` and ``writerow(dict)``.

    Parsed rows are converted with ``_asdict()`` before being written.
    """
    csv_writer.writeheader()
    for page in pdf:
        print(f'processing page {page.get_page_number()}')
        for record in BerksPDFPageParser(page):
            csv_writer.writerow(record._asdict())


if __name__ == "__main__":
    # Script entry point: convert the Berks results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(BERKS_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer)

Beispiel #3

0

Datei anzeigen

Datei: pa_beaver_general_2020_results_parser.py Projekt: saewitz/openelections-data-pa

    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case office and candidate.

        ``row`` is modified in place; nothing is returned.
        """
        super()._clean_row(row)
        for column in ('office', 'candidate'):
            row[column] = row[column].title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        Rows whose office contains 'Del ' or 'Comm' are rejected; every
        other row is judged by the parent class.
        """
        office = row['office']
        if 'Del ' in office or 'Comm' in office:
            return False
        return super()._should_be_recorded(row)


class BeaverPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Beaver results PDF.

    Purely declarative: it only binds Beaver-specific pieces to class-level
    hooks; all parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = BeaverPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = BeaverPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = BEAVER_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Beaver results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(BEAVER_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, BeaverPDFPageParser)

Beispiel #4

0

Datei anzeigen

Datei: pa_bradford_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

            row = self._generate_row(candidate_data, category_votes)
            office_is_invalid = max(invalid_office in row.office
                                    for invalid_office in INVALID_OFFICES)
            if not office_is_invalid:
                yield row

    def _generate_row(self, candidate_data, category_votes):
        """Build a ParsedRow from candidate data and per-category votes.

        The row starts with COUNTY, the current jurisdiction and the fields
        of ``candidate_data``. For the 'Registered Voters' pseudo-office all
        category values must be equal; the category columns are left blank
        and that single value goes in the final column. For any other
        office the category votes are followed by their sum.
        """
        leading = [COUNTY, self._jurisdiction, *candidate_data]
        if candidate_data.office == 'Registered Voters':
            assert min(category_votes) == max(category_votes)
            vote_columns = [''] * len(VOTE_CATEGORIES) + [category_votes[0]]
        else:
            vote_columns = category_votes + [sum(category_votes)]
        return ParsedRow(*leading, *vote_columns)

    def _parse_category_cell(self, category):
        new_category = next(self._string_iterator).strip()
        assert (category == new_category)


class BradfordPDFPageParser(PDFPageParser):
    """Page parser configured for the Bradford results PDF.

    Purely declarative: it plugs Bradford-specific pieces into PDFPageParser,
    whose implementation is not visible in this excerpt.
    """
    # Header constant -- presumably what the base class expects at the top of
    # each page; confirm against PDFPageParser.
    _standard_header = BRADFORD_HEADER
    # Class used for table headers.
    _table_header_parser = BradfordTableHeaderParser
    # Class used for table bodies.
    _table_body_parser = BradfordTableBodyParser


if __name__ == "__main__":
    # Script entry point: convert the Bradford results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(BRADFORD_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, BradfordPDFPageParser)

Beispiel #5

0

Datei anzeigen

Datei: pa_indiana_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

        ]
        assert header == INDIANA_HEADER

    def _process_table_headers(self):
        """Parse the table header, store it, and return the candidates.

        The header parser receives the string iterator plus the table header
        carried over from the previous page. Its table headers are stored on
        ``self._table_headers``.
        """
        header_parser = IndianaPDFTableHeaderParser(
            self._string_iterator, self._previous_table_header)
        self._table_headers = header_parser.get_table_headers()
        return header_parser.get_candidates()

    def process_table_body(self, candidates):
        """Yield every row produced by a new table body parser.

        The parser is built from the string iterator and ``candidates`` and
        is kept on ``self._table_body_parser``.
        """
        body_parser = IndianaPDFTableBodyParser(self._string_iterator,
                                                candidates)
        self._table_body_parser = body_parser
        yield from body_parser


def pdf_to_csv(pdf, csv_writer):
    """Write the CSV header, then every row parsed from each page.

    Args:
        pdf: Iterable of page objects exposing ``get_page_number()``.
        csv_writer: Writer exposing ``writeheader()`` and ``writerow(row)``.

    The table header that a page reports as continued is handed to the
    parser of the following page; the first page receives ``None``.
    """
    carried_header = None
    csv_writer.writeheader()
    for page in pdf:
        print(f'processing page {page.get_page_number()}')
        page_parser = IndianaPDFPageParser(page, carried_header)
        for record in page_parser:
            csv_writer.writerow(record)
        # Only query the continuation once the page is fully consumed.
        carried_header = page_parser.get_continued_table_header()


if __name__ == "__main__":
    # Script entry point: convert the Indiana results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(INDIANA_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer)

Beispiel #6

0

Datei anzeigen

Datei: pa_cumberland_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

def pdfs_to_csv(csv_writer):
    """Write the CSV header once, then append the rows of each party's PDF.

    One PDF path is derived per entry of CUMBERLAND_PARTIES by formatting
    CUMBERLAND_FILE_FORMAT with the party.
    """
    csv_writer.writeheader()
    for party in CUMBERLAND_PARTIES:
        party_pdf_path = CUMBERLAND_FILE_FORMAT.format(party)
        pages = PDFPageIterator(party_pdf_path)
        append_pdf_to_csv(pages, csv_writer, party)

Beispiel #7

0

Datei anzeigen

    @classmethod
    def _clean_row(cls, row):
        """Apply the inherited cleanup and title-case two text columns.

        Mutates ``row`` in place: after the parent cleanup, ``office`` and
        ``candidate`` are replaced by their title-cased form.
        """
        super()._clean_row(row)
        office_title = row['office'].title()
        row['office'] = office_title
        candidate_title = row['candidate'].title()
        row['candidate'] = candidate_title

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        Offices containing 'Delegate', 'Committee', 'Liquor' or 'Council'
        are rejected; every other row is judged by the parent class.
        """
        office = row['office']
        excluded_markers = ('Delegate', 'Committee', 'Liquor', 'Council')
        if any(marker in office for marker in excluded_markers):
            return False
        return super()._should_be_recorded(row)


class ChesterPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Chester results PDF.

    Purely declarative: it only binds Chester-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = ChesterPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = ChesterPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = CHESTER_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Chester results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(CHESTER_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, ChesterPDFPageParser)

Beispiel #8

0

Datei anzeigen

Datei: pa_cambria_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa


class CambriaPDFStringIterator(ElectionwarePDFStringIterator):
    """String iterator configured with the Cambria PDF's footer markers.

    How the base class uses the two substrings is not visible here --
    presumably to recognise page footers; confirm in
    ElectionwarePDFStringIterator.
    """
    _first_footer_substring = FIRST_FOOTER_SUBSTRING
    _second_footer_substring = SECOND_FOOTER_SUBSTRING


class CambriaPDFTableParser(ElectionwarePDFTableParser):
    """Table parser for the Cambria results PDF.

    Supplies the county-specific constants to the base class and adds
    title-casing to the row cleanup.
    """
    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case office and candidate."""
        super()._clean_row(row)
        for column in ('office', 'candidate'):
            row[column] = row[column].title()


class CambriaPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Cambria results PDF.

    Purely declarative: it only binds Cambria-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = CambriaPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = CambriaPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = CAMBRIA_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Cambria results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(CAMBRIA_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, CambriaPDFPageParser)

Beispiel #9

0

Datei anzeigen

Datei: pa_clearfield_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case office and candidate.

        ``row`` is modified in place; nothing is returned.
        """
        super()._clean_row(row)
        for column in ('office', 'candidate'):
            row[column] = row[column].title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        Individual write-in rows and offices containing 'Delegate' or 'Comm'
        are rejected; every other row is judged by the parent class.
        """
        if row['candidate'].startswith('Write-In: '):
            # there's already a Write-In Totals field; this prevents double counting
            return False
        office = row['office']
        if 'Delegate' in office or 'Comm' in office:
            return False
        return super()._should_be_recorded(row)


class ClearfieldPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Clearfield results PDF.

    Purely declarative: it only binds Clearfield-specific pieces to
    class-level hooks; the parsing logic lives in ElectionwarePDFPageParser,
    which is not visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = ClearfieldPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = ClearfieldPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = CLEARFIELD_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Clearfield results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(CLEARFIELD_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, ClearfieldPDFPageParser)

Beispiel #10

0

Datei anzeigen

Datei: pa_perry_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

    @staticmethod
    def _row_is_valid(row):
        """Return False for rows that must not be written, True otherwise.

        A row is rejected when its office contains 'Delegate' or
        'COMMITTEE', or when its candidate equals LAST_ROW_CANDIDATE.
        """
        office = row['office']
        if 'Delegate' in office or 'COMMITTEE' in office:
            return False
        return not (row['candidate'] == LAST_ROW_CANDIDATE)


def pdf_to_csv(pdf, csv_writer):
    """Write the CSV header, then every row parsed from each page.

    Args:
        pdf: Iterable of page objects exposing ``get_page_number()``.
        csv_writer: Writer exposing ``writeheader()`` and ``writerow(row)``.

    The table header and precinct that a page reports as continued are
    handed to the parser of the following page; the first page receives
    ``None`` for both.
    """
    csv_writer.writeheader()
    carried_header = None
    carried_precinct = None
    for page in pdf:
        print(f'processing page {page.get_page_number()}')
        page_parser = PerryPDFPageParser(page, carried_header,
                                         carried_precinct)
        for record in page_parser:
            csv_writer.writerow(record)
        # Only query the continuation state once the page is fully consumed.
        carried_header = page_parser.get_continued_table_header()
        carried_precinct = page_parser.get_continued_precinct()


if __name__ == "__main__":
    # Script entry point: convert the Perry results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(PERRY_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer)

Beispiel #11

0

Datei anzeigen

    @classmethod
    def _clean_row(cls, row):
        """Fix a known misspelling, run the parent cleanup, then title-case.

        'REPUBLICIAN' in the candidate text is corrected to 'REPUBLICAN'
        before the parent cleanup sees the row. Afterwards ``office`` and
        ``candidate`` are title-cased in place.
        """
        corrected = row['candidate'].replace('REPUBLICIAN', 'REPUBLICAN')
        row['candidate'] = corrected
        super()._clean_row(row)
        for column in ('office', 'candidate'):
            row[column] = row[column].title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        Individual write-in rows and offices containing 'Del ' or 'Cmte' are
        rejected; every other row is judged by the parent class.
        """
        if row['candidate'].startswith('Write-In: '):
            # there's already a Write-In Totals field; this prevents double counting
            return False
        office = row['office']
        if any(marker in office for marker in ('Del ', 'Cmte')):
            return False
        return super()._should_be_recorded(row)


class ClintonPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Clinton results PDF.

    Purely declarative: it only binds Clinton-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = ClintonPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = ClintonPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = CLINTON_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Clinton results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(CLINTON_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, ClintonPDFPageParser)

Beispiel #12

0

Datei anzeigen

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case the office in place.

        Unlike most sibling parsers, the candidate text is left untouched.
        """
        super()._clean_row(row)
        office_title = row['office'].title()
        row['office'] = office_title

    @classmethod
    def _should_be_recorded(cls, row):
        """Reject individual write-in rows; defer to the parent otherwise."""
        # there's already a Write-In Totals field; this prevents double counting
        individual_write_in = row['candidate'].startswith('Write-In: ')
        if individual_write_in:
            return False
        return super()._should_be_recorded(row)


class TiogaPDFPageParser(ElectionwarePDFPageParser):
    """Page parser for the Tioga results PDF that ignores summary pages.

    Pages numbered below FIRST_PER_PRECINCT_PAGE get their string iterator
    replaced by one holding only FIRST_FOOTER_SUBSTRING -- presumably so the
    base class sees the page as already at its footer; confirm in
    ElectionwarePDFPageParser.
    """
    _pdf_string_iterator_clazz = TiogaPDFStringIterator
    _pdf_table_parser_clazz = TiogaPDFTableParser
    _header = TIOGA_HEADER

    def __init__(self, page):
        """Initialise via the base class, then blank out summary pages."""
        super().__init__(page)
        is_summary_page = page.get_page_number() < FIRST_PER_PRECINCT_PAGE
        if is_summary_page:
            # skip these pages; these are the summary pages
            self._string_iterator = TiogaPDFStringIterator(
                [FIRST_FOOTER_SUBSTRING])


if __name__ == "__main__":
    # Script entry point: convert the Tioga results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(TIOGA_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, TiogaPDFPageParser)

Beispiel #13

0

Datei anzeigen

    def _populate_votes(self, row):
        """Populate votes via the parent, then drop a trailing percent cell.

        The next string is consumed only when it contains '%'.
        """
        super()._populate_votes(row)
        upcoming = self._string_iterator.peek()
        if '%' in upcoming:
            # vote % string, not always supplied
            next(self._string_iterator)

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case office and candidate.

        ``row`` is modified in place; nothing is returned.
        """
        super()._clean_row(row)
        for column in ('office', 'candidate'):
            row[column] = row[column].title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Reject offices containing 'Delegate'; defer to the parent otherwise."""
        is_delegate_contest = 'Delegate' in row['office']
        if is_delegate_contest:
            return False
        return super()._should_be_recorded(row)


class CentrePDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Centre results PDF.

    Purely declarative: it only binds Centre-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = CentrePDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = CentrePDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = CENTRE_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Centre results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(CENTRE_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, CentrePDFPageParser)

Beispiel #14

0

Datei anzeigen

Datei: pa_mercer_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then normalise office and candidate.

        The office is title-cased. Every 'Write-In: ' occurrence is removed
        from the candidate before it is title-cased. ``row`` is modified in
        place.
        """
        super()._clean_row(row)
        row['office'] = row['office'].title()
        without_prefix = row['candidate'].replace('Write-In: ', '')
        row['candidate'] = without_prefix.title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        The parent class is consulted first; rows it accepts are still
        rejected when the office is exactly 'Wheatland Home Rule'.
        """
        if not super()._should_be_recorded(row):
            return False
        return row['office'] != 'Wheatland Home Rule'


class MercerPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Mercer results PDF.

    Purely declarative: it only binds Mercer-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = MercerPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = MercerPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = MERCER_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Mercer results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(MERCER_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, MercerPDFPageParser)

Beispiel #15

0

Datei anzeigen

    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _clean_row(cls, row):
        """Apply the inherited cleanup and title-case two text columns.

        Mutates ``row`` in place: after the parent cleanup, ``office`` and
        ``candidate`` are replaced by their title-cased form.
        """
        super()._clean_row(row)
        office_title = row['office'].title()
        row['office'] = office_title
        candidate_title = row['candidate'].title()
        row['candidate'] = candidate_title

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        Individual write-in rows and offices containing 'Delegate' or 'Comm'
        are rejected; every other row is judged by the parent class.
        """
        if row['candidate'].startswith('Write-In: '):
            # there's already a Write-In Totals field; this prevents double counting
            return False
        office = row['office']
        if any(marker in office for marker in ('Delegate', 'Comm')):
            return False
        return super()._should_be_recorded(row)


class MifflinPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Mifflin results PDF.

    Purely declarative: it only binds Mifflin-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = MifflinPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = MifflinPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = MIFFLIN_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Mifflin results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(MIFFLIN_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, MifflinPDFPageParser)

Beispiel #16

0

Datei anzeigen

Datei: pa_blair_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

                next(self._string_iterator)

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case office and candidate.

        ``row`` is modified in place; nothing is returned.
        """
        super()._clean_row(row)
        for column in ('office', 'candidate'):
            row[column] = row[column].title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        Individual write-in rows and offices containing 'Delegate' or 'Comm'
        are rejected; every other row is judged by the parent class.
        """
        if row['candidate'].startswith('Write-In: '):
            # there's already a Write-In Totals field; this prevents double counting
            return False
        office = row['office']
        if 'Delegate' in office or 'Comm' in office:
            return False
        return super()._should_be_recorded(row)


class BlairPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Blair results PDF.

    Purely declarative: it only binds Blair-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = BlairPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = BlairPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = BLAIR_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Blair results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(BLAIR_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, BlairPDFPageParser)

Beispiel #17

0

Datei anzeigen

        if self._office != 'STATISTICS':
            vote_percent_string = next(self._string_iterator)
            assert '%' in vote_percent_string

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case office and candidate.

        ``row`` is modified in place; nothing is returned. Nothing is
        written to stdout, matching the other county parsers' cleanup hooks.
        """
        super()._clean_row(row)
        row['office'] = row['office'].title()
        row['candidate'] = row['candidate'].title()

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        The parent class is consulted first; rows it accepts are still
        rejected when the office contains 'Committee'.
        """
        if not super()._should_be_recorded(row):
            return False
        return 'Committee' not in row['office']


class LebanonPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Lebanon results PDF.

    Purely declarative: it only binds Lebanon-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = LebanonPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = LebanonPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = LEBANON_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Lebanon results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(LEBANON_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, LebanonPDFPageParser)

Beispiel #18

0

Datei anzeigen

    def _populate_votes(self, row):
        """Populate votes via the parent, then consume the percent cell.

        No percent cell is consumed for the 'STATISTICS' office or for rows
        listed in BUGGY_ROWS.

        Raises:
            AssertionError: if the consumed string does not contain '%'.
        """
        super()._populate_votes(row)
        if self._office == 'STATISTICS':
            return
        if row in BUGGY_ROWS:
            return
        # Consume outside the assert so the cell is read even under -O.
        percent_text = next(self._string_iterator)
        assert '%' in percent_text

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case the office in place.

        The candidate text is left as the parent cleanup produced it.
        """
        super()._clean_row(row)
        office_title = row['office'].title()
        row['office'] = office_title

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        The parent class is consulted first; rows it accepts are still
        rejected when the office is exactly
        'Borough Of Mahanoy City Mahanoy City'.
        """
        if not super()._should_be_recorded(row):
            return False
        return row['office'] != 'Borough Of Mahanoy City Mahanoy City'


class SchuylkillPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Schuylkill results PDF.

    Purely declarative: it only binds Schuylkill-specific pieces to
    class-level hooks; the parsing logic lives in ElectionwarePDFPageParser,
    which is not visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = SchuylkillPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = SchuylkillPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = SCHUYLKILL_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Schuylkill results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(SCHUYLKILL_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, SchuylkillPDFPageParser)

Beispiel #19

0

Datei anzeigen

Datei: pa_lackawanna_primary_2020_results_parser.py Projekt: rbier/openelections-data-pa

}


class LackawannaPDFStringIterator(ElectionwarePDFStringIterator):
    """String iterator configured with the Lackawanna PDF's footer markers.

    How the base class uses the two substrings is not visible here --
    presumably to recognise page footers; confirm in
    ElectionwarePDFStringIterator.
    """
    _first_footer_substring = FIRST_FOOTER_SUBSTRING
    _second_footer_substring = SECOND_FOOTER_SUBSTRING


class LackawannaPDFTableParser(ElectionwarePDFTableParser):
    """Table parser for the Lackawanna results PDF.

    Supplies the county-specific constants to the base class and title-cases
    the office during row cleanup.
    """
    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _clean_row(cls, row):
        """Run the parent cleanup, then title-case the office in place."""
        super()._clean_row(row)
        office_title = row['office'].title()
        row['office'] = office_title


class LackawannaPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Lackawanna results PDF.

    Purely declarative: it only binds Lackawanna-specific pieces to
    class-level hooks; the parsing logic lives in ElectionwarePDFPageParser,
    which is not visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = LackawannaPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = LackawannaPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = LACKAWANNA_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Lackawanna results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(LACKAWANNA_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, LackawannaPDFPageParser)

Beispiel #20

0

Datei anzeigen


class NorthamptonPDFTableParser(ElectionwarePDFTableParser):
    """Table parser for the Northampton results PDF.

    Supplies the county-specific constants to the base class and filters out
    two kinds of contests on top of the parent's own filtering.
    """
    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    @classmethod
    def _should_be_recorded(cls, row):
        """Decide whether ``row`` belongs in the output.

        The parent class is consulted first; rows it accepts are still
        rejected when the office contains 'County Committee' or is exactly
        'Library Tax Question'.
        """
        if not super()._should_be_recorded(row):
            return False
        office = row['office']
        is_excluded = ('County Committee' in office
                       or office == 'Library Tax Question')
        return not is_excluded


class NorthamptonPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Northampton results PDF.

    Purely declarative: it only binds Northampton-specific pieces to
    class-level hooks; the parsing logic lives in ElectionwarePDFPageParser,
    which is not visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = NorthamptonPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = NorthamptonPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = NORTHAMPTON_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Northampton results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(NORTHAMPTON_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, NorthamptonPDFPageParser)

Beispiel #21

0

Datei anzeigen

class AdamsPDFTableParser(ElectionwarePDFTableParser):
    """Table parser for the Adams results PDF.

    Supplies the county-specific constants to the base class and handles the
    extra 'VOTE %' column that appears outside the 'STATISTICS' office.
    """
    _county = COUNTY
    _expected_table_headers = EXPECTED_TABLE_HEADERS
    _openelections_mapped_header = OPENELECTIONS_MAPPED_HEADER
    _raw_office_to_office_and_district = RAW_OFFICE_TO_OFFICE_AND_DISTRICT

    def _verify_table_header(self):
        """Consume the 'VOTE %' header if present, then run the parent check.

        Raises:
            AssertionError: if a non-'STATISTICS' table does not start with
                the string 'VOTE %'.
        """
        has_percent_column = self._office != 'STATISTICS'
        if has_percent_column:
            # Consume outside the assert so the cell is read even under -O.
            percent_header = next(self._string_iterator)
            assert percent_header == 'VOTE %'
        super()._verify_table_header()

    def _populate_votes(self, row):
        """Populate votes via the parent, then consume the percent cell.

        No percent cell is consumed for the 'STATISTICS' office or for the
        'Contest Totals' row.

        Raises:
            AssertionError: if the consumed string does not contain '%'.
        """
        super()._populate_votes(row)
        if self._office == 'STATISTICS':
            return
        if row['candidate'] == 'Contest Totals':
            return
        percent_text = next(self._string_iterator)
        assert '%' in percent_text


class AdamsPDFPageParser(ElectionwarePDFPageParser):
    """Page parser configured for the Adams results PDF.

    Purely declarative: it only binds Adams-specific pieces to class-level
    hooks; the parsing logic lives in ElectionwarePDFPageParser, which is not
    visible in this excerpt.
    """
    # String-iterator class for this PDF's pages.
    _pdf_string_iterator_clazz = AdamsPDFStringIterator
    # Table-parser class for this PDF's tables.
    _pdf_table_parser_clazz = AdamsPDFTableParser
    # Header constant -- presumably checked by the base class; confirm there.
    _header = ADAMS_HEADER


if __name__ == "__main__":
    # Script entry point: convert the Adams results PDF into OUTPUT_FILE.
    # newline='' leaves line-ending handling to the csv writer.
    with open(OUTPUT_FILE, 'w', newline='') as csv_file:
        pages = PDFPageIterator(ADAMS_FILE)
        writer = csv.DictWriter(csv_file, OUTPUT_HEADER)
        pdf_to_csv(pages, writer, AdamsPDFPageParser)