def _map_row_to_vote_object(self, row):
    """Build a ``VoteObject`` from a positionally indexed row.

    The row is read by index, in this order: ``vote_id``, ``blob``,
    ``sourceUrl``, ``sourceType``, ``sourceFormat``, ``isProcessed``.
    """
    fields = {
        "vote_id": row[0],
        "blob": row[1],
        "sourceUrl": row[2],
        "sourceType": row[3],
        "sourceFormat": row[4],
        "isProcessed": row[5],
    }
    return VoteObject(**fields)
def test_itShouldProcessARollCallPDFBlob():
    """Process the ``hb_roll_call.pdf`` fixture and check the parsed result.

    Verifies the timestamp (within a tolerance), the bill number, the vote
    name and the ordered list of (representative name, vote) pairs.
    """
    pdf_blob = _load_pdf("hb_roll_call.pdf")

    # Expected (name, vote) pairs; the final assertion compares lists, so
    # the order here is significant.
    expected_pairs = [
        ('ABERCROMBIE', 'Y'), ('PORTER', 'Y'), ('CANDELORA, V.', 'Y'), ('PISCOPO', 'Y'),
        ('ADAMS', 'Y'), ('REED', 'Y'), ('CARNEY', 'Y'), ('POLLETTA', 'Y'),
        ('ALBIS', 'Y'), ('REYES', 'Y'), ('CARPINO', 'Y'), ('REBIMBAS', 'Y'),
        ('ALTOBELLO', 'Y'), ('RILEY', 'Y'), ('CASE', 'Y'), ('RUTIGLIANO', 'Y'),
        ('ARCONTI', 'Y'), ('RITTER', 'Y'), ('CHEESEMAN', 'Y'), ('SAMPSON', 'Y'),
        ('BAKER', 'Y'), ('ROJAS', 'Y'), ('CUMMINGS', 'Y'), ('SIEGRIST', 'Y'),
        ('BORER', 'Y'), ('ROSARIO', 'Y'), ('D’AMELIO', 'Y'), ('SIMANSKI', 'Y'),
        ('BOYD', 'Y'), ('ROSE', 'Y'), ('DAUPHINAIS', 'Y'), ('SKULCZYCK', 'Y'),
        ('BUTLER', 'Y'), ('ROVERO', 'Y'), ('DAVIS', 'Y'), ('SMITH', 'Y'),
        ('CONLEY', 'Y'), ('SANCHEZ', 'Y'), ('DELNICKI', 'Y'), ('SREDZINSKI', 'Y'),
        ('CURREY', 'Y'), ('SANTIAGO, E.', 'Y'), ('DEVLIN', 'Y'), ('SRINIVASAN', 'Y'),
        ('D’AGOSTINO', 'Y'), ('SANTIAGO, H.', 'Y'), ('DUBITSKY', 'Y'), ('STANESKI', 'Y'),
        ('DE LA CRUZ', 'Y'), ('SCANLON', 'Y'), ('DUFF', 'Y'), ('STOKES', 'Y'),
        ('DEMICCO', 'Y'), ('SERRA', 'Y'), ('DUNSBY', 'Y'), ('STORMS', 'Y'),
        ('DILLON', 'Y'), ('SIMMONS', 'Y'), ('FERGUSON', 'Y'), ('TWEEDIE', 'Y'),
        ('DIMASSA', 'Y'), ('SLAP', 'Y'), ('FERRARO', 'Y'), ('VAIL', 'Y'),
        ('ELLIOTT', 'Y'), ('SOTO', 'Y'), ('FISHBEIN', 'Y'), ('WILMS', 'Y'),
        ('FLEISCHMANN', 'Y'), ('STAFSTROM', 'Y'), ('FLOREN', 'X'), ('WILSON', 'Y'),
        ('FOX', 'Y'), ('STALLWORTH', 'Y'), ('FRANCE', 'X'), ('WOOD', 'Y'),
        ('GENGA', 'Y'), ('STEINBERG', 'Y'), ('FREY', 'Y'), ('YACCARINO', 'Y'),
        ('GIBSON', 'Y'), ('TERCYAK', 'Y'), ('FUSCO', 'Y'), ('ZAWISTOWSKI', 'Y'),
        ('GONZALEZ', 'Y'), ('TONG', 'Y'), ('GREEN', 'Y'), ('ZIOBRON', 'Y'),
        ('GRESKO', 'Y'), ('URBAN', 'Y'), ('HALL, C.', 'Y'), ('ZUPKUS', 'Y'),
        ('GUERRERA', 'Y'), ('VARGAS', 'Y'), ('HARDING', 'Y'),
        ('HADDAD', 'Y'), ('VERRENGIA', 'Y'), ('KLARIDES', 'Y'),
        ('HALL, J.', 'Y'), ('WALKER', 'Y'), ('KLARIDES-DITRIA', 'Y'),
        ('HAMPTON', 'Y'), ('WINKLER', 'Y'), ('KOKORUDA', 'Y'),
        ('ARESIMOWICZ', 'Y'), ('HENNESSY', 'Y'), ('YOUNG', 'Y'), ('KUPCHICK', 'Y'),
        ('JOHNSON', 'Y'), ('ZIOGAS', 'Y'), ('LABRIOLA', 'X'),
        ('JULESON-SCOPINO', 'Y'), ('LAVIELLE', 'Y'),
        ('GODFREY', 'Y'), ('LEMAR', 'Y'), ('LEGEYT', 'Y'),
        ('LESSER', 'Y'), ('MACLACHLAN', 'Y'),
        ('LINEHAN', 'Y'), ('ACKERT', 'Y'), ('MCCARTY, K.', 'Y'),
        ('BERGER', 'Y'), ('LOPES', 'Y'), ('BELSITO', 'Y'), ('MCGORTY, B.', 'Y'),
        ('CANDELARIA, J.', 'Y'), ('MCCARTHY VAHEY', 'Y'), ('BETTS', 'Y'), ('O’DEA', 'Y'),
        ('COOK', 'Y'), ('MCGEE', 'Y'), ('BOCCHINO', 'Y'), ('OHLER', 'Y'),
        ('GENTILE', 'Y'), ('MILLER, P.B.', 'Y'), ('BOLINSKY', 'Y'), ('O’NEILL', 'Y'),
        ('MORIN', 'Y'), ('MUSHINSKY', 'Y'), ('BUCKBEE', 'Y'), ('PAVALOCK-D’AMATO', 'Y'),
        ('MORRIS', 'Y'), ('PAOLILLO', 'Y'), ('BYRON', 'Y'), ('PERILLO', 'Y'),
        ('ORANGE', 'Y'), ('PERONE', 'Y'), ('CAMILLO', 'Y'), ('PETIT', 'Y'),
        ('RYAN', 'Y'),
    ]

    vote_object = VoteObject(
        vote_id=1,
        blob=pdf_blob,
        sourceUrl="https://repp.localhost/2020/foo/2020SV-00052-R00HB-6004-SV.PDF",
        sourceType="CT_STATE_GOV",
        sourceFormat="PDF",
        isProcessed=0,
    )
    result = processor.process_blob(vote_object)

    # The timestamp is only checked to within 20000 seconds of the reference
    # value. NOTE(review): the reason for the slack was never investigated
    # (possibly a timezone offset) -- confirm before tightening.
    assert abs(result.unixTime - 1588219200.0) < 20000
    assert result.billNumber == 'HB-5235'
    assert result.voteName == 'HB-5235'

    # Pair names and votes positionally, exactly as the processor returned them.
    actual_pairs = list(zip(result.repName, result.repVote))
    assert actual_pairs == expected_pairs
def test_itCanGetRecordById(self):
    """A written record can be read back with ``getById`` and compares equal."""
    record_fields = dict(
        vote_id=1,
        blob="foo_blob",
        sourceUrl="http://foo.localhost.com",
        sourceFormat=SourceFormat.PDF.name,
        sourceType=SourceType.CT_STATE_GOV.name,
        isProcessed=0,
    )

    # Two separate instances on purpose: one is handed to the DAO, the other
    # is the untouched reference the fetched record is compared against.
    self.dao.write(VoteObject(**record_fields))
    expected = VoteObject(**record_fields)

    fetched = self.dao.getById(1)
    assert expected == fetched
def _helper_insert_n_records(self, count: int):
    """Write ``count`` distinct vote objects through the DAO and return them.

    Record ``n`` (zero-based) gets ``vote_id`` ``n + 1``, blob
    ``foo_blob_<n>`` and source URL ``http://foo.localhost.com?foo=<n>``;
    every record starts with ``isProcessed=0``. The returned list is in
    insertion order.
    """
    written = []
    for n in range(count):
        record = VoteObject(
            vote_id=n + 1,
            blob=f"foo_blob_{n}",
            sourceUrl=f"http://foo.localhost.com?foo={n}",
            sourceFormat=SourceFormat.PDF.name,
            sourceType=SourceType.CT_STATE_GOV.name,
            isProcessed=0,
        )
        self.dao.write(record)
        written.append(record)
    return written
def test_itCanGetProcessed(self):
    """A record marked processed by URL is the single result of ``getProcessed``.

    The returned record is expected to equal the written one with
    ``isProcessed=1`` and ``vote_id=1``.
    """
    source = TEST_VOTE_OBJECT
    self.dao.write(source)
    self.dao.markProcessedBySourceUrl(source.sourceUrl)

    expected = VoteObject(
        vote_id=1,
        blob=source.blob,
        sourceUrl=source.sourceUrl,
        sourceType=source.sourceType,
        sourceFormat=source.sourceFormat,
        isProcessed=1,
    )

    processed = self.dao.getProcessed()
    assert len(processed) == 1
    assert expected == processed[0]
def test_itShouldThrowOnUnprocessablePdfs():
    """Processing the ``bad_pdf.pdf`` fixture must raise ``PdfProcessorException``."""
    bad_blob = _load_pdf("bad_pdf.pdf")
    vote_fields = dict(
        vote_id=1,
        blob=bad_blob,
        sourceUrl="https://repp.localhost/foo/foo.pdf",
        sourceType="CT_STATE_GOV",
        sourceFormat="PDF",
        isProcessed=0,
    )

    with pytest.raises(PdfProcessorException):
        processor.process_blob(VoteObject(**vote_fields))
def test_itCanGetAllWithIsProcessedFilter(self):
    """``getAll`` with an ``isProcessed=1`` filter returns only the marked record.

    Ten records are inserted, the one with ``?foo=1`` in its URL is marked
    processed, and the filtered query must return exactly that record.
    """
    marked_url = "http://foo.localhost.com?foo=1"
    expected = VoteObject(
        vote_id=2,
        blob="foo_blob_1",
        sourceUrl="http://foo.localhost.com?foo=1",
        sourceFormat=SourceFormat.PDF.name,
        sourceType=SourceType.CT_STATE_GOV.name,
        isProcessed=1,
    )

    self._helper_insert_n_records(10)
    self.dao.markProcessedBySourceUrl(marked_url)

    processed_only = VoteObjectFilter(isProcessed=1)
    matches = self.dao.getAll(q_filter=processed_only)
    assert [expected] == matches
def crawl(self):
    """Download and store every vote object that has not been ingested yet.

    Fetches the list of download URLs, skips any URL the DAO already reports
    as ingested, downloads the rest and writes each one through
    ``self.voteObjectDao`` as a ``VoteObject`` tagged with the
    ``CT_STATE_GOV`` source type and ``PDF`` source format values.

    A download that fails with ``requests.exceptions.HTTPError`` is logged
    and skipped so that one bad URL does not abort the whole crawl; any
    other exception propagates to the caller.
    """
    logging.info("CtGovCrawler crawl")
    vote_object_urls = self._get_vote_object_download_urls()
    total = len(vote_object_urls)
    logging.info(f"Got {total} to ingest")

    # Count from 1 so the progress log reads "1/N" .. "N/N".
    for num, url in enumerate(vote_object_urls, start=1):
        if self.voteObjectDao.isUrlIngested(url):
            logging.info(f"Skipping request for already ingested url {url}")
            continue

        logging.info(f"Get pdf {num}/{total}")
        try:
            blob = self._download_vote_object(url)
        except requests.exceptions.HTTPError:
            # The class is spelled ``HTTPError``; the previous ``HttpError``
            # does not exist, so the handler itself raised AttributeError
            # whenever a download failed.
            logging.exception(
                "HTTPError caught when making request to resource server")
            continue

        vote_object = VoteObject(blob=blob,
                                 sourceUrl=url,
                                 sourceType=SourceType.CT_STATE_GOV.value,
                                 sourceFormat=SourceFormat.PDF.value)
        self.voteObjectDao.write(vote_object)
def test_itShouldProcessAPDFBlob():
    """Process the ``ct_normal_01.pdf`` fixture and check the parsed result.

    Verifies the timestamp (within a tolerance), the bill number, the vote
    name and the ordered list of (representative name, vote) pairs.
    """
    pdf_blob = _load_pdf("ct_normal_01.pdf")

    # Expected (name, vote) pairs; the final assertion compares lists, so
    # the order here is significant.
    expected_pairs = [
        ('JOHN W. FONFARA', 'Y'),
        ('CATHERINE A. OSTEN', 'Y'),
        ('DOUGLAS MCCRORY', 'Y'),
        ('PAUL M. FORMICA', 'N'),
        ('SAUD ANWAR', 'Y'),
        ('KEVIN KELLY', 'N'),
        ('STEVE CASSANO', 'Y'),
        ('MARILYN MOORE', 'Y'),
        ('DEREK SLAP', 'Y'),
        ('DENNIS BRADLEY', 'Y'),
        ('GENNARO BIZZARRO', 'N'),
        ('JULIE KUSHNER', 'Y'),
        ('JOHN A. KISSEL', 'N'),
        ('BOB DUFF', 'Y'),
        ('KEVIN D. WITKOS', 'N'),
        ('WILL HASKELL', 'Y'),
        ('MATTHEW LESSER', 'Y'),
        ('CARLO LEONE', 'Y'),
        ('GARY WINFIELD', 'Y'),
        ('TONY HWANG', 'N'),
        ('MARTIN M. LOONEY', 'Y'),
        ('MAE FLEXER', 'Y'),
        ('CHRISTINE COHEN', 'Y'),
        ('CRAIG MINER', 'N'),
        ('MARY ABRAMS', 'Y'),
        ('HENRI MARTIN', 'N'),
        ('JAMES MARONEY', 'Y'),
        ('ERIC BERTHEL', 'N'),
        ('JOAN V. HARTLEY', 'N'),
        ('NORM NEEDLEMAN', 'Y'),
        ('ROBERT SAMPSON', 'N'),
        ('LEONARD FASANO', 'N'),
        ('GEORGE LOGAN', 'N'),
        ('DAN CHAMPAGNE', 'N'),
        ('HEATHER SOMERS', 'N'),
        ('ALEX KASSER', 'Y'),
    ]

    vote_object = VoteObject(
        vote_id=1,
        blob=pdf_blob,
        sourceUrl="https://repp.localhost/2020/foo/2020SV-00052-R00HB-6004-SV.PDF",
        sourceType="CT_STATE_GOV",
        sourceFormat="PDF",
        isProcessed=0,
    )
    result = processor.process_blob(vote_object)

    # The timestamp is only checked to within 20000 seconds of the reference
    # value. NOTE(review): the reason for the slack was never investigated
    # (possibly a timezone offset) -- confirm before tightening.
    assert abs(result.unixTime - 1595995200.0) < 20000
    assert result.billNumber == 'HB-6004'
    assert result.voteName == 'HB-6004'

    # Pair names and votes positionally, exactly as the processor returned them.
    actual_pairs = list(zip(result.repName, result.repVote))
    assert actual_pairs == expected_pairs
from rep.dao.VoteObjectDao import VoteObjectDao
from rep.dataclasses.VoteObject import VoteObject
from rep.dataclasses.VoteObjectFilter import VoteObjectFilter
from rep.dataclasses.Enums import SourceFormat, SourceType
from . import get_test_dao

# Shared, not-yet-processed fixture record (no explicit vote_id is given).
TEST_VOTE_OBJECT = VoteObject(
    blob="foo_blob",
    sourceUrl="http://foo.localhost.com/VOTE/foo.pdf",
    sourceFormat=SourceFormat.PDF.name,
    sourceType=SourceType.CT_STATE_GOV.name,
    isProcessed=0,
)


class Test:
    """Tests for ``VoteObjectDao``; each test gets its own DAO instance."""

    def setup_method(self, test_method):
        """Obtain a test DAO from ``get_test_dao`` before every test method."""
        self.dao = get_test_dao(VoteObjectDao, True)

    def _helper_insert_n_records(self, count: int):
        """Write ``count`` distinct vote objects through the DAO and return them.

        Record ``n`` (zero-based) gets ``vote_id`` ``n + 1``, blob
        ``foo_blob_<n>`` and source URL ``http://foo.localhost.com?foo=<n>``;
        every record starts with ``isProcessed=0``. The returned list is in
        insertion order.
        """
        written = []
        for n in range(count):
            record = VoteObject(
                vote_id=n + 1,
                blob=f"foo_blob_{n}",
                sourceUrl=f"http://foo.localhost.com?foo={n}",
                sourceFormat=SourceFormat.PDF.name,
                sourceType=SourceType.CT_STATE_GOV.name,
                isProcessed=0,
            )
            self.dao.write(record)
            written.append(record)
        return written