Esempio n. 1
0
 def _map_row_to_vote_object(self, row):
     """Build a ``VoteObject`` from the first six positional fields of ``row``.

     The fields are read by index in this order: vote id, blob, source URL,
     source type, source format, processed flag.
     """
     vote_id, blob, source_url = row[0], row[1], row[2]
     source_type, source_format, is_processed = row[3], row[4], row[5]
     return VoteObject(
         vote_id=vote_id,
         blob=blob,
         sourceUrl=source_url,
         sourceType=source_type,
         sourceFormat=source_format,
         isProcessed=is_processed,
     )
def test_itShouldProcessARollCallPDFBlob():
    """Process the ``hb_roll_call.pdf`` fixture and check the parsed result.

    Verifies the timestamp (within a tolerance), the bill number, the vote
    name, and the ordered (representative name, vote) pairs.
    """
    roll_call_blob = _load_pdf("hb_roll_call.pdf")
    expected_pairs = [
        ('ABERCROMBIE', 'Y'), ('PORTER', 'Y'), ('CANDELORA, V.', 'Y'),
        ('PISCOPO', 'Y'), ('ADAMS', 'Y'), ('REED', 'Y'), ('CARNEY', 'Y'), ('POLLETTA', 'Y'),
        ('ALBIS', 'Y'), ('REYES', 'Y'), ('CARPINO', 'Y'), ('REBIMBAS', 'Y'), ('ALTOBELLO', 'Y'),
        ('RILEY', 'Y'), ('CASE', 'Y'), ('RUTIGLIANO', 'Y'), ('ARCONTI', 'Y'), ('RITTER', 'Y'),
        ('CHEESEMAN', 'Y'), ('SAMPSON', 'Y'), ('BAKER', 'Y'), ('ROJAS', 'Y'), ('CUMMINGS', 'Y'),
        ('SIEGRIST', 'Y'), ('BORER', 'Y'), ('ROSARIO', 'Y'), ('D’AMELIO', 'Y'), ('SIMANSKI', 'Y'),
        ('BOYD', 'Y'), ('ROSE', 'Y'), ('DAUPHINAIS', 'Y'), ('SKULCZYCK', 'Y'), ('BUTLER', 'Y'),
        ('ROVERO', 'Y'), ('DAVIS', 'Y'), ('SMITH', 'Y'), ('CONLEY', 'Y'), ('SANCHEZ', 'Y'),
        ('DELNICKI', 'Y'), ('SREDZINSKI', 'Y'), ('CURREY', 'Y'), ('SANTIAGO, E.', 'Y'), ('DEVLIN', 'Y'),
        ('SRINIVASAN', 'Y'), ('D’AGOSTINO', 'Y'), ('SANTIAGO, H.', 'Y'), ('DUBITSKY', 'Y'),
        ('STANESKI', 'Y'), ('DE LA CRUZ', 'Y'), ('SCANLON', 'Y'), ('DUFF', 'Y'), ('STOKES', 'Y'),
        ('DEMICCO', 'Y'), ('SERRA', 'Y'), ('DUNSBY', 'Y'), ('STORMS', 'Y'), ('DILLON', 'Y'),
        ('SIMMONS', 'Y'), ('FERGUSON', 'Y'), ('TWEEDIE', 'Y'), ('DIMASSA', 'Y'), ('SLAP', 'Y'),
        ('FERRARO', 'Y'), ('VAIL', 'Y'), ('ELLIOTT', 'Y'), ('SOTO', 'Y'), ('FISHBEIN', 'Y'),
        ('WILMS', 'Y'), ('FLEISCHMANN', 'Y'), ('STAFSTROM', 'Y'), ('FLOREN', 'X'), ('WILSON', 'Y'),
        ('FOX', 'Y'), ('STALLWORTH', 'Y'), ('FRANCE', 'X'), ('WOOD', 'Y'), ('GENGA', 'Y'),
        ('STEINBERG', 'Y'), ('FREY', 'Y'), ('YACCARINO', 'Y'), ('GIBSON', 'Y'), ('TERCYAK', 'Y'),
        ('FUSCO', 'Y'), ('ZAWISTOWSKI', 'Y'), ('GONZALEZ', 'Y'), ('TONG', 'Y'), ('GREEN', 'Y'),
        ('ZIOBRON', 'Y'), ('GRESKO', 'Y'), ('URBAN', 'Y'), ('HALL, C.', 'Y'), ('ZUPKUS', 'Y'),
        ('GUERRERA', 'Y'), ('VARGAS', 'Y'), ('HARDING', 'Y'), ('HADDAD', 'Y'), ('VERRENGIA', 'Y'),
        ('KLARIDES', 'Y'), ('HALL, J.', 'Y'), ('WALKER', 'Y'), ('KLARIDES-DITRIA', 'Y'),
        ('HAMPTON', 'Y'), ('WINKLER', 'Y'), ('KOKORUDA', 'Y'), ('ARESIMOWICZ', 'Y'), ('HENNESSY', 'Y'),
        ('YOUNG', 'Y'), ('KUPCHICK', 'Y'), ('JOHNSON', 'Y'), ('ZIOGAS', 'Y'), ('LABRIOLA', 'X'),
        ('JULESON-SCOPINO', 'Y'), ('LAVIELLE', 'Y'), ('GODFREY', 'Y'), ('LEMAR', 'Y'),
        ('LEGEYT', 'Y'), ('LESSER', 'Y'), ('MACLACHLAN', 'Y'), ('LINEHAN', 'Y'), ('ACKERT', 'Y'),
        ('MCCARTY, K.', 'Y'), ('BERGER', 'Y'), ('LOPES', 'Y'), ('BELSITO', 'Y'), ('MCGORTY, B.', 'Y'),
        ('CANDELARIA, J.', 'Y'), ('MCCARTHY VAHEY', 'Y'), ('BETTS', 'Y'), ('O’DEA', 'Y'),
        ('COOK', 'Y'), ('MCGEE', 'Y'), ('BOCCHINO', 'Y'), ('OHLER', 'Y'), ('GENTILE', 'Y'),
        ('MILLER, P.B.', 'Y'), ('BOLINSKY', 'Y'), ('O’NEILL', 'Y'), ('MORIN', 'Y'), ('MUSHINSKY', 'Y'),
        ('BUCKBEE', 'Y'), ('PAVALOCK-D’AMATO', 'Y'), ('MORRIS', 'Y'), ('PAOLILLO', 'Y'), ('BYRON', 'Y'),
        ('PERILLO', 'Y'), ('ORANGE', 'Y'), ('PERONE', 'Y'), ('CAMILLO', 'Y'), ('PETIT', 'Y'),
        ('RYAN', 'Y'),
    ]

    vote_object = VoteObject(
        blob=roll_call_blob,
        sourceUrl="https://repp.localhost/2020/foo/2020SV-00052-R00HB-6004-SV.PDF",
        sourceType="CT_STATE_GOV",
        sourceFormat="PDF",
        isProcessed=0,
        vote_id=1,
    )
    result = processor.process_blob(vote_object)

    # The parsed timestamp is only required to land within 20000 seconds of
    # the reference value; the cause of the drift has not been investigated.
    # NOTE(review): presumably a timezone offset in the date parsing - confirm.
    assert abs(result.unixTime - 1588219200.0) < 20000
    assert result.billNumber == 'HB-5235'
    assert result.voteName == 'HB-5235'

    # Pair each representative name with the vote at the same position.
    actual_pairs = list(zip(result.repName, result.repVote))

    assert actual_pairs == expected_pairs
Esempio n. 3
0
 def test_itCanGetRecordById(self):
     """A record written through the DAO is returned unchanged by ``getById``."""
     # Shared field values; two separate VoteObject instances are built from
     # them so the expectation never aliases the object handed to the DAO.
     record_fields = dict(
         blob="foo_blob",
         sourceUrl="http://foo.localhost.com",
         sourceFormat=SourceFormat.PDF.name,
         sourceType=SourceType.CT_STATE_GOV.name,
         isProcessed=0,
         vote_id=1,
     )
     self.dao.write(VoteObject(**record_fields))
     expected = VoteObject(**record_fields)

     actual = self.dao.getById(1)

     assert expected == actual
Esempio n. 4
0
 def _helper_insert_n_records(self, count: int):
     """Write ``count`` unprocessed ``VoteObject`` records through ``self.dao``.

     The record at 0-based position ``index`` gets ``vote_id`` ``index + 1``
     and a blob and source URL that embed ``index``.

     Returns:
         The written objects, in insertion order.
     """
     written = []
     for index in range(count):
         record = VoteObject(
             vote_id=index + 1,
             blob=f"foo_blob_{index}",
             sourceUrl=f"http://foo.localhost.com?foo={index}",
             sourceFormat=SourceFormat.PDF.name,
             sourceType=SourceType.CT_STATE_GOV.name,
             isProcessed=0,
         )
         self.dao.write(record)
         written.append(record)
     return written
Esempio n. 5
0
    def test_itCanGetProcessed(self):
        """After marking a written record processed, ``getProcessed`` returns it."""
        seed = TEST_VOTE_OBJECT
        self.dao.write(seed)
        self.dao.markProcessedBySourceUrl(seed.sourceUrl)

        # Same fields as the seed record, but flagged processed and with id 1.
        expected = VoteObject(
            blob=seed.blob,
            sourceUrl=seed.sourceUrl,
            sourceType=seed.sourceType,
            sourceFormat=seed.sourceFormat,
            isProcessed=1,
            vote_id=1,
        )

        actual = self.dao.getProcessed()

        assert 1 == len(actual)
        assert expected == actual[0]
def test_itShouldThrowOnUnprocessablePdfs():
    """Processing the ``bad_pdf.pdf`` fixture must raise ``PdfProcessorException``."""
    bad_blob = _load_pdf("bad_pdf.pdf")
    vote_fields = {
        "blob": bad_blob,
        "sourceUrl": "https://repp.localhost/foo/foo.pdf",
        "sourceType": "CT_STATE_GOV",
        "sourceFormat": "PDF",
        "isProcessed": 0,
        "vote_id": 1,
    }
    with pytest.raises(PdfProcessorException):
        # The VoteObject is constructed inside the block so the set of
        # statements covered by the expectation stays the same.
        unprocessable = VoteObject(**vote_fields)
        processor.process_blob(unprocessable)
Esempio n. 7
0
    def test_itCanGetAllWithIsProcessedFilter(self):
        """``getAll`` with an ``isProcessed=1`` filter returns only the processed record."""
        # URL of the second inserted record (index 1, vote_id 2).
        processed_url = "http://foo.localhost.com?foo=1"
        expected = VoteObject(
            blob="foo_blob_1",
            sourceUrl=processed_url,
            sourceFormat=SourceFormat.PDF.name,
            sourceType=SourceType.CT_STATE_GOV.name,
            isProcessed=1,
            vote_id=2,
        )

        self._helper_insert_n_records(10)
        self.dao.markProcessedBySourceUrl(processed_url)

        only_processed = VoteObjectFilter(isProcessed=1)
        all_objects = self.dao.getAll(q_filter=only_processed)

        assert [expected] == all_objects
Esempio n. 8
0
    def crawl(self):
        """Download every not-yet-ingested vote object and persist it.

        Fetches the list of download URLs, skips any URL the DAO reports as
        already ingested, downloads the rest one by one, and writes each blob
        to the DAO as a ``VoteObject``. A download that raises
        ``requests.exceptions.HTTPError`` is logged and skipped so the
        remaining URLs are still processed.
        """
        logging.info("CtGovCrawler crawl")
        vote_object_urls = self._get_vote_object_download_urls()
        total = len(vote_object_urls)
        logging.info(f"Got {total} to ingest")
        for num, url in enumerate(vote_object_urls):
            if self.voteObjectDao.isUrlIngested(url):
                logging.info(f"Skipping request for already ingested url {url}")
                continue
            try:
                logging.info(f"Get pdf {num}/{total}")
                blob = self._download_vote_object(url)
            # The exception class is spelled ``HTTPError``; the previous
            # ``HttpError`` attribute does not exist on requests.exceptions,
            # so any failure here surfaced as an AttributeError instead.
            except requests.exceptions.HTTPError:
                logging.exception(
                    "HTTPError caught when making request to resource server")
                continue

            # NOTE(review): the enum ``.value`` is stored here, while other
            # code in this project builds VoteObject with the enum ``.name``;
            # confirm which representation the DAO expects.
            vote_object = VoteObject(blob=blob,
                                     sourceUrl=url,
                                     sourceType=SourceType.CT_STATE_GOV.value,
                                     sourceFormat=SourceFormat.PDF.value)
            self.voteObjectDao.write(vote_object)
def test_itShouldProcessAPDFBlob():
    """Process the ``ct_normal_01.pdf`` fixture and check the parsed result.

    Verifies the timestamp (within a tolerance), the bill number, the vote
    name, and the ordered (representative name, vote) pairs.
    """
    pdf_blob = _load_pdf("ct_normal_01.pdf")
    expected_pairs = [
        ('JOHN W. FONFARA', 'Y'), ('CATHERINE A. OSTEN', 'Y'), ('DOUGLAS MCCRORY', 'Y'),
        ('PAUL M. FORMICA', 'N'), ('SAUD ANWAR', 'Y'), ('KEVIN KELLY', 'N'),
        ('STEVE CASSANO', 'Y'), ('MARILYN MOORE', 'Y'), ('DEREK SLAP', 'Y'),
        ('DENNIS BRADLEY', 'Y'), ('GENNARO BIZZARRO', 'N'), ('JULIE KUSHNER', 'Y'),
        ('JOHN A. KISSEL', 'N'), ('BOB DUFF', 'Y'), ('KEVIN D. WITKOS', 'N'),
        ('WILL HASKELL', 'Y'), ('MATTHEW LESSER', 'Y'), ('CARLO LEONE', 'Y'),
        ('GARY WINFIELD', 'Y'), ('TONY HWANG', 'N'), ('MARTIN M. LOONEY', 'Y'),
        ('MAE FLEXER', 'Y'), ('CHRISTINE COHEN', 'Y'), ('CRAIG MINER', 'N'),
        ('MARY ABRAMS', 'Y'), ('HENRI MARTIN', 'N'), ('JAMES MARONEY', 'Y'),
        ('ERIC BERTHEL', 'N'), ('JOAN V. HARTLEY', 'N'), ('NORM NEEDLEMAN', 'Y'),
        ('ROBERT SAMPSON', 'N'), ('LEONARD FASANO', 'N'), ('GEORGE LOGAN', 'N'),
        ('DAN CHAMPAGNE', 'N'), ('HEATHER SOMERS', 'N'), ('ALEX KASSER', 'Y'),
    ]

    vote_object = VoteObject(
        blob=pdf_blob,
        sourceUrl="https://repp.localhost/2020/foo/2020SV-00052-R00HB-6004-SV.PDF",
        sourceType="CT_STATE_GOV",
        sourceFormat="PDF",
        isProcessed=0,
        vote_id=1,
    )
    result = processor.process_blob(vote_object)

    # The parsed timestamp is only required to land within 20000 seconds of
    # the reference value; the cause of the drift has not been investigated.
    # NOTE(review): presumably a timezone offset in the date parsing - confirm.
    assert abs(result.unixTime - 1595995200.0) < 20000
    assert result.billNumber == 'HB-6004'
    assert result.voteName == 'HB-6004'

    # Pair each representative name with the vote at the same position.
    actual_pairs = list(zip(result.repName, result.repVote))
    assert actual_pairs == expected_pairs
Esempio n. 10
0
from rep.dao.VoteObjectDao import VoteObjectDao
from rep.dataclasses.VoteObject import VoteObject
from rep.dataclasses.VoteObjectFilter import VoteObjectFilter
from rep.dataclasses.Enums import SourceFormat, SourceType
from . import get_test_dao

# Shared unprocessed fixture record. No vote_id is passed here.
# NOTE(review): presumably the DAO assigns the id on write - confirm.
TEST_VOTE_OBJECT = VoteObject(
    sourceUrl="http://foo.localhost.com/VOTE/foo.pdf",
    sourceType=SourceType.CT_STATE_GOV.name,
    sourceFormat=SourceFormat.PDF.name,
    blob="foo_blob",
    isProcessed=0,
)


class Test:
    def setup_method(self, test_method):
        """Pytest setup hook: attach a DAO built by ``get_test_dao`` to ``self.dao``.

        ``test_method`` is supplied by pytest and is not used here.
        """
        # NOTE(review): the meaning of the second positional argument (True)
        # is defined by get_test_dao and is not visible here - confirm.
        fresh_dao = get_test_dao(VoteObjectDao, True)
        self.dao = fresh_dao

    def _helper_insert_n_records(self, count: int):
        """Write ``count`` unprocessed ``VoteObject`` records through ``self.dao``.

        The record at 0-based position ``index`` gets ``vote_id`` ``index + 1``
        and a blob and source URL that embed ``index``.

        Returns:
            The written objects, in insertion order.
        """
        written = []
        for index in range(count):
            record = VoteObject(
                vote_id=index + 1,
                blob=f"foo_blob_{index}",
                sourceUrl=f"http://foo.localhost.com?foo={index}",
                sourceFormat=SourceFormat.PDF.name,
                sourceType=SourceType.CT_STATE_GOV.name,
                isProcessed=0,
            )
            self.dao.write(record)
            written.append(record)
        return written