def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta["Report Title"].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            b.add_abstract(meta["Description"], "description")
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
        companion = meta["Companion"].strip()
        if companion:
            b.add_related_bill(
                identifier=companion.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
        if bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        ):
            prior = bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
            )[-1]
            if "carried over" in prior.lower():
                b.add_related_bill(
                    identifier=bill_id.replace(u"\xa0", " "),
                    legislative_session=prior_session,
                    relation_type="companion",
                )
        for sponsor in meta["Introducer(s)"]:
            if "(Introduced by request of another party)" in sponsor:
                sponsor = sponsor.replace(
                    " (Introduced by request of another party)", "")
            b.add_sponsorship(sponsor, "primary", "person", True)

        self.parse_bill_versions_table(b, versions)
        self.parse_testimony(b, bill_page)
        self.parse_cmte_reports(b, bill_page)

        yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                                 session, url, chamber)
        yield b
Esempio n. 2
0
    def _scrape_bill(self, session, bill_data):
        details = self._parse_bill_details(bill_data)

        if details is None:
            return

        (
            senate_url,
            assembly_url,
            bill_chamber,
            bill_type,
            bill_id,
            title,
            (prefix, number, active_version),
        ) = details

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title or bill_data["summary"],
            classification=bill_type,
        )

        if bill_data["summary"]:
            bill.add_abstract(bill_data["summary"], note="")

        bill_active_version = None

        if active_version != "":
            bill_active_version = bill_data["amendments"]["items"][active_version]
        else:
            self.warning("No active version for {}".format(bill_id))

        # Parse sponsors.
        if bill_data["sponsor"] is not None:
            if bill_data["sponsor"]["rules"] is True:
                bill.add_sponsorship(
                    "Rules Committee",
                    entity_type="organization",
                    classification="primary",
                    primary=True,
                )
            elif not bill_data["sponsor"]["budget"]:
                primary_sponsor = bill_data["sponsor"]["member"]
                bill.add_sponsorship(
                    primary_sponsor["shortName"],
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

                if bill_active_version:
                    # There *shouldn't* be cosponsors if there is no sponsor.
                    cosponsors = bill_active_version["coSponsors"]["items"]
                    for cosponsor in cosponsors:
                        bill.add_sponsorship(
                            cosponsor["shortName"],
                            entity_type="person",
                            classification="cosponsor",
                            primary=False,
                        )

        if bill_active_version:
            # List companion bill.
            same_as = bill_active_version.get("sameAs", {})
            # Check whether "sameAs" property is populated with at least one bill.
            if same_as["items"]:
                # Get companion bill ID.
                companion_bill_id = same_as["items"][0]["basePrintNo"]

                # Build companion bill session.
                start_year = same_as["items"][0]["session"]
                end_year = start_year + 1
                companion_bill_session = "-".join([str(start_year), str(end_year)])

                # Attach companion bill data.
                bill.add_related_bill(
                    companion_bill_id, companion_bill_session, relation_type="companion"
                )

        # Parse actions.
        chamber_map = {"senate": "upper", "assembly": "lower"}

        for action in bill_data["actions"]["items"]:
            chamber = chamber_map[action["chamber"].lower()]
            action_datetime = datetime.datetime.strptime(action["date"], "%Y-%m-%d")
            action_date = action_datetime.date()
            types, _ = NYBillScraper.categorizer.categorize(action["text"])

            bill.add_action(
                action["text"],
                action_date.strftime("%Y-%m-%d"),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources["bill"].format(
            session_year=session, bill_id=bill_id, summary="", detail=""
        )
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        for vote_data in bill_data["votes"]["items"]:
            yield self._parse_senate_votes(vote_data, bill, api_url)
        yield from self.scrape_assembly_votes(session, bill, assembly_url, bill_id)

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        amendments = bill_data["amendments"]["items"]
        for key, amendment in amendments.items():
            version = amendment["printNo"]

            html_url = (
                "http://assembly.state.ny.us/leg/?sh=printbill&bn="
                "{}&term={}&Text=Y".format(bill_id, self.term_start_year)
            )
            bill.add_version_link(
                version, html_url, on_duplicate="ignore", media_type="text/html"
            )

            pdf_url = "http://legislation.nysenate.gov/pdf/bills/{}/{}".format(
                self.term_start_year, version
            )
            bill.add_version_link(
                version, pdf_url, on_duplicate="ignore", media_type="application/pdf"
            )

        yield bill
Esempio n. 3
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find("-")
        subjects = [s.strip() for s in title[:subject_pos - 1].split(",")]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = (page.xpath('//span[@id="lblCompNumber"]/a')
                            [0].text_content().strip())
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = (page.xpath("//span[@id='lblBillPrimeSponsor']")
                   [0].text_content().split("by")[-1])
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(sponsor,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link("Current Version",
                              btext.get("href"),
                              media_type="application/pdf")

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_version_link(
                "Amendment " + amendment.text,
                amendment.get("href"),
                media_type="application/pdf",
            )
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(afn.get("alt"),
                                   afn.getparent().get("href"),
                                   on_duplicate="ignore")

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")
                [0].text_content().split("by")[-1])
            secondary_sponsor = (secondary_sponsor.replace("*", "").replace(
                ")", "").strip())
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            if page.xpath("//table[@id='gvCoActionHistory']"):
                cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
                actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
Esempio n. 4
0
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.to_csv("MAINBILL.TXT")

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec["IdenticalBillNumber"].strip():
                bill.add_related_bill(
                    rec["IdenticalBillNumber"].split()[0],
                    legislative_session=session,
                    relation_type="companion",
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.to_csv("BILLSPON.TXT")

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in sponsor database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == "P":
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(
                name,
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Documents
        bill_document_csv = self.to_csv("BILLWP.TXT")

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in document database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split("\\")
            document = document[-2] + "/" + document[-1]

            htm_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
                year_abr, document.replace(".DOC", ".HTM"))
            pdf_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
                year_abr, document.replace(".DOC", ".PDF"))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec["DocType"]]
            except KeyError:
                raise Exception("unknown doctype %s on %s" %
                                (rec["DocType"], bill_id))
            if rec["Comment"]:
                doc_name += " " + rec["Comment"]

            # Clean links.
            if htm_url.endswith("HTMX"):
                htm_url = re.sub("X$", "", htm_url)
            if pdf_url.endswith("PDFX"):
                pdf_url = re.sub("X$", "", pdf_url)

            if rec["DocType"] in self._version_types:
                if htm_url.lower().endswith("htm"):
                    mimetype = "text/html"
                elif htm_url.lower().endswith("wpd"):
                    mimetype = "application/vnd.wordperfect"
                try:
                    bill.add_version_link(doc_name,
                                          htm_url,
                                          media_type=mimetype)
                    bill.add_version_link(doc_name,
                                          pdf_url,
                                          media_type="application/pdf")
                except ValueError:
                    self.warning(
                        "Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            "A%s" % year_abr,
            "A%s" % next_year,
            "S%s" % year_abr,
            "S%s" % next_year,
            "CA%s-%s" % (year_abr, next_year),
            "CS%s-%s" % (year_abr, next_year),
        ]
        # keep votes clean globally, a few votes show up in multiple files
        votes = {}

        for filename in vote_info_list:
            s_vote_url = f"https://www.njleg.state.nj.us/votes/{filename}.zip"
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.HTTPError:
                self.warning("could not find %s" % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile, "r"),
                                                 encoding="latin-1")
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)
                if filename.startswith("A") or filename.startswith("CA"):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith("C"):
                    vote_file_type = "committee"
                else:
                    vote_file_type = "chamber"

                for rec in vdict_file:
                    if vote_file_type == "chamber":
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = "%s%s" % (rec["Bill_Type"],
                                            rec["Bill_Number"])
                        leg = rec["Name"]
                        # drop time portion
                        date = rec["Agenda_Date"].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec["BillAction"]]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec["LegislatorVote"][0:1]
                        committee = rec["Committee_House"]
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = "_".join(vote_parts).replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification="passage",
                            result=None,
                            bill=bill_dict[bill_id],
                        )
                        votes[vote_id].dedupe_key = vote_id
                    if leg_vote == "Y":
                        votes[vote_id].vote("yes", leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote("no", leg)
                    else:
                        votes[vote_id].vote("other", leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count["option"]] += 1
                vote.set_count("yes", counts["yes"])
                vote.set_count("no", counts["no"])
                vote.set_count("other", counts["other"])

                # Veto override.
                if vote.motion_text == "OVERRIDE":
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if "lower" in vote.bill:
                        vote.result = "pass" if counts["yes"] >= 54 else "fail"
                    elif "upper" in vote.bill:
                        vote.result = "pass" if counts["yes"] >= 27 else "fail"
                else:
                    # Regular vote.
                    vote.result = "pass" if counts["yes"] > counts[
                        "no"] else "fail"

                vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield vote

        # Actions
        bill_action_csv = self.to_csv("BILLHIST.TXT")
        actor_map = {"A": "lower", "G": "executive", "S": "upper"}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in action database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = dateutil.parser.parse(date)
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += " " + comment
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.to_csv("BILLSUBJ.TXT")
        for rec in subject_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in subject database" % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec["SubjectKey"])
            else:
                self.warning("invalid bill id in BillSubj: %s" % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning("probable phony bill detected %s",
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield bill

        if phony_bill_count:
            self.warning("%s total phony bills detected", phony_bill_count)
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(jurisdiction_id="jid",
                                        name="House",
                                        classification="lower")
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )

    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    bill = ScrapeBill("HB 1",
                      "1900",
                      "Axe & Tack Tax Act",
                      classification="tax bill",
                      chamber="lower")
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee",
                          "1900-04-04",
                          chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship("Jane Smith",
                         classification="lead sponsor",
                         entity_type="person",
                         primary=True)
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.pdf",
                           media_type="application/pdf")
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.html",
                           media_type="text/html")
    bill.add_version_link("Fiscal Note",
                          "http://example.com/v/1",
                          media_type="text/html")
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1