Ejemplo n.º 1
0
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)
        page.make_links_absolute(self.BASE_URL)

        if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()')[
                0
            ].strip()
        elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
        else:
            self.warning("No bill id for {}".format(url))
            return
        title = page.xpath(
            '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
        )[0].strip()

        if "B" in bill_id:
            _type = ["bill"]
        elif "J" in bill_id:
            _type = ["joint resolution"]
        elif "HS" in bill_id:
            _type = ["resolution"]
        else:
            raise ValueError("unknown bill type " + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=_type,
        )
        bill.add_source(url)

        self.scrape_bill_subjects(bill, page)
        self.scrape_bill_sponsors(bill, page)
        self.scrape_bill_actions(bill, page)

        # fiscal note
        if page.xpath('//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
            fiscal_note = page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
            )[0]
            fiscal_url = fiscal_note.get("href")
            fiscal_title = fiscal_note.text_content()
            bill.add_document_link(
                fiscal_title, fiscal_url, media_type="application/pdf",
            )

        # effective date, where available
        if page.xpath('//div[contains(text(), "Effective Date(s)")]'):
            eff_date = page.xpath('//div[contains(text(), "Effective Date(s)")]/text()')[0].strip()
            eff_date = eff_date.replace('Effective Date(s):', '').strip()
            # this can contain multiple dates, eg "July 1, 2020, July 1, 2022"
            bill.extras['date_effective'] = eff_date

        # yield from self.parse_bill_votes_new(doc, bill)
        yield bill
Ejemplo n.º 2
0
    def scrape_prefiles(self, session):
        url = 'https://www.legis.iowa.gov/legislation/billTracking/prefiledBills'
        page = lxml.html.fromstring(self.get(url).content)
        page.make_links_absolute(url)

        for row in page.xpath('//table[contains(@class, "sortable")]/tr[td]'):
            title = row.xpath('td[2]/a/text()')[0].strip()
            url = row.xpath('td[2]/a/@href')[0]

            bill_id = self.extract_doc_id(title)

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber='legislature',
                title=title,
                classification='proposed bill',
            )

            if (row.xpath('td[3]/a')):
                document_url = row.xpath('td[3]/a/@href')[0]
                if '.docx' in document_url:
                    media_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                elif '.pdf' in document_url:
                    media_type = 'application/pdf'
                bill.add_document_link(
                    note="Backround Statement",
                    url=document_url,
                    media_type=media_type
                )

            bill.add_version_link(
                note="Prefiled",
                url=url,
                media_type="application/pdf"
            )

            bill.add_source(url)

            yield bill
Ejemplo n.º 3
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = (
            "http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml"
            % session)

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath("//LASTACTION/MSRGROUP"):
            bill_id = mr.xpath("string(MEASURE)").replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {
                "B": "bill",
                "C": "concurrent resolution",
                "R": "resolution",
                "N": "nomination",
            }[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath("string(ACTIONLINK)").replace("..", "")
            main_doc = mr.xpath("string(MEASURELINK)").replace("../../../", "")
            main_doc_url = "http://billstatus.ls.state.ms.us/%s" % main_doc
            bill_details_url = "http://billstatus.ls.state.ms.us/%s/pdf%s" % (
                session,
                link,
            )
            try:
                details_page = self.get(bill_details_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Bill page not loading for {}; skipping".format(bill_id))
                continue

            page = details_page.content
            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath("string(//SHORTTITLE)")
            longtitle = details_root.xpath("string(//LONGTITLE)")

            if title == "":
                self.warning(f"No title yet for {bill_id}, skipping")
                return

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type,
            )
            bill.extras["summary"] = longtitle
            bill.add_source(main_doc_url)
            # sponsors
            main_sponsor = details_root.xpath("string(//P_NAME)").split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath(
                    "string(//P_LINK)").replace(" ", "_")
                main_sponsor_url = ("http://billstatus.ls.state.ms.us/%s/"
                                    "pdf/%s") % (
                                        session,
                                        main_sponsor_link.strip("../"),
                                    )
                type = "primary"
                bill.add_source(main_sponsor_url)
                bill.add_sponsorship(
                    self.clean_voter_name(main_sponsor),
                    classification=type,
                    entity_type="person",
                    primary=True,
                )

            for author in details_root.xpath("//AUTHORS/ADDITIONAL"):
                leg = author.xpath("string(CO_NAME)").replace(" ", "_")
                if leg:
                    leg_url = ("http://billstatus.ls.state.ms.us/%s/"
                               "pdf/House_authors/%s.xml") % (session, leg)
                    type = "cosponsor"
                    bill.add_source(leg_url)
                    bill.add_sponsorship(
                        self.clean_voter_name(leg),
                        classification=type,
                        entity_type="person",
                        primary=False,
                    )
            # Versions
            curr_version = details_root.xpath("string(//CURRENT_OTHER"
                                              ")").replace("../../../../", "")
            if curr_version != "":
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version_link(
                    "Current version",
                    curr_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                curr_pdf_url = re.sub("html?", "pdf", curr_version_url)
                bill.add_version_link(
                    "Current version",
                    curr_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            intro_version = details_root.xpath(
                "string(//INTRO_OTHER)").replace("../../../../", "")
            if intro_version != "":
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version_link(
                    "As Introduced",
                    intro_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                intro_pdf_url = re.sub("html?", "pdf", intro_version_url)
                bill.add_version_link(
                    "As Introduced",
                    intro_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            comm_version = details_root.xpath("string(//CMTESUB_OTHER"
                                              ")").replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version_link(
                    "Committee Substitute",
                    comm_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                comm_pdf_url = re.sub("html?", "pdf", comm_version_url)
                bill.add_version_link(
                    "Committee Substitute",
                    comm_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            passed_version = details_root.xpath("string(//PASSED_OTHER"
                                                ")").replace(
                                                    "../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = ("http://billstatus.ls.state.ms.us/" +
                                      passed_version)
                title = "As Passed the " + chamber
                bill.add_version_link(
                    title,
                    passed_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                passed_pdf_url = re.sub("html?", "pdf", passed_version_url)
                bill.add_version_link(
                    title,
                    passed_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            asg_version = details_root.xpath("string(//ASG_OTHER)").replace(
                "../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version_link(
                    "Approved by the Governor",
                    asg_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                asg_pdf_url = re.sub("html?", "pdf", asg_version_url)
                bill.add_version_link(
                    "Approved by the Governor",
                    asg_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            # amendments
            # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
            for amd in details_root.xpath("//AMENDMENTS/*"):
                if amd.tag == "HAM":
                    name = amd.xpath("HAM_DESC[1]/text()")[0]
                    name = append_parens(amd, "HAM_DISP", name)
                    name = append_parens(amd, "HAM_VDESC", name)

                    pdf_url = amd.xpath("string(HAM_PDF"
                                        ")").replace("../", "")

                    html_url = amd.xpath("string(HAM_OTHER"
                                         ")").replace("../", "")
                elif amd.tag == "SAM":
                    name = amd.xpath("SAM_DESC[1]/text()")[0]
                    name = append_parens(amd, "SAM_DISP", name)
                    name = append_parens(amd, "SAM_VDESC", name)

                    pdf_url = amd.xpath("string(SAM_PDF"
                                        ")").replace("../", "")

                    html_url = amd.xpath("string(SAM_OTHER"
                                         ")").replace("../", "")
                elif amd.tag == "AMRPT":
                    name = amd.xpath("AMRPT_DESC[1]/text()")[0]
                    pdf_url = amd.xpath("string(AMRPT_PDF"
                                        ")").replace("../", "")

                    html_url = amd.xpath("string(AMRPT_OTHER"
                                         ")").replace("../", "")

                pdf_url = "http://billstatus.ls.state.ms.us/" + pdf_url
                html_url = "http://billstatus.ls.state.ms.us/" + html_url

                if "adopted" in name.lower(
                ) or "amendment report" in name.lower():
                    bill.add_version_link(
                        name,
                        pdf_url,
                        on_duplicate="ignore",
                        media_type="application/pdf",
                    )
                    bill.add_version_link(name,
                                          html_url,
                                          on_duplicate="ignore",
                                          media_type="text/html")

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action in details_root.xpath("//HISTORY/ACTION"):
                # action_num  = action.xpath('string(ACT_NUMBER)').strip()
                # action_num = int(action_num)
                act_vote = action.xpath("string(ACT_VOTE)").replace(
                    "../../../..", "")
                action_desc = action.xpath("string(ACT_DESC)")
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action and actor == "executive":
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document_link("Veto", version_url)

                atype = "other"
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(
                    action,
                    self._tz.localize(date),
                    chamber=actor,
                    classification=atype if atype != "other" else None,
                )

                # use committee names as scraped subjects
                subjects = details_root.xpath("//H_NAME/text()")
                subjects += details_root.xpath("//S_NAME/text()")

                for subject in subjects:
                    if subject not in bill.subject:
                        bill.add_subject(subject)

                if act_vote:
                    vote_url = "http://billstatus.ls.state.ms.us%s" % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        yield from self.scrape_votes(vote_url, action, date,
                                                     actor, bill)

            bill.add_source(bill_details_url)
            yield bill
Ejemplo n.º 4
0
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.to_csv("MAINBILL.TXT")

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec["IdenticalBillNumber"].strip():
                bill.add_related_bill(
                    rec["IdenticalBillNumber"].split()[0],
                    legislative_session=session,
                    relation_type="companion",
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.to_csv("BILLSPON.TXT")

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in sponsor database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == "P":
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(
                name,
                classification=sponsor_type,
                entity_type="person",
                primary=sponsor_type == "primary",
            )

        # Documents
        bill_document_csv = self.to_csv("BILLWP.TXT")

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in document database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split("\\")
            document = document[-2] + "/" + document[-1]

            htm_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
                year_abr, document.replace(".DOC", ".HTM"))
            pdf_url = "https://www.njleg.state.nj.us/Bills/{}/{}".format(
                year_abr, document.replace(".DOC", ".PDF"))

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec["DocType"]]
            except KeyError:
                raise Exception("unknown doctype %s on %s" %
                                (rec["DocType"], bill_id))
            if rec["Comment"]:
                doc_name += " " + rec["Comment"]

            # Clean links.
            if htm_url.endswith("HTMX"):
                htm_url = re.sub("X$", "", htm_url)
            if pdf_url.endswith("PDFX"):
                pdf_url = re.sub("X$", "", pdf_url)

            if rec["DocType"] in self._version_types:
                if htm_url.lower().endswith("htm"):
                    mimetype = "text/html"
                elif htm_url.lower().endswith("wpd"):
                    mimetype = "application/vnd.wordperfect"
                try:
                    bill.add_version_link(doc_name,
                                          htm_url,
                                          media_type=mimetype)
                    bill.add_version_link(doc_name,
                                          pdf_url,
                                          media_type="application/pdf")
                except ValueError:
                    self.warning(
                        "Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            "A%s" % year_abr,
            "A%s" % next_year,
            "S%s" % year_abr,
            "S%s" % next_year,
            "CA%s-%s" % (year_abr, next_year),
            "CS%s-%s" % (year_abr, next_year),
        ]
        # keep votes clean globally, a few votes show up in multiple files
        votes = {}

        for filename in vote_info_list:
            s_vote_url = f"https://www.njleg.state.nj.us/votes/{filename}.zip"
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.HTTPError:
                self.warning("could not find %s" % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile, "r"),
                                                 encoding="latin-1")
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)
                if filename.startswith("A") or filename.startswith("CA"):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith("C"):
                    vote_file_type = "committee"
                else:
                    vote_file_type = "chamber"

                for rec in vdict_file:
                    if vote_file_type == "chamber":
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = "%s%s" % (rec["Bill_Type"],
                                            rec["Bill_Number"])
                        leg = rec["Name"]
                        # drop time portion
                        date = rec["Agenda_Date"].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec["BillAction"]]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec["LegislatorVote"][0:1]
                        committee = rec["Committee_House"]
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = "_".join(vote_parts).replace(" ", "_")

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification="passage",
                            result=None,
                            bill=bill_dict[bill_id],
                        )
                        votes[vote_id].dedupe_key = vote_id
                    if leg_vote == "Y":
                        votes[vote_id].vote("yes", leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote("no", leg)
                    else:
                        votes[vote_id].vote("other", leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count["option"]] += 1
                vote.set_count("yes", counts["yes"])
                vote.set_count("no", counts["no"])
                vote.set_count("other", counts["other"])

                # Veto override.
                if vote.motion_text == "OVERRIDE":
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if "lower" in vote.bill:
                        vote.result = "pass" if counts["yes"] >= 54 else "fail"
                    elif "upper" in vote.bill:
                        vote.result = "pass" if counts["yes"] >= 27 else "fail"
                else:
                    # Regular vote.
                    vote.result = "pass" if counts["yes"] > counts[
                        "no"] else "fail"

                vote.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield vote

        # Actions
        bill_action_csv = self.to_csv("BILLHIST.TXT")
        actor_map = {"A": "lower", "G": "executive", "S": "upper"}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in action database" % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = dateutil.parser.parse(date)
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += " " + comment
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.to_csv("BILLSUBJ.TXT")
        for rec in subject_csv:
            bill_id = rec["BillType"].strip() + str(int(rec["BillNumber"]))
            if bill_id not in bill_dict:
                self.warning("unknown bill %s in subject database" % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec["SubjectKey"])
            else:
                self.warning("invalid bill id in BillSubj: %s" % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning("probable phony bill detected %s",
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source("http://www.njleg.state.nj.us/downloads.asp")
                yield bill

        if phony_bill_count:
            self.warning("%s total phony bills detected", phony_bill_count)
Ejemplo n.º 5
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = (
            "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
            "{}?calendarDate=".format(session, bill_num)
        )

        if self.is_special:
            bill_json_url = (
                "http://wyoleg.gov/LsoService/api/BillInformation/{}/"
                "{}?specialSessionValue=1&calendarDate=".format(session[0:4], bill_num)
            )

        try:
            response = self.get(bill_json_url)
            bill_json = json.loads(response.content.decode("utf-8"))
        except scrapelib.HTTPError:
            return None

        chamber = "lower" if bill_json["bill"][0] else "upper"

        bill = Bill(
            identifier=bill_json["bill"],
            legislative_session=session,
            title=bill_json["catchTitle"],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json["billTitle"])

        source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format(
            session, bill_json["bill"]
        )

        if self.is_special:
            source_url = "http://lso.wyoleg.gov/Legislation/{}/{}?specialSessionValue=1".format(
                session[0:4], bill_json["bill"]
            )

        bill.add_source(source_url)

        for action_json in bill_json["billActions"]:
            utc_action_date = self.parse_local_date(action_json["statusDate"])

            actor = None
            if action_json["location"] and action_json["location"] in chamber_map:
                actor = chamber_map[action_json["location"]]

            action = bill.add_action(
                chamber=actor,
                description=action_json["statusMessage"],
                date=utc_action_date,
                classification=categorize_action(action_json["statusMessage"]),
            )

            action.extras = {"billInformationID": action_json["billInformationID"]}

        if bill_json["introduced"]:
            url = "http://wyoleg.gov/{}".format(bill_json["introduced"])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["enrolledAct"]:
            url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["fiscalNote"]:
            url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["digest"]:
            url = "http://wyoleg.gov/{}".format(bill_json["digest"])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf",  # optional but useful!
            )

        if bill_json["vetoes"]:
            for veto in bill_json["vetoes"]:
                url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"])
                bill.add_version_link(
                    note=veto["vetoLinkText"],
                    url=url,
                    media_type="application/pdf",  # optional but useful!
                )

        for amendment in bill_json["amendments"]:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            # TODO: There are no special session amendments yet,
            # but check this url format for specials
            url = "http://wyoleg.gov/{}/Amends/{}.pdf".format(
                session[0:4], amendment["amendmentNumber"]
            )

            if amendment["sponsor"] and amendment["status"]:
                title = "Amendment {} ({}) - {} ({})".format(
                    amendment["amendmentNumber"],
                    amendment["order"],
                    amendment["sponsor"],
                    amendment["status"],
                )
            else:
                title = "Amendment {} ({})".format(
                    amendment["amendmentNumber"], amendment["order"]
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title, url=url, media_type="application/pdf"
            )
            version["extras"] = {
                "amendmentNumber": amendment["amendmentNumber"],
                "sponsor": amendment["sponsor"],
            }

        for sponsor in bill_json["sponsors"]:
            status = "primary" if sponsor["primarySponsor"] else "cosponsor"
            sponsor_type = "person" if sponsor["sponsorTitle"] else "organization"
            bill.add_sponsorship(
                name=sponsor["name"],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor["primarySponsor"],
            )

        if bill_json["summary"]:
            bill.add_abstract(note="summary", abstract=bill_json["summary"])

        if bill_json["enrolledNumber"]:
            bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"]

        if bill_json["chapter"]:
            bill.extras["chapter"] = bill_json["chapter"]

        if bill_json["effectiveDate"]:
            eff = datetime.datetime.strptime(bill_json["effectiveDate"], "%m/%d/%Y")
            bill.extras["effective_date"] = eff.strftime("%Y-%m-%d")

        bill.extras["wy_bill_id"] = bill_json["id"]

        for vote_json in bill_json["rollCalls"]:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Ejemplo n.º 6
0
    def scrape(self, session=None):
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        for category in self._categories:
            leg_listing_url = (
                self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
            )
            resp = requests.post(leg_listing_url, headers=self._headers, verify=False,)
            resp.raise_for_status()
            leg_listing = resp.json()

            for leg in leg_listing:

                bill = Bill(
                    leg["legislationNumber"],
                    legislative_session=session,
                    title=leg["title"],
                    classification=category["name"],
                )
                bill.add_source(leg_listing_url)
                bill_url = (
                    f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
                )
                bill.add_source(bill_url)

                if leg['lawNumber']:
                    bill.extras['lawNumber'] = leg['lawNumber']

                # Actions
                for hist in leg["legislationHistory"]:
                    hist_date = datetime.datetime.strptime(
                        hist["actionDate"], "%b %d, %Y"
                    )
                    hist_date = self._TZ.localize(hist_date)
                    hist_action = hist["actionDescription"]
                    if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                        hist_action = hist_action[5:]
                    hist_class = self.classify_action(hist_action)

                    if "mayor" in hist_action.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"
                    bill.add_action(
                        hist_action, hist_date, classification=hist_class, chamber=actor
                    )

                    # Documents with download links
                    if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                        download = hist["downloadURL"]
                        if not download.startswith("http"):
                            download = "https://lims.dccouncil.us/" + download

                        mimetype = (
                            "application/pdf" if download.endswith("pdf") else None
                        )
                        is_version = False
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in download.lower():
                                is_version = True
                                doc_type = vt

                        if "amendment" in download.lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                hist["actionDescription"],
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                # Grabs Legislation details
                leg_details_url = (
                    self._API_BASE_URL
                    + f"LegislationDetails/{leg['legislationNumber']}"
                )
                details_resp = requests.get(
                    leg_details_url, headers=self._headers, verify=False,
                )
                details_resp.raise_for_status()
                leg_details = details_resp.json()

                # Sponsors
                for i in leg_details["introducers"]:
                    name = i["memberName"]
                    bill.add_sponsorship(
                        name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                # Co-sponsor
                if leg_details["coSponsors"]:
                    for cs in leg_details["coSponsors"]:
                        name = i["memberName"]
                        bill.add_sponsorship(
                            name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=True,
                        )

                # Committee Hearing Doc
                for commHearing in leg_details["committeeHearing"]:
                    if commHearing["hearingRecord"]:
                        bill.add_document_link(
                            commHearing["hearingType"],
                            commHearing["hearingRecord"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                for committeeMarkup in leg_details["committeeMarkup"]:
                    if committeeMarkup["committeeReport"]:
                        bill.add_document_link(
                            "Committee Markup",
                            committeeMarkup["committeeReport"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                # Actions and Votes
                if leg_details["actions"]:
                    # To prevent duplicate votes
                    vote_ids = []
                    for act in leg_details["actions"]:
                        action_name = act["action"]
                        action_date = datetime.datetime.strptime(
                            act["actionDate"][:10], "%Y-%m-%d"
                        )
                        action_date = self._TZ.localize(action_date)

                        if action_name.split()[0] == "Other":
                            action_name = " ".join(action_name.split()[1:])

                        if "mayor" in action_name.lower():
                            actor = "executive"
                        else:
                            actor = "legislature"

                        # Documents and Versions
                        if act["attachment"]:
                            mimetype = (
                                "application/pdf"
                                if act["attachment"].endswith("pdf")
                                else None
                            )
                            is_version = False
                            # figure out if it's a version from type/name
                            possible_version_types = [
                                "SignedAct",
                                "Introduction",
                                "Enrollment",
                                "Engrossment",
                            ]
                            for vt in possible_version_types:
                                if vt.lower() in act["attachment"].lower():
                                    is_version = True
                                    doc_type = vt

                            if "amendment" in act["attachment"].lower():
                                doc_type = "Amendment"

                            if is_version:
                                bill.add_version_link(
                                    doc_type,
                                    act["attachment"],
                                    media_type=mimetype,
                                    on_duplicate="ignore",
                                )
                            else:
                                bill.add_document_link(
                                    doc_type,
                                    act["attachment"],
                                    media_type=mimetype,
                                    on_duplicate="ignore",
                                )

                        # Votes
                        if act["voteDetails"]:
                            result = act["voteDetails"]["voteResult"]
                            if result:
                                status = self._vote_statuses[result.lower()]
                                id_text = (
                                    str(leg["legislationNumber"])
                                    + "-"
                                    + action_name
                                    + "-"
                                    + result
                                )
                                if id_text not in vote_ids:
                                    vote_ids.append(id_text)
                                    action_class = self.classify_action(action_name)
                                    v = VoteEvent(
                                        identifier=id_text,
                                        chamber=actor,
                                        start_date=action_date,
                                        motion_text=action_name,
                                        result=status,
                                        classification=action_class,
                                        bill=bill,
                                    )
                                    v.add_source(leg_listing_url)

                                    yes_count = (
                                        no_count
                                    ) = absent_count = abstain_count = other_count = 0
                                    for leg_vote in act["voteDetails"]["votes"]:
                                        mem_name = leg_vote["councilMember"]
                                        if leg_vote["vote"] == "Yes":
                                            yes_count += 1
                                            v.yes(mem_name)
                                        elif leg_vote["vote"] == "No":
                                            no_count += 1
                                            v.no(mem_name)
                                        elif leg_vote["vote"] == "Absent":
                                            absent_count += 1
                                            v.vote("absent", mem_name)
                                        elif leg_vote["vote"] == "Recused":
                                            v.vote("abstain", mem_name)
                                            abstain_count += 1
                                        elif leg_vote["vote"] == "Present":
                                            v.vote("other", mem_name)
                                            other_count += 1
                                        else:
                                            # Incase anything new pops up
                                            other_count += 1
                                            v.vote("other", mem_name)

                                    v.set_count("yes", yes_count)
                                    v.set_count("no", no_count)
                                    v.set_count("absent", absent_count)
                                    v.set_count("abstain", abstain_count)
                                    v.set_count("other", other_count)
                                    yield v

                yield bill
Ejemplo n.º 7
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r"<.*?>"

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)["data"] or []

        bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)["data"] or [])

        resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
            year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)["data"] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info["BillNumber"].startswith("J.R.H."):
                bill_type = "joint resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("J.R.S."):
                bill_type = "joint resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.R."):
                bill_type = "resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.R."):
                bill_type = "resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("PR."):
                bill_type = "constitutional amendment"
                if info["Body"] == "H":
                    bill_chamber = "lower"
                elif info["Body"] == "S":
                    bill_chamber = "upper"
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info["BillNumber"].startswith("H."):
                bill_type = "bill"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S."):
                bill_type = "bill"
                bill_chamber = "upper"

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info["BillNumber"]))

            bill_id_original_format = (info["BillNumber"].replace(".",
                                                                  "").replace(
                                                                      " ", ""))

            bill_id = bill_id_original_format

            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info["Title"],
                classification=bill_type,
            )
            if "resolution" in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
                year_slug, info["BillNumber"])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                "following-sibling::dd[1]/ul/li")
            sponsor_type = "primary"
            for sponsor in sponsors:
                if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                    sponsor_type = "cosponsor"
                    continue

                sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                    "Rep.", "").replace("Sen.", "").strip())
                if sponsor_name and not (sponsor_name[:5] == "Less"
                                         and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type="person",
                        primary=(sponsor_type == "primary"),
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                "following-sibling::dd[1]/ul/li/a |"
                '//ul[@class="bill-path"]//a')

            for version in versions:
                if version.xpath("text()"):
                    bill.add_version_link(
                        note=version.xpath("text()")[0],
                        url=version.xpath("@href")[0].replace(" ", "%20"),
                        media_type="application/pdf",
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode("utf-8"),
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(
                    info["BillNumber"]))
                yield bill
                continue

            # Capture actions
            actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
                year_slug, internal_bill_id)
            actions_json = self.get(actions_url)

            # Checks if page actually has json posted
            if "json" in actions_json.headers.get("Content-Type"):
                actions = json.loads(actions_json.text)["data"]
                # Checks to see if any data is actually there
                if actions == "":
                    continue
            else:
                continue
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action["FullStatus"]:
                    actor = "executive"
                elif action["ChamberCode"] == "H":
                    actor = "lower"
                elif action["ChamberCode"] == "S":
                    actor = "upper"
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action["FullStatus"]:
                    # assert chambers_passed == set("HS")
                    action_type = "executive-signature"
                elif "Vetoed by the Governor" in action["FullStatus"]:
                    action_type = "executive-veto"
                elif ("Read first time" in action["FullStatus"]
                      or "Read 1st time" in action["FullStatus"]):
                    action_type = "introduction"
                elif "Reported favorably" in action["FullStatus"]:
                    action_type = "committee-passage-favorable"
                elif actor == "lower" and any(
                        x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("H")
                elif actor == "upper" and any(
                        x.lower().startswith(" aspassed")
                        or x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("S")
                else:
                    action_type = None

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.511
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0209", "/2019")

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.754
                if bill_id == "H 754" and session == "2019-2020":
                    action["StatusDate"] = action["StatusDate"].replace(
                        "/0202", "/2020")

                # https://legislature.vermont.gov/bill/status/2020/H.942
                if bill_id == "H 942" and session == "2019-2020":
                    action["StatusDate"] = action["StatusDate"].replace(
                        "/0200", "/2020")

                action_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(action["StatusDate"],
                                               "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                # strftime doesn't always pad year value (%Y)  (https://bugs.python.org/issue32195)
                # and sometimes this state has typos in year part of the StatusDate value
                # which can cause validation errors, so fix leading zeroes if they are missing
                if action_date.find("-") < 4:
                    action_date = ("0" *
                                   (4 - action_date.find("-"))) + action_date

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                    date=action_date,
                    chamber=actor,
                    classification=action_type,
                )

            # Capture votes
            votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)["data"]
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote["VoteHeaderID"]
                roll_call_url = ("http://legislature.vermont.gov/bill/"
                                 "loadBillRollCallDetails/{0}/{1}".format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)["data"]

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name,
                     _district) = member["MemberName"].split(" of ")
                    member_name = member_name.strip()

                    if member["MemberVote"] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member["MemberVote"] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote["FullStatus"]
                        # seems like we've seen both
                        or "Governor overridden" in vote["FullStatus"] or
                        "Governor overriden" in vote["FullStatus"]):
                    did_pass = True
                elif ("Failed -- " in vote["FullStatus"] or
                      "Veto of the Governor sustained" in vote["FullStatus"]):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear: " +
                                         vote["FullStatus"])

                # Check vote counts
                yea_count = int(
                    re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
                nay_count = int(
                    re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

                vote_start_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                motion_text = re.sub(HTML_TAGS_RE, "",
                                     vote["FullStatus"]).strip()
                vote_identifer = (vote["StatusDate"] + "--" + motion_text +
                                  "--" + roll_call_url)
                vote_to_add = VoteEvent(
                    identifier=vote_identifer,
                    bill=bill,
                    chamber=("lower"
                             if vote["ChamberCode"] == "H" else "upper"),
                    start_date=vote_start_date,
                    motion_text=motion_text,
                    result="pass" if did_pass else "fail",
                    classification="passage",
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count("yes", yea_count)
                vote_to_add.set_count("no", nay_count)
                vote_to_add.set_count("not voting", len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote("not voting", member)

                yield vote_to_add

            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/witnesses".format(
                bill_id_original_format)
            bill.add_document_link(note="Witness List",
                                   url=witnesses_doc_link_url,
                                   media_type="text/html")

            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/conference".format(
                bill_id_original_format)
            page = self.lxmlize(conferees_doc_link_url)
            no_data = page.xpath('//div[@class="no-data"]/text()')
            if not no_data:
                bill.add_document_link(
                    note="Conference Committee Members",
                    url=conferees_doc_link_url,
                    media_type="text/html",
                )

            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
            meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/meetings".format(
                bill_id_original_format)
            bill.add_document_link(
                note="Committee Meetings",
                url=meetings_doc_link_url,
                media_type="text/html",
            )

            yield bill
Ejemplo n.º 8
0
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = "%s/%s" % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace("Bill.aspx", "BillContent.aspx")
        url = url.replace("&code=R", "&code=R&style=new")

        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath("//table/tr")
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == "Co-Sponsor:":
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == "LR Number:"
        # bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if (table_rows[4 + cosponsorOffset][0].text_content().strip() ==
                "Governor Action:"):
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == "Bill String:"
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]

        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[3:].strip())
        else:
            bill_number = int(bill_id[3:])

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = "Appropriations Bill"
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title))
                return

        bill = Bill(
            bill_id,
            chamber="lower",
            title=bill_desc,
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note="official")

        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        # try:
        #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
        # except IndexError:
        #     return
        bill.add_sponsorship(bill_sponsor,
                             entity_type="person",
                             classification="primary",
                             primary=True)

        # check for cosponsors
        (sponsors_url,
         ) = bill_page.xpath("//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        (actions_link,
         ) = bill_page.xpath("//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = "%s%s" % (self._house_base_url,
                                 doc_tag[0].attrib["href"])
            bill.add_document_link(doc, text_url, media_type="text/html")

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == "PDF":
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_version_link(
                    version,
                    vurl.attrib["href"],
                    media_type=mimetype,
                    on_duplicate="ignore",
                )

        # house bill versions
        # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow
        version_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Text")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath(
                    './/div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                if ".pdf" in path:
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_version_link(version,
                                      path,
                                      media_type=mimetype,
                                      on_duplicate="ignore")

        # house bill summaries
        # everything between the row containing "Bill Summary" in an h2
        # and the next div.DocHeaderRow
        summary_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Summary")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )

        # if there are no amedments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[h2[contains(text(),"Bill Summary")]]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            if version:
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                summary_name = "Bill Summary ({})".format(version)
                if ".pdf" in path:
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_document_link(summary_name,
                                       path,
                                       media_type=mimetype,
                                       on_duplicate="ignore")

        # house bill amendments
        amendment_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Amendment")]]/'
            'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            version = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip(
                )
            path = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip(
                )
            summary_name = "Amendment {}".format(version)

            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = "{} (Defeated)".format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = "{} (Adopted)".format(summary_name)

            distributed_icon = row.xpath(
                './/img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = "{} (Distributed)".format(summary_name)

            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(summary_name,
                                  path,
                                  media_type=mimetype,
                                  on_duplicate="ignore")

        yield bill
Ejemplo n.º 9
0
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[:4],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if (
            "Page Not Found" in html
            or "The bill you are looking for is not available yet" in html
        ):
            url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
                session[-4:],
                bill_id.replace(" ", "-"),
            )
            html = self.get(url).text
            if (
                "Page Not Found" in html
                or "The bill you are looking for is not available yet" in html
            ):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute("http://legislature.mi.gov")

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[
            0
        ].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(" ")[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u"\xa0", " ")
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = (
                    "primary"
                    if sponsor.tail and "primary" in sponsor.tail
                    else "cosponsor"
                )
            else:
                classification = "primary"
            bill.add_sponsorship(
                name=name.strip(),
                chamber=chamber,
                entity_type="person",
                primary=classification == "primary",
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath("td")  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            try:
                date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%y"))
            except ValueError:
                try:
                    date = TIMEZONE.localize(
                        datetime.datetime.strptime(date, "%m/%d/%Y")
                    )
                except ValueError:
                    self.warning(
                        "{} has action with invalid date. Skipping Action".format(
                            bill_id
                        )
                    )
                    continue
            # use journal for actor
            # then fall back to upper/lower case
            # Journal entries are often posted with 'Expected Soon' as the cite,
            # then changed to the journal entry.
            if "SJ" in journal.upper():
                actor = "upper"
            elif "HJ" in journal.upper():
                actor = "lower"
            elif action.split()[0].islower():
                actor = "lower"
            elif action.split()[0].isupper():
                actor = "upper"
            else:
                actor = "legislature"

            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(
                r"WITH SUBSTITUTE\s+([\w\-\d]+)", action, re.IGNORECASE
            )
            if submatch and tds[2].xpath("a"):
                version_url = tds[2].xpath("a/@href")[0]
                version_name = tds[2].xpath("a/text()")[0].strip()
                version_name = "Substitute {}".format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith(".pdf"):
                    mimetype = "application/pdf"
                elif version_url.lower().endswith(".htm"):
                    mimetype = "text/html"
                bill.add_version_link(version_name, version_url, media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath("a/@href")
                if journal_link:
                    objectname = journal_link[0].rsplit("=", 1)[-1]
                    chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                    vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                        session,
                        chamber_name,
                        objectname,
                    )
                    results = self.parse_roll_call(vote_url, rc_num, session)

                    if results is not None:
                        vote_passed = len(results["yes"]) > len(results["no"])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result="pass" if vote_passed else "fail",
                            classification="passage",
                        )

                        # check the expected counts vs actual
                        count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["yes"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d"
                                % (bill_id, action, count, len(results["yes"]))
                            )
                        count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["no"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d"
                                % (bill_id, action, count, len(results["no"]))
                            )

                        vote.set_count("yes", len(results["yes"]))
                        vote.set_count("no", len(results["no"]))
                        vote.set_count("other", len(results["other"]))
                        possible_vote_results = ["yes", "no", "other"]
                        for pvr in possible_vote_results:
                            for name in results[pvr]:
                                if session == "2017-2018":
                                    names = name.split("\t")
                                    for n in names:
                                        vote.vote(pvr, name.strip())
                                else:
                                    # Prevents voter names like "House Bill No. 4451, entitled" and other sentences
                                    if len(name.split()) < 5:
                                        vote.vote(pvr, name.strip())
                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" % (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith(".pdf"):
                    mimetype = "application/pdf"
                elif url.endswith(".htm"):
                    mimetype = "text/html"
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
Ejemplo n.º 10
0
    def scrape_bill(self, chamber, session, bill_id, url):
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if ".B. " in bill_id:
            bill_type = "bill"
        elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
            bill_type = "resolution"
        elif ".C.R. " in bill_id:
            bill_type = "concurrent resolution"
        elif ".J.R. " in bill_id:
            bill_type = "joint resolution"

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip().replace(".", "")

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)

        primary_info = page.xpath('//div[@id="billsponsordiv"]')
        for info in primary_info:
            try:
                (title, name) = [
                    x.strip() for x in info.xpath(".//text()") if x.strip()
                ]
            except ValueError:
                self.warning(
                    "Could not find sponsor's name for {}".format(bill_id))
                continue
            assert title == "Bill Sponsor:"
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            assert floor_info[0] == "Floor Sponsor:"
            floor_sponsor = floor_info[1].replace("Sen. ",
                                                  "").replace("Rep. ", "")
            bill.add_sponsorship(
                floor_sponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
        else:
            self.warning("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]')

        for version in versions:

            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            url = version.get("href")
            if not url:
                url = version.xpath("following-sibling::a[1]/@href")[0]

            bill.add_version_link(version.xpath("text()")[0].strip(),
                                  url,
                                  media_type="application/pdf")

        for related in page.xpath(
                '//b[text()="Related Documents "]/following-sibling::ul/li/'
                'a[contains(@class,"nlink")]'):
            href = related.xpath("@href")[0]
            if ".fn.pdf" in href:
                bill.add_document_link("Fiscal Note",
                                       href,
                                       media_type="application/pdf")
            else:
                text = related.xpath("text()")[0]
                bill.add_document_link(text,
                                       href,
                                       media_type="application/pdf")

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill.subject = subjects

        if page.xpath('//div[@id="billStatus"]//table'):
            status_table = page.xpath('//div[@id="billStatus"]//table')[0]
            yield from self.parse_status(bill, status_table, chamber)

        yield bill
Ejemplo n.º 11
0
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = "%s/GetLegislation?biennium=%s&billNumber" "=%s" % (
            self._base_url,
            self.biennium,
            bill_num,
        )

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        xml_chamber = xpath(page, "string(wa:OriginalAgency)")
        chamber = self._chamber_map[xml_chamber]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page, "string(wa:ShortLegislationType/wa:LongLegislationType)"
        )
        bill_type = bill_type.lower()

        if bill_type == "gubernatorial appointment":
            return

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=[bill_type],
        )
        fake_source = (
            "http://apps.leg.wa.gov/billinfo/"
            "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4])
        )

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(
                    note=version["note"],
                    url=version["url"],
                    media_type=version["media_type"],
                )
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(
                    note=document["note"],
                    url=document["url"],
                    media_type=document["media_type"],
                )
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, chamber, fake_source)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Ejemplo n.º 12
0
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(jurisdiction_id="jid",
                                        name="House",
                                        classification="lower")
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )

    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    bill = ScrapeBill("HB 1",
                      "1900",
                      "Axe & Tack Tax Act",
                      classification="tax bill",
                      chamber="lower")
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee",
                          "1900-04-04",
                          chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship("Jane Smith",
                         classification="lead sponsor",
                         entity_type="person",
                         primary=True)
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.pdf",
                           media_type="application/pdf")
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.html",
                           media_type="text/html")
    bill.add_version_link("Fiscal Note",
                          "http://example.com/v/1",
                          media_type="text/html")
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
Ejemplo n.º 13
0
    def scrape(self, session=None):
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["identifier"]
            self.info("no session specified, using %s", session)
        chamber_types = {
            "H": "lower",
            "S": "upper",
            "G": "executive",
            "C": "legislature",
        }
        session_id = SESSION_SITE_IDS[session]
        self._url_base += session_id + "/"
        bill_url_base = "https://lis.virginia.gov/cgi-bin/"

        self.load_members()
        self.load_sponsors()
        self.load_amendments()
        self.load_history()
        self.load_summaries()
        self.load_votes()
        self.load_bills()

        for bill in self._bills:
            bill = self._bills[bill][0]

            bill_id = bill["bill_id"]
            chamber = chamber_types[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            b = Bill(
                bill_id,
                session,
                bill["bill_description"],
                chamber=chamber,
                classification=bill_type,
            )
            bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
            b.add_source(bill_url)

            # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
            # Fill in blanks with 0s
            long_bill_id = bill_id
            if len(bill_id) == 3:
                long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
            elif len(bill_id) == 4:
                long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
            elif len(bill_id) == 5:
                long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]

            # Sponsors
            for spon in self._sponsors[long_bill_id]:
                sponsor_type = spon["patron_type"]
                if sponsor_type.endswith("Chief Patron"):
                    sponsor_type = "primary"
                else:
                    sponsor_type = "cosponsor"
                b.add_sponsorship(
                    spon["member_name"],
                    classification=sponsor_type,
                    entity_type="person",
                    primary=sponsor_type == "primary",
                )

            # Summary
            summary_texts = self._summaries[long_bill_id]
            for sum_text in summary_texts:
                b.add_abstract(sum_text["summary_text"],
                               sum_text["summary_type"])

            # Amendment docs
            amendments = self._amendments[bill_id]
            for amend in amendments:
                doc_link = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
                b.add_document_link("Amendment: " + amend["txt_docid"],
                                    doc_link,
                                    media_type="text/html")

            # Action text is used to improve version text
            actions_text = []
            # History and then votes
            for hist in self._history[bill_id]:
                action = hist["history_description"]
                action_date = hist["history_date"]
                date = datetime.datetime.strptime(action_date,
                                                  "%m/%d/%y").date()
                chamber = chamber_types[action[0]]
                vote_id = hist["history_refid"]
                cleaned_action = action[2:]
                actions_text.append(cleaned_action)

                # categorize actions
                for pattern, atype in ACTION_CLASSIFIERS:
                    if re.match(pattern, cleaned_action):
                        break
                else:
                    atype = None

                if atype != SKIP:
                    b.add_action(cleaned_action,
                                 date,
                                 chamber=chamber,
                                 classification=atype)

                if len(vote_id) > 0:
                    total_yes = 0
                    total_no = 0
                    total_not_voting = 0
                    total_abstain = 0
                    for v in self._votes[vote_id]:
                        if v["vote_result"] == "yes":
                            total_yes += 1
                        elif v["vote_result"] == "no":
                            total_no += 1
                        elif v["vote_result"] == "not voting":
                            total_not_voting += 1
                        elif v["vote_result"] == "abstain":
                            total_abstain += 1
                    vote = VoteEvent(
                        identifier=vote_id,
                        start_date=date,
                        chamber=chamber,
                        motion_text=cleaned_action,
                        result="pass" if total_yes > total_no else "fail",
                        classification="passage",
                        bill=b,
                    )
                    vote.set_count("yes", total_yes)
                    vote.set_count("no", total_no)
                    vote.set_count("not voting", total_not_voting)
                    vote.set_count("abstain", total_abstain)

                    vote_url = (
                        bill_url_base +
                        f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                    )
                    vote.add_source(vote_url)
                    for v in self._votes[vote_id]:
                        vote.vote(v["vote_result"], v["member_id"])
                    yield vote

            # Versions
            for version in bill["text_docs"]:
                # Checks if abbr is blank as not every bill has multiple versions
                if len(version["doc_abbr"]) > 0:
                    version_url = (
                        bill_url_base +
                        f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                    version_date = datetime.datetime.strptime(
                        version["doc_date"], "%m/%d/%y").date()
                    version_text = version["doc_abbr"]
                    for act in actions_text:
                        if version_text in act:
                            version_text = act
                    b.add_version_link(
                        version_text,
                        version_url,
                        date=version_date,
                        media_type="text/html",
                        on_duplicate="ignore",
                    )

            yield b
Ejemplo n.º 14
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//span[text()="Title"]')[0].getparent()
        short_title = doc.xpath('//span[text()="Short Title "]')[0].getparent()

        if len(title) > 1 and title[1].text:
            title = title[1].text.strip().strip('"')
        elif len(short_title) > 1 and short_title[1].text:
            self.warning("Falling back to short title on {}".format(url))
            title = short_title[1].text.strip().strip('"')
        else:
            self.warning("skipping bill {}, no Title".format(url))
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = (doc.xpath('//span[contains(text(), "Sponsor(S)")]')
                     [0].getparent()[1].text)
        # Checks if there is a Sponsor string before matching
        if spons_str:
            sponsors_match = re.match(r"(SENATOR|REPRESENTATIVE)", spons_str)
            if sponsors_match:
                sponsors = spons_str.split(",")
                sponsor = sponsors[0].strip()

                if sponsor:
                    bill.add_sponsorship(
                        sponsors[0].split()[1],
                        entity_type="person",
                        classification="primary",
                        primary=True,
                    )

                for sponsor in sponsors[1:]:
                    sponsor = sponsor.strip()
                    if sponsor:
                        bill.add_sponsorship(
                            sponsor,
                            entity_type="person",
                            classification="cosponsor",
                            primary=False,
                        )

            else:
                # Committee sponsorship
                spons_str = spons_str.strip()

                if re.match(r" BY REQUEST OF THE GOVERNOR$", spons_str):
                    spons_str = re.sub(r" BY REQUEST OF THE GOVERNOR$", "",
                                       spons_str).title()
                    spons_str = spons_str + " Committee (by request of the governor)"

                if spons_str:
                    bill.add_sponsorship(
                        spons_str,
                        entity_type="person",
                        classification="primary",
                        primary=True,
                    )

        # Get actions
        self._current_comm = None
        act_rows = doc.xpath("//div[@id='tab6_4']//tr")[1:]
        for row in act_rows:
            date, journal, action = row.xpath("td")
            action = action.text_content().strip()
            raw_chamber = action[0:3]
            journal_entry_number = journal.text_content()
            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  "%m/%d/%Y")
            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            # Votes
            if re.search(r"Y(\d+)", action):
                vote_href = journal.xpath(".//a/@href")
                if vote_href:
                    vote_href = vote_href[0].replace(" ", "")
                    yield from self.parse_vote(
                        bill,
                        journal_entry_number,
                        action,
                        act_chamber,
                        act_date,
                        vote_href,
                    )

            action, atype = self.clean_action(action)

            match = re.search(r"^Prefile released (\d+/\d+/\d+)$", action)
            if match:
                action = "Prefile released"
                act_date = datetime.datetime.strptime(match.group(1),
                                                      "%m/%d/%y")

            bill.add_action(
                action,
                chamber=act_chamber,
                date=act_date.strftime("%Y-%m-%d"),
                classification=atype,
            )

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions - to do
        text_list_url = (
            f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab1_4"
        )
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "/Text/")]'):
            name = link.text_content()
            text_url = link.get("href")
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents - to do
        doc_list_url = (
            f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab5_4"
        )
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        seen = set()
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib["href"]
            if h_name.strip() and h_href not in seen:
                bill.add_document_link(h_name, h_href)
                seen.add(h_href)

        yield bill
    def scrape(self, session=None):
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["identifier"]
            self.info("no session specified, using %s", session)
        chamber_types = {
            "H": "lower",
            "S": "upper",
            "G": "executive",
            "C": "legislature",
        }

        # pull the current session's details to tell if it's a special
        session_details = next(
            each for each in self.jurisdiction.legislative_sessions
            if each["identifier"] == session)

        is_special = False
        if ("classification" in session_details
                and session_details["classification"] == "special"):
            is_special = True

        session_id = SESSION_SITE_IDS[session]
        self.init_sftp(session_id)
        bill_url_base = "https://lis.virginia.gov/cgi-bin/"

        if not is_special:
            self.load_members()
            self.load_sponsors()
            self.load_fiscal_notes()
            self.load_summaries()
        self.load_history()
        self.load_votes()
        self.load_bills()

        if not is_special:
            self.load_amendments()

        for bill in self._bills:
            bill = self._bills[bill][0]

            bill_id = bill["bill_id"]
            chamber = chamber_types[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            b = Bill(
                bill_id,
                session,
                bill["bill_description"],
                chamber=chamber,
                classification=bill_type,
            )
            bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
            b.add_source(bill_url)

            # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
            # Fill in blanks with 0s
            long_bill_id = bill_id
            if len(bill_id) == 3:
                long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
            elif len(bill_id) == 4:
                long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
            elif len(bill_id) == 5:
                long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]

            # Sponsors
            if long_bill_id not in self._sponsors:
                if "patron_name" in bill and bill["patron_name"].strip() != "":
                    b.add_sponsorship(
                        bill["patron_name"],
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )
            for spon in self._sponsors[long_bill_id]:
                if spon["member_name"].strip() == "":
                    continue

                sponsor_type = spon["patron_type"]
                if sponsor_type.endswith("Chief Patron"):
                    sponsor_type = "primary"
                else:
                    sponsor_type = "cosponsor"
                b.add_sponsorship(
                    spon["member_name"],
                    classification=sponsor_type,
                    entity_type="person",
                    primary=sponsor_type == "primary",
                )

            # Summary
            summary_texts = self._summaries[long_bill_id]
            for sum_text in summary_texts:
                b.add_abstract(sum_text["summary_text"],
                               sum_text["summary_type"])

            # Amendment docs
            amendments = self._amendments[bill_id]
            for amend in amendments:
                doc_link = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
                b.add_document_link("Amendment: " + amend["txt_docid"],
                                    doc_link,
                                    media_type="text/html")

            # fiscal notes
            for fn in self._fiscal_notes[long_bill_id]:
                doc_link = bill_url_base + f"legp604.exe?{session_id}+oth+{fn['refid']}"
                b.add_document_link(
                    "Fiscal Impact Statement: " + fn["refid"],
                    doc_link.replace(".PDF", "+PDF"),
                    media_type="application/pdf",
                )

            # actions with 8-digit number followed by D are version titles too
            doc_actions = defaultdict(list)
            # History and then votes
            for hist in self._history[bill_id]:
                action = hist["history_description"]
                action_date = hist["history_date"]
                date = datetime.datetime.strptime(action_date,
                                                  "%m/%d/%y").date()
                chamber = chamber_types[action[0]]
                vote_id = hist["history_refid"]
                cleaned_action = action[2:]

                if re.findall(r"\d{8}D", cleaned_action):
                    doc_actions[action_date].append(cleaned_action)

                # categorize actions
                for pattern, atype in ACTION_CLASSIFIERS:
                    if re.match(pattern, cleaned_action):
                        break
                else:
                    atype = None

                if atype != SKIP:
                    b.add_action(cleaned_action,
                                 date,
                                 chamber=chamber,
                                 classification=atype)

                if len(vote_id) > 0:
                    total_yes = 0
                    total_no = 0
                    total_not_voting = 0
                    total_abstain = 0
                    for v in self._votes[vote_id]:
                        if v["vote_result"] == "yes":
                            total_yes += 1
                        elif v["vote_result"] == "no":
                            total_no += 1
                        elif v["vote_result"] == "not voting":
                            total_not_voting += 1
                        elif v["vote_result"] == "abstain":
                            total_abstain += 1
                    vote = VoteEvent(
                        identifier=vote_id,
                        start_date=date,
                        chamber=chamber,
                        motion_text=cleaned_action,
                        result="pass" if total_yes > total_no else "fail",
                        classification="passage",
                        bill=b,
                    )
                    vote.set_count("yes", total_yes)
                    vote.set_count("no", total_no)
                    vote.set_count("not voting", total_not_voting)
                    vote.set_count("abstain", total_abstain)

                    vote_url = (
                        bill_url_base +
                        f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                    )
                    vote.add_source(vote_url)
                    for v in self._votes[vote_id]:
                        vote.vote(v["vote_result"], v["member_id"])
                    yield vote

            # Versions
            for version in bill["text_docs"]:
                # Checks if abbr is blank as not every bill has multiple versions
                if version["doc_abbr"]:
                    version_url = (
                        bill_url_base +
                        f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")

                    version_date = datetime.datetime.strptime(
                        version["doc_date"], "%m/%d/%y").date()
                    # version text will default to abbreviation provided in CSV
                    # but if there is an unambiguous action from that date with
                    # a version, we'll use that as the document title
                    version_text = version["doc_abbr"]
                    if len(doc_actions[version["doc_date"]]) == 1:
                        version_text = doc_actions[version["doc_date"]][0]
                    b.add_version_link(
                        version_text,
                        version_url,
                        date=version_date,
                        media_type="text/html",
                        on_duplicate="ignore",
                    )

            yield b
Ejemplo n.º 16
0
    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page, '//div[@class="main-content"]//h2').text.split()

        bill_number = long_title[0]
        title = ""
        for x in range(2, len(long_title)):
            title += long_title[x] + " "
        title = title[0:-1]

        if not title:
            self.error("no title, skipping %s", bill_number)
            return

        bill_type = "resolution" if "LR" in bill_number else "bill"

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            "//body/div[3]/div[2]/div[2]/div/div[3]/div[1]/ul/li[1]/a[1]/text()",
        )

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                "//body/div[3]/div[2]/div[2]/div/div[2]/div[1]/ul/li[1]/text()",
            )
            introduced_by = introduced_by.split("Introduced By:")[1].strip()

        introduced_by = introduced_by.strip()
        bill.add_sponsorship(
            name=introduced_by,
            entity_type="person",
            primary=True,
            classification="primary",
        )

        action_nodes = self.get_nodes(
            bill_page, '//div[@class="main-content"]/div[5]//table/tbody/tr')

        for action_node in action_nodes:
            date = self.get_node(action_node, "./td[1]").text
            date = datetime.strptime(date, "%b %d, %Y")

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(action_node, "./td[2]").text_content()

            if "Governor" in action:
                actor = "executive"
            elif "Speaker" in action:
                actor = "legislature"
            else:
                actor = "legislature"

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=action_type,
            )

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page, "/html/body/div[3]/div[2]/div[2]/div/"
            "div[3]/div[2]/ul/li/a")

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib["href"]
            # replace Current w/ session number
            version_url = version_url.replace("Current", session)
            bill.add_version_link(version_name,
                                  version_url,
                                  media_type="application/pdf")

        soi = self.get_nodes(bill_page,
                             ".//a[contains(text(), 'Statement of Intent')]")
        if soi:
            bill.add_document_link("Statement of Intent",
                                   soi[0].get("href"),
                                   media_type="application/pdf")
        comstmt = self.get_nodes(
            bill_page, ".//a[contains(text(), 'Committee Statement')]")
        if comstmt:
            bill.add_document_link(
                "Committee Statement",
                comstmt[0].get("href"),
                media_type="application/pdf",
            )
        fn = self.get_nodes(bill_page, ".//a[contains(text(), 'Fiscal Note')]")
        if fn:
            bill.add_document_link("Fiscal Note",
                                   fn[0].get("href"),
                                   media_type="application/pdf")

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page, ".//div[contains(@class, 'amend-link')]/a")

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib["href"]
            # skip over transcripts
            if "/AM/" not in amendment_url:
                continue
            bill.add_document_link(amendment_name,
                                   amendment_url,
                                   media_type="application/pdf")

        yield bill

        yield from self.scrape_votes(bill, bill_page, actor)
Ejemplo n.º 17
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        withdrawn = False

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                withdrawn = True

        if withdrawn:
            title = "Withdrawn."
        else:
            title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        self.parse_versions(page, bill)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

        # only grab links in the first table, because proposed amendments have sponsors that are not bill sponsors.
        for link in page.xpath(
            "//div[contains(@class,'bill-table')][1]//td/span/a[contains(@href, 'Legislator-Profile')]"
        ):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        if self.parse_bill_field(page, "Summary of Original Version") != "":
            summary = (
                self.parse_bill_field(page, "Summary of Original Version")
                .text_content()
                .strip()
            )
            bill.add_abstract(summary, note="Summary of Original Version")

        if withdrawn:
            action = self.parse_bill_field(page, "Last Action").text_content().strip()
            wd_date = re.findall(r"\d{2}\/\d{2}\/\d+", action)[0]
            wd_date = dateutil.parser.parse(wd_date).date()
            bill.add_action(
                action, wd_date, chamber=chamber, classification="withdrawal"
            )

        yield bill
Ejemplo n.º 18
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
        if not title:
            self.warning("blank bill on %s - skipping", url)
            return

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()
            if "author not found" in name.lower():
                continue

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsorship(
                    name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
            else:
                bill.add_sponsorship(name,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs["committees"]:
                related_entities.append({"type": "committee", "name": item})
            for item in attrs["legislators"]:
                related_entities.append({"type": "legislator", "name": item})
            bill.add_action(
                description=action,
                date=date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=attrs["classification"],
                related_entities=related_entities,
            )

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if version_url in version_urls:
                self.warning("Skipping duplicate version URL.")
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url,
                         re.IGNORECASE):
                bill.add_document_link(note=name,
                                       url=version_url,
                                       media_type="application/pdf")
                continue

            bill.add_version_link(note=name,
                                  url=version_url,
                                  media_type="application/pdf")

        self.scrape_amendments(bill, page)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                yield from self.scrape_votes(
                    bill, self.urlescape(link.attrib["href"]))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = bill.title == "Short Title Not Found."
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Ejemplo n.º 19
0
    def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(
            page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def sbp(x):
            return self.scrape_bare_page(
                page.xpath("//a[contains(text(), '%s')]" %
                           (x))[0].attrib["href"])

        authors = [x.text for x in sbp("Authors")]

        try:
            digests = sbp("Digests")
        except IndexError:
            digests = []

        try:
            versions = sbp("Text")
        except IndexError:
            versions = []

        try:
            amendments = sbp("Amendments")
        except IndexError:
            amendments = []

        title = page.xpath(
            "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        title = title.replace("\u00a0\u00a0", " ")
        actions = page.xpath("//div[@id='ctl00_PageBody_PanelBillInfo']/"
                             "/table[@style='font-size:small']/tr")

        bill_id = page.xpath(
            "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = self._bill_types[bill_abbreviation[1:]]
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(bill_url)

        authors.remove(author)
        bill.add_sponsorship(author,
                             classification="primary",
                             entity_type="person",
                             primary=True)
        for author in authors:
            bill.add_sponsorship(author,
                                 classification="cosponsor",
                                 entity_type="person",
                                 primary=False)

        for digest in digests:
            bill.add_document_link(
                note=digest.text,
                url=digest.attrib["href"],
                media_type="application/pdf",
            )

        for version in versions:
            bill.add_version_link(
                note=version.text,
                url=version.attrib["href"],
                media_type="application/pdf",
            )

        for amendment in amendments:
            if "href" in amendment.attrib:
                bill.add_version_link(
                    note=amendment.text,
                    url=amendment.attrib["href"],
                    media_type="application/pdf",
                )

        flags = {
            "prefiled": ["filing"],
            "referred to the committee": ["referral-committee"],
            "sent to the house": ["passage"],
            "ordered returned to the house": ["passage"],
            "ordered to the senate": ["passage"],
            "signed by the governor": ["executive-signature"],
            "sent to the governor": ["executive-receipt"],
        }

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            yield from self.scrape_votes(bill, votes_link.attrib["href"])
        except IndexError:
            # Some bills don't have any votes
            pass

        for action in actions:
            date, chamber, page, text = [x.text for x in action.xpath(".//td")]
            session_year = self.jurisdiction.legislative_sessions[-1][
                "start_date"][0:4]
            # Session is April -> June. Prefiles look like they're in
            # January at earliest.
            date += "/{}".format(session_year)
            date = dt.datetime.strptime(date, "%m/%d/%Y")
            chamber = self._chambers[chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            bill.add_action(
                description=text,
                date=date.strftime("%Y-%m-%d"),
                chamber=chamber,
                classification=cat,
            )

        yield bill
Ejemplo n.º 20
0
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version_link(
                note=self.NAME_SLUGS[version[1][-5]],
                url=version[1],
                media_type="text/html",
            )

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document_link(
                note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
                url=analysis[1],
                media_type="text/html",
            )

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document_link(
                note="Fiscal Note ({})".format(
                    self.NAME_SLUGS[fiscal_note[1][-5]]),
                url=fiscal_note[1],
                media_type="text/html",
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            introduced = False

            if desc == "Amended":
                atype = "amendment-passage"
            elif desc == "Amendment(s) offered":
                atype = "amendment-introduction"
            elif desc == "Amendment amended":
                atype = "amendment-amendment"
            elif desc == "Amendment withdrawn":
                atype = "amendment-withdrawal"
            elif desc == "Passed" or desc == "Adopted":
                atype = "passage"
            elif re.match(r"^Received (by|from) the", desc):
                if "Secretary of the Senate" not in desc:
                    atype = "introduction"
                else:
                    atype = "filing"
            elif desc.startswith("Sent to the Governor"):
                # But what if it gets lost in the mail?
                atype = "executive-receipt"
            elif desc.startswith("Signed by the Governor"):
                atype = "executive-signature"
            elif desc.startswith("Effective on"):
                atype = "became-law"
            elif desc == "Vetoed by the Governor":
                atype = "executive-veto"
            elif desc == "Read first time":
                atype = ["introduction", "reading-1"]
                introduced = True
            elif desc == "Read & adopted":
                atype = ["passage"]
                if not introduced:
                    introduced = True
                    atype.append("introduction")
            elif desc == "Passed as amended":
                atype = "passage"
            elif desc.startswith("Referred to") or desc.startswith(
                    "Recommended to be sent to "):
                atype = "referral-committee"
            elif desc == "Reported favorably w/o amendment(s)":
                atype = "committee-passage"
            elif desc == "Filed":
                atype = "filing"
            elif desc == "Read 3rd time":
                atype = "reading-3"
            elif desc == "Read 2nd time":
                atype = "reading-2"
            elif desc.startswith("Reported favorably"):
                atype = "committee-passage-favorable"
            else:
                atype = None

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
Ejemplo n.º 21
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find("-")
        subjects = [s.strip() for s in title[:subject_pos - 1].split(",")]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = (page.xpath('//span[@id="lblCompNumber"]/a')
                            [0].text_content().strip())
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = (page.xpath("//span[@id='lblBillPrimeSponsor']")
                   [0].text_content().split("by")[-1])
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(sponsor,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link("Current Version",
                              btext.get("href"),
                              media_type="application/pdf")

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_version_link(
                "Amendment " + amendment.text,
                amendment.get("href"),
                media_type="application/pdf",
            )
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(afn.get("alt"),
                                   afn.getparent().get("href"),
                                   on_duplicate="ignore")

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")
                [0].text_content().split("by")[-1])
            secondary_sponsor = (secondary_sponsor.replace("*", "").replace(
                ")", "").strip())
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            if page.xpath("//table[@id='gvCoActionHistory']"):
                cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
                actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
Ejemplo n.º 22
0
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        bill_id_for_url = bill_id.replace(" ", "")
        bill.add_source(
            f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
        )

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        for version in root.iterfind(
                "billtext/docTypes/bill/versions/version"):
            if not version:
                continue

            note = version.find("versionDescription").text
            html_url = version.find("WebHTMLURL").text
            bill.add_version_link(note=note,
                                  url=html_url,
                                  media_type="text/html")
            pdf_url = version.find("WebPDFURL").text
            bill.add_version_link(note=note,
                                  url=pdf_url,
                                  media_type="application/pdf")

        for analysis in root.iterfind(
                "billtext/docTypes/analysis/versions/version"):
            if not analysis:
                continue

            description = analysis.find("versionDescription").text
            html_url = analysis.find("WebHTMLURL").text
            bill.add_document_link(
                note="Analysis ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        for fiscal_note in root.iterfind(
                "billtext/docTypes/fiscalNote/versions/version"):
            if not fiscal_note:
                continue

            description = fiscal_note.find("versionDescription").text
            html_url = fiscal_note.find("WebHTMLURL").text
            bill.add_document_link(
                note="Fiscal Note ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            atype = _categorize_action(desc)

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
Ejemplo n.º 23
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "https://legislature.idaho.gov/legislation/%s/" % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name,
                                      url=href,
                                      media_type="application/pdf")
            else:
                bill.add_document_link(note=name,
                                       url=href,
                                       media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if "COMMITTEE" in sponsors.upper():
                    bill.add_sponsorship(
                        name=sponsors.strip(),
                        entity_type="organization",
                        primary=True,
                        classification="primary",
                    )
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(
                                classification="primary",
                                name=person,
                                entity_type="person",
                                primary=True,
                            )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + "/" + session[0:4],
                                              "%m/%d/%Y").strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(actor, date, row[2], session,
                                           bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace("\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
Ejemplo n.º 24
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(
                page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                return

        title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        version_ct = self.parse_versions(page, bill)

        if version_ct < 1:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath(
                "//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
Ejemplo n.º 25
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        hist_url = (f"https://www.legis.iowa.gov/legislation/billTracking/"
                    f"billHistory?billName={bill_id}&ga={session_id}")
        req_session = requests.Session()
        req = requests.get(hist_url)
        if req.status_code == 500:
            self.warning("500 error on {}, skipping".format(hist_url))
            return

        page = lxml.html.fromstring(req.text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div/div[4]/div[2])').strip()

        if title == "":
            # Sometimes the title is moved, see
            # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
            title = page.xpath('string(//div[@id="content"]/div[@class='
                               '"divideVert"]/div[4]/div[2])').strip()
            if title == "":
                self.warning("URL: %s gives us an *EMPTY* bill. Aborting." %
                             url)
                return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if "HR" in bill_id or "SR" in bill_id:
            bill_type = ["resolution"]
        elif "HJR" in bill_id or "SJR" in bill_id:
            bill_type = ["joint resolution"]
        elif "HCR" in bill_id or "SCR" in bill_id:
            bill_type = ["concurrent resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = (
            "https://www.legis.iowa.gov/docs/"
            "publications/LG{}/{}/attachments/{}.html")
        version_pdf_url_template = ("https://www.legis.iowa.gov/docs/"
                                    "publications/LG{}/{}/{}.pdf")

        # get pieces of version_link
        vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
        if vpieces:
            for version in vpieces:
                version_name = version.text
                version_abbrev = version.xpath("string(@value)")

                # Get HTML document of bill version.
                version_html_url = version_html_url_template.format(
                    version_abbrev.upper(), session_id,
                    bill_id.replace(" ", ""))

                bill.add_version_link(note=version_name,
                                      url=version_html_url,
                                      media_type="text/html")

                # Get PDF document of bill version.
                version_pdf_url = version_pdf_url_template.format(
                    version_abbrev.upper(), session_id,
                    bill_id.replace(" ", ""))

                if "Marked Up" in version_name:
                    version_pdf_url = sidebar.xpath(
                        "//iframe[@id='bbContextDoc']/@src")[0]

                bill.add_version_link(note=version_name,
                                      url=version_pdf_url,
                                      media_type="application/pdf")

        sponsors_str = page.xpath('string(//div[@id="content"]/div[@class='
                                  '"divideVert"]/div/div[4]/div[1])').strip()

        if re.search("^By ", sponsors_str):
            sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
        # for some bills sponsors listed in different format
        else:
            sponsors = re.findall(r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)",
                                  sponsors_str)

        for sponsor in sponsors:
            sponsor = sponsor.replace(" and", "").strip(" .,")

            # a few sponsors get mangled by our regex
            sponsor = {
                "Means": "Ways & Means",
                "Iowa": "Economic Growth/Rebuild Iowa",
                "Safety": "Public Safety",
                "Resources": "Human Resources",
                "Affairs": "Veterans Affairs",
                "Protection": "Environmental Protection",
                "Government": "State Government",
                "Boef": "De Boef",
            }.get(sponsor, sponsor)

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        for tr in page.xpath(
                "//table[contains(@class, 'billActionTable')][1]/tbody/tr"):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                for anchor in tr.xpath(".//a"):
                    link_text = anchor.text_content()
                    link_url = anchor.xpath("@href")[0]
                    if "signed" in link_text.lower():
                        bill.add_version_link(note=link_text,
                                              url=link_url,
                                              media_type="application/pdf")
                    elif "acts" in link_text.lower():
                        bill.add_document_link(note=link_text,
                                               url=link_url,
                                               media_type="application/pdf")
                        bill.add_citation(
                            f"IA Acts, {session}",
                            link_text.replace("Acts", ""),
                            citation_type="chapter",
                            url=link_url,
                        )
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[3])").strip()
            action = re.sub(r"\s+", " ", action)

            # Capture any amendment links.
            links = [
                link
                for link in [version["links"] for version in bill.versions]
            ]
            version_urls = [
                link["url"] for link in [i for sub in links for i in sub]
            ]
            if "amendment" in action.lower():
                for anchor in tr.xpath(".//a[1]"):
                    if "-" in anchor.text:
                        # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                        amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                        amd_id = anchor.text.replace("-", "").strip()
                        amd_url = amd_pattern.format(session_id, amd_id)
                        amd_name = "Amendment {}".format(anchor.text.strip())

                        if amd_url not in version_urls:
                            bill.add_version_link(note=amd_name,
                                                  url=amd_url,
                                                  media_type="application/pdf")
                            version_urls.append(amd_url)
                        else:
                            self.info(
                                "Already Added {}, skipping".format(amd_url))
            else:
                for anchor in tr.xpath(".//a"):
                    link_text = anchor.text_content()
                    link_url = anchor.xpath("@href")[0]
                    action_date = date.strftime("%m/%d/%Y")
                    if "fiscal" in link_text.lower(
                    ) or "summary" in link_text.lower():
                        # there can be multiple fiscal notes or summaries, so date them
                        doc_title = f"{link_text} {action_date}"
                        bill.add_document_link(note=doc_title,
                                               url=link_url,
                                               media_type="application/pdf")
                    elif "signed" in link_text.lower():
                        bill.add_version_link(note=link_text,
                                              url=link_url,
                                              media_type="application/pdf")
                    elif "acts" in link_text.lower():
                        bill.add_document_link(note=link_text,
                                               url=link_url,
                                               media_type="application/pdf")
                        bill.add_citation(
                            f"IA Acts, {session}",
                            link_text.replace("Acts", ""),
                            citation_type="chapter",
                            url=link_url,
                        )

            if "S.J." in action or "SCS" in action:
                actor = "upper"
            elif "H.J." in action or "HCS" in action:
                actor = "lower"
            else:
                actor = "legislature"

            action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

            if action.startswith("Introduced"):
                atype = ["introduction"]
                if ", referred to" in action:
                    atype.append("referral-committee")
            elif action.startswith("Read first time"):
                atype = "reading-1"
            elif action.startswith("Referred to"):
                atype = "referral-committee"
            elif action.startswith("Sent to Governor"):
                atype = "executive-receipt"
            elif action.startswith("Reported Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Vetoed by Governor"):
                atype = "executive-veto"
            elif action.startswith("Item veto"):
                atype = "executive-veto-line-item"
            elif re.match(r"Passed (House|Senate)", action):
                atype = "passage"
            elif re.match(r"Amendment (S|H)-\d+ filed", action):
                atype = ["amendment-introduction"]
                if ", adopted" in action:
                    atype.append("amendment-passage")
            elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted",
                          action):
                atype = "amendment-passage"
            elif re.match(r"Amendment (S|N)-\d+ lost", action):
                atype = "amendment-failure"
            elif action.startswith("Resolution filed"):
                atype = "introduction"
            elif action.startswith("Resolution adopted"):
                atype = "passage"
            elif action.startswith("Committee report") and action.endswith(
                    "passage."):
                atype = "committee-passage"
            elif action.startswith("Withdrawn"):
                atype = "withdrawal"
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search(r"END OF \d+ ACTIONS", action):
                continue

            if "$history" not in action:
                bill.add_action(description=action,
                                date=date,
                                chamber=actor,
                                classification=atype)

        self.scrape_subjects(bill, bill_id, session, req_session)

        yield bill