Ejemplo n.º 1
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        """Scrape one South Dakota bill from the LSO JSON API.

        :param chamber: 'upper' or 'lower'
        :param session: legislative session identifier
        :param bill_id: human-readable id, e.g. 'SB 1' or 'HCR 6001'
        :param title: bill title
        :param url: JSON API endpoint for the bill
        :yields: events from scrape_action, then the Bill object
        """
        page = self.get(url).json()
        api_id = page["BillId"]

        # Classify from the id prefix. re.match is already anchored at the
        # start of the string, so no '^' is needed.
        if re.match(r"(S|H)B ", bill_id):
            btype = ["bill"]
        elif re.match(r"(S|H)C ", bill_id):
            btype = ["commemoration"]
        elif re.match(r"(S|H)JR ", bill_id):
            btype = ["joint resolution"]
        elif re.match(r"(S|H)CR ", bill_id):
            btype = ["concurrent resolution"]
        else:
            btype = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=btype,
        )
        bill.add_source(f"https://sdlegislature.gov/Session/Bill/{api_id}")
        bill.add_source(url)

        version_rows = page["Documents"]
        # `assert` is stripped under `python -O`; raise explicitly so a bill
        # with no documents always fails loudly.
        if not version_rows:
            raise ValueError(f"No documents listed for bill {bill_id}")
        for version in version_rows:
            date = version["DocumentDate"]
            if date:
                # DocumentDate is an ISO-style timestamp; keep only the
                # leading YYYY-MM-DD portion.
                match = re.match(r"\d{4}-\d{2}-\d{2}", date)
                date = datetime.datetime.strptime(match.group(0),
                                                  "%Y-%m-%d").date()

                html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{version['DocumentId']}"
                pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{version['DocumentId']}.pdf"

                note = version["BillVersion"]
                bill.add_version_link(
                    note,
                    html_link,
                    date=date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
                bill.add_version_link(
                    note,
                    pdf_link,
                    date=date,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )
            else:
                self.warning("Version listed but no date or documents")

        sponsors = page["BillSponsor"]
        if sponsors:
            for sponsor in sponsors:
                sponsor_type = "person"
                member = sponsor["Member"]
                # first and last name are available, but UniqueName is the old link text
                # could change later?

                bill.add_sponsorship(
                    member["UniqueName"],
                    classification="primary",
                    primary=True,
                    entity_type=sponsor_type,
                )
        else:
            # No individual sponsors: the committee sponsor comes as an HTML
            # anchor, so extract the link text from it.
            sponsor_type = "organization"
            committee_sponsor = re.search(r">(.*)</a>",
                                          page["BillCommitteeSponsor"])[1]
            bill.add_sponsorship(
                committee_sponsor,
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )

        for keyword in page["Keywords"]:
            bill.add_subject(keyword["Keyword"]["Keyword"])

        actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
        yield from self.scrape_action(bill, actions_url, chamber)

        yield bill
Ejemplo n.º 2
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Scrape one Hawaii bill status page and yield the Bill object."""
        page = lxml.html.fromstring(self.get(url).text)

        # The bill identifier is encoded in the page URL's query string.
        query = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = f"{query['billtype']}{query['billnumber']}"

        versions = page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]
        metainf_table = page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # Subjects come from the semicolon-delimited report title; a trailing
        # ';' leaves one empty entry behind, so drop the first empty string.
        subjects = [piece.strip() for piece in meta["Report Title"].split(";")]
        if "" in subjects:
            subjects.remove("")

        bill = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            bill.add_abstract(meta["Description"], "description")
        for subject in subjects:
            bill.add_subject(subject)
        if url:
            bill.add_source(url)

        prior_session = f"{int(session[:4]) - 1} Regular Session"

        companion = meta["Companion"].strip()
        if companion:
            bill.add_related_bill(
                identifier=companion.replace("\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

        status_texts = page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        )
        # A "carried over" note in the final status row means this bill is
        # its own companion from the prior session.
        if status_texts and "carried over" in status_texts[-1].lower():
            bill.add_related_bill(
                identifier=bill_id.replace("\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )

        for sponsor in meta["Introducer(s)"]:
            # replace() is a no-op when the marker is absent.
            sponsor = sponsor.replace(
                " (Introduced by request of another party)", "")
            bill.add_sponsorship(sponsor, "primary", "person", True)

        self.parse_bill_versions_table(bill, versions)
        self.parse_testimony(bill, page)
        self.parse_cmte_reports(bill, page)

        yield from self.parse_bill_actions_table(bill, action_table, bill_id,
                                                 session, url, chamber)
        yield bill
Ejemplo n.º 3
0
    def scrape_bill(self, session, history_url):
        """Scrape one Texas bill from its history XML document.

        Reads title/id from the XML root, then versions, analyses, fiscal
        notes, witness lists, actions, and sponsorships. Yields the Bill
        (nothing if the bill does not exist).
        """
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        # Drop the first token of the "bill" attribute (presumably the
        # session prefix) -- TODO confirm against a sample XML file.
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        # Classify from the letters following the chamber prefix.
        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        bill_id_for_url = bill_id.replace(" ", "")
        bill.add_source(
            f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
        )

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        for version in root.iterfind(
                "billtext/docTypes/bill/versions/version"):
            # NOTE(review): lxml/ElementTree elements with no children are
            # falsy, so this skips childless <version> nodes, not just None.
            if not version:
                continue

            note = version.find("versionDescription").text
            html_url = version.find("WebHTMLURL").text
            bill.add_version_link(note=note,
                                  url=html_url,
                                  media_type="text/html")
            pdf_url = version.find("WebPDFURL").text
            bill.add_version_link(note=note,
                                  url=pdf_url,
                                  media_type="application/pdf")

        for analysis in root.iterfind(
                "billtext/docTypes/analysis/versions/version"):
            # Same element-truthiness skip as above.
            if not analysis:
                continue

            description = analysis.find("versionDescription").text
            html_url = analysis.find("WebHTMLURL").text
            bill.add_document_link(
                note="Analysis ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        for fiscal_note in root.iterfind(
                "billtext/docTypes/fiscalNote/versions/version"):
            if not fiscal_note:
                continue

            description = fiscal_note.find("versionDescription").text
            html_url = fiscal_note.find("WebHTMLURL").text
            bill.add_document_link(
                note="Fiscal Note ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        # self.witnesses holds (bill_id, url) pairs collected elsewhere.
        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            # witness[1][-5] is the 5th-from-last character of the URL --
            # presumably a doc-type code keyed into NAME_SLUGS; confirm.
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            # The first character of actionNumber encodes the acting body.
            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            atype = _categorize_action(desc)

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                # Strip the referral phrasing to leave just the committee name.
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        # Sponsorship fields are pipe-delimited lists of names.
        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
Ejemplo n.º 4
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one North Carolina bill from the legislature's lookup page.

        :param chamber: 'upper'/'lower', or 'Senate'/'House' (normalized here)
        :param session: legislative session identifier
        :param bill_id: e.g. 'H102', or 'HB 102' for a one-off scrape
        :yields: vote events (when present) and finally the Bill object
        """
        # there will be a space in bill_id if we're doing a one-off bill scrape
        # convert HB 102 into H102
        if " " in bill_id:
            bill_id = bill_id[0] + bill_id.split(" ")[-1]

        # if chamber comes in as House/Senate convert to lower/upper
        if chamber == "Senate":
            chamber = "upper"
        elif chamber == "House":
            chamber = "lower"

        bill_detail_url = (
            "http://www.ncleg.net/gascripts/"
            "BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all"
        ) % (session, bill_id)

        # parse the bill data page, finding the latest html text
        data = self.get(bill_detail_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
        # Most specific titles first ("Joint Resolution" contains "Resolution").
        if "Joint Resolution" in title_div_txt:
            bill_type = "joint resolution"
            bill_id = bill_id[0] + "JR " + bill_id[1:]
        elif "Resolution" in title_div_txt:
            bill_type = "resolution"
            bill_id = bill_id[0] + "R " + bill_id[1:]
        elif "Bill" in title_div_txt:
            bill_type = "bill"
            bill_id = bill_id[0] + "B " + bill_id[1:]
        else:
            # bill_type was previously left unbound here, which surfaced later
            # as a confusing NameError; fail immediately with context instead.
            raise ValueError(
                "unrecognized bill type in title: %s" % title_div_txt)

        bill_title = doc.xpath("//main//div[@class='col-12'][1]")[0]
        bill_title = bill_title.text_content().strip()

        # For special cases where bill title is blank, a new title is created using Bill ID
        if not bill_title:
            bill_title = bill_id.replace(" ", "")

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=bill_title,
            chamber=chamber,
            classification=bill_type,
        )
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to cur version)
        if chamber == "lower":
            link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
        else:
            link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
        for vlink in doc.xpath(link_xpath)[1:]:
            # get the name from the PDF link...
            version_name = vlink.text.replace("\xa0", " ")
            version_url = vlink.attrib["href"]

            media_type = "text/html"
            if version_url.lower().endswith(".pdf"):
                media_type = "application/pdf"

            bill.add_version_link(version_name,
                                  version_url,
                                  media_type=media_type,
                                  on_duplicate="ignore")

        # rows with a 'adopted' in the text and an amendment link, skip failed amds
        for row in doc.xpath(
                '//div[@class="card-body"]/div[contains(., "Adopted")'
                ' and contains(@class,"row")]//a[@title="Amendment"]'):
            version_url = row.xpath("@href")[0]
            version_name = row.xpath("string(.)").strip()
            bill.add_version_link(
                version_name,
                version_url,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # sponsors
        spon_row = doc.xpath(
            '//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
        # first sponsors are primary, until we see (Primary)
        spon_type = "primary"
        spon_lines = spon_row.text_content().replace("\r\n",
                                                     ";").replace("\n", ";")
        for leg in spon_lines.split(";"):
            name = leg.replace("\xa0", " ").strip()
            if name.startswith("(Primary)") or name.endswith("(Primary)"):
                name = name.replace("(Primary)", "").strip()
                spon_type = "cosponsor"
            if not name:
                continue
            bill.add_sponsorship(
                name,
                classification=spon_type,
                entity_type="person",
                primary=(spon_type == "primary"),
            )

        # keywords
        kw_row = doc.xpath(
            '//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
        for subject in kw_row.text_content().split(", "):
            bill.add_subject(subject)

        # actions
        action_tr_xpath = ('//h6[contains(text(), "History")]'
                           '/ancestor::div[contains(@class, "gray-card")]'
                           '//div[contains(@class, "card-body")]'
                           '/div[@class="row"]')

        # skip two header rows
        for row in doc.xpath(action_tr_xpath):
            cols = row.xpath("div")
            act_date = cols[1].text
            actor = cols[3].text or ""
            # if text is blank, try diving in
            action = (cols[5].text
                      or "").strip() or cols[5].text_content().strip()

            if act_date is None:
                # No date cell: look for a mm/dd/yyyy token inside the
                # action text itself.
                search_action_date = action.split()
                for act in search_action_date:
                    try:
                        if "/" in act:
                            act_date = dt.datetime.strptime(
                                act, "%m/%d/%Y").strftime("%Y-%m-%d")
                    except ValueError:
                        # strptime raises ValueError, not KeyError, on a bad
                        # token -- the previous `except KeyError` handler
                        # could never fire.
                        raise Exception("No Action Date Provided")
            else:
                act_date = dt.datetime.strptime(
                    act_date, "%m/%d/%Y").strftime("%Y-%m-%d")

            if actor == "Senate":
                actor = "upper"
            elif actor == "House":
                actor = "lower"
            else:
                actor = "executive"

            # First matching prefix classifier wins; None when nothing matches.
            for pattern, atype in self._action_classifiers.items():
                if action.startswith(pattern):
                    break
            else:
                atype = None
            if act_date is not None:
                bill.add_action(action,
                                act_date,
                                chamber=actor,
                                classification=atype)

        # TODO: Fix vote scraper
        for row in doc.xpath("//h6[@id='vote-header']"):
            yield from self.scrape_votes(bill, doc)

        # For archived votes
        if session in ["1997", "1999"]:
            yield from self.add_archived_votes(bill, bill_id)

        yield bill
Ejemplo n.º 5
0
    def scrape_bill(self, session, history_url):
        """Scrape one archived Texas bill from its history XML document.

        :param session: legislative session identifier
        :param history_url: URL of the billhistory XML file
        :yields: the Bill object (nothing if the bill does not exist)
        """
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        # Drop the first token of the "bill" attribute (presumably the
        # session prefix) -- TODO confirm against a sample XML file.
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        # Classify from the letters following the chamber prefix.
        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        # self.versions / analyses / fiscal_notes / witnesses hold
        # (bill_id, url) pairs collected elsewhere; the [-5] index picks a
        # doc-type character keyed into NAME_SLUGS.
        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version_link(
                note=self.NAME_SLUGS[version[1][-5]],
                url=version[1],
                media_type="text/html",
            )

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document_link(
                note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
                url=analysis[1],
                media_type="text/html",
            )

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document_link(
                note="Fiscal Note ({})".format(
                    self.NAME_SLUGS[fiscal_note[1][-5]]),
                url=fiscal_note[1],
                media_type="text/html",
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        # Track whether an introduction action has been recorded so the
        # "Read & adopted" branch doesn't add a duplicate classification.
        # Previously this flag was reset at the top of every loop iteration,
        # so it could never carry across actions.
        introduced = False

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            # The first character of actionNumber encodes the acting body.
            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            if desc == "Amended":
                atype = "amendment-passage"
            elif desc == "Amendment(s) offered":
                atype = "amendment-introduction"
            elif desc == "Amendment amended":
                atype = "amendment-amendment"
            elif desc == "Amendment withdrawn":
                atype = "amendment-withdrawal"
            elif desc == "Passed" or desc == "Adopted":
                atype = "passage"
            elif re.match(r"^Received (by|from) the", desc):
                if "Secretary of the Senate" not in desc:
                    atype = "introduction"
                else:
                    atype = "filing"
            elif desc.startswith("Sent to the Governor"):
                # But what if it gets lost in the mail?
                atype = "executive-receipt"
            elif desc.startswith("Signed by the Governor"):
                atype = "executive-signature"
            elif desc.startswith("Effective on"):
                atype = "became-law"
            elif desc == "Vetoed by the Governor":
                atype = "executive-veto"
            elif desc == "Read first time":
                atype = ["introduction", "reading-1"]
                introduced = True
            elif desc == "Read & adopted":
                atype = ["passage"]
                if not introduced:
                    introduced = True
                    atype.append("introduction")
            elif desc == "Passed as amended":
                atype = "passage"
            elif desc.startswith("Referred to") or desc.startswith(
                    "Recommended to be sent to "):
                atype = "referral-committee"
            elif desc == "Reported favorably w/o amendment(s)":
                atype = "committee-passage"
            elif desc == "Filed":
                atype = "filing"
            elif desc == "Read 3rd time":
                atype = "reading-3"
            elif desc == "Read 2nd time":
                atype = "reading-2"
            elif desc.startswith("Reported favorably"):
                atype = "committee-passage-favorable"
            else:
                atype = None

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                # Strip the referral phrasing to leave the committee name.
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        # Sponsorship fields are pipe-delimited lists of names.
        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
Ejemplo n.º 6
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Build a Bill from the South Carolina bill detail page, yielding any
        vote events found along the way and finally the Bill itself.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if "INVALID BILL NUMBER" in page:
            self.warning("INVALID BILL %s" % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        raw_type = bill_div.xpath("span/text()")[0]
        # Most-specific labels first: "Joint Resolution" and "Concurrent
        # Resolution" both contain "Resolution".
        for marker, classification in (
            ("General Bill", "bill"),
            ("Concurrent Resolution", "concurrent resolution"),
            ("Joint Resolution", "joint resolution"),
            ("Resolution", "resolution"),
        ):
            if marker in raw_type:
                bill_type = classification
                break
        else:
            raise ValueError("unknown bill type: %s" % raw_type)

        # this is fragile, but less fragile than it was
        summary_label = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = summary_label.getnext().tail.strip()

        bill = Bill(
            bill_id,
            legislative_session=session,  # session name metadata's `legislative_sessions`
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type,
        )

        for subject in list(self._subjects[bill_id]):
            bill.add_subject(subject)

        # sponsors: individual legislators first, then committees
        for name in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(
                name=name,
                classification="primary",
                primary=True,
                entity_type="person",
            )
        for name in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
            bill.add_sponsorship(
                name=name.replace("\xa0", " ").strip(),
                classification="primary",
                primary=True,
                entity_type="organization",
            )

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_doc = lxml.html.fromstring(self.get(version_url).text)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions share a date; keep the first appearance
            bill.add_version_link(
                note=version.text,  # state's description, e.g. 'As introduced'
                url=version.get("href"),
                on_duplicate="ignore",
                media_type="text/html",  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath("table/tr"):
            date_td, chamber_td, action_td = row.xpath("td")

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {
                "Senate": "upper",
                "House": "lower",
                None: "legislature",
            }[chamber_td.text]

            # strip trailing journal citations from the action text
            action = action_td.text_content()
            action = action.split("(House Journal")[0]
            action = action.split("(Senate Journal")[0].strip()

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime("%Y-%m-%d"),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=action_type(action),
            )

        # votes
        vote_urls = doc.xpath('//a[text()="View Vote History"]/@href')
        if vote_urls:
            yield from self.scrape_vote_history(bill, vote_urls[0])

        bill.add_source(bill_detail_url)
        yield bill
Ejemplo n.º 7
0
    def scrape_bills(self, chamber_to_scrape, session):
        url = (
            "http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml"
            % session)

        bill_dir_page = self.get(url)
        root = lxml.etree.fromstring(bill_dir_page.content)
        for mr in root.xpath("//LASTACTION/MSRGROUP"):
            bill_id = mr.xpath("string(MEASURE)").replace(" ", "")
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"

            bill_type = {
                "B": "bill",
                "C": "concurrent resolution",
                "R": "resolution",
                "N": "nomination",
            }[bill_id[1]]

            # just skip past bills that are of the wrong chamber
            if chamber != chamber_to_scrape:
                continue

            link = mr.xpath("string(ACTIONLINK)").replace("..", "")
            main_doc = mr.xpath("string(MEASURELINK)").replace("../../../", "")
            main_doc_url = "http://billstatus.ls.state.ms.us/%s" % main_doc
            bill_details_url = "http://billstatus.ls.state.ms.us/%s/pdf%s" % (
                session,
                link,
            )
            try:
                details_page = self.get(bill_details_url)
            except scrapelib.HTTPError:
                self.warning(
                    "Bill page not loading for {}; skipping".format(bill_id))
                continue

            page = details_page.content
            # Some pages have the (invalid) byte 11 sitting around. Just drop
            # them out. Might as well.

            details_root = lxml.etree.fromstring(page)
            title = details_root.xpath("string(//SHORTTITLE)")
            longtitle = details_root.xpath("string(//LONGTITLE)")

            if title == "":
                self.warning(f"No title yet for {bill_id}, skipping")
                return

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type,
            )
            bill.extras["summary"] = longtitle
            bill.add_source(main_doc_url)
            # sponsors
            main_sponsor = details_root.xpath("string(//P_NAME)").split()
            if main_sponsor:
                main_sponsor = main_sponsor[0]
                main_sponsor_link = details_root.xpath(
                    "string(//P_LINK)").replace(" ", "_")
                main_sponsor_url = ("http://billstatus.ls.state.ms.us/%s/"
                                    "pdf/%s") % (
                                        session,
                                        main_sponsor_link.strip("../"),
                                    )
                type = "primary"
                bill.add_source(main_sponsor_url)
                bill.add_sponsorship(
                    self.clean_voter_name(main_sponsor),
                    classification=type,
                    entity_type="person",
                    primary=True,
                )

            for author in details_root.xpath("//AUTHORS/ADDITIONAL"):
                leg = author.xpath("string(CO_NAME)").replace(" ", "_")
                if leg:
                    leg_url = ("http://billstatus.ls.state.ms.us/%s/"
                               "pdf/House_authors/%s.xml") % (session, leg)
                    type = "cosponsor"
                    bill.add_source(leg_url)
                    bill.add_sponsorship(
                        self.clean_voter_name(leg),
                        classification=type,
                        entity_type="person",
                        primary=False,
                    )
            # Versions
            curr_version = details_root.xpath("string(//CURRENT_OTHER"
                                              ")").replace("../../../../", "")
            if curr_version != "":
                curr_version_url = "http://billstatus.ls.state.ms.us/" + curr_version
                bill.add_version_link(
                    "Current version",
                    curr_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                curr_pdf_url = re.sub("html?", "pdf", curr_version_url)
                bill.add_version_link(
                    "Current version",
                    curr_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            intro_version = details_root.xpath(
                "string(//INTRO_OTHER)").replace("../../../../", "")
            if intro_version != "":
                intro_version_url = "http://billstatus.ls.state.ms.us/" + intro_version
                bill.add_version_link(
                    "As Introduced",
                    intro_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                intro_pdf_url = re.sub("html?", "pdf", intro_version_url)
                bill.add_version_link(
                    "As Introduced",
                    intro_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            comm_version = details_root.xpath("string(//CMTESUB_OTHER"
                                              ")").replace("../../../../", "")
            if comm_version.find("documents") != -1:
                comm_version_url = "http://billstatus.ls.state.ms.us/" + comm_version
                bill.add_version_link(
                    "Committee Substitute",
                    comm_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                comm_pdf_url = re.sub("html?", "pdf", comm_version_url)
                bill.add_version_link(
                    "Committee Substitute",
                    comm_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            passed_version = details_root.xpath("string(//PASSED_OTHER"
                                                ")").replace(
                                                    "../../../../", "")
            if passed_version.find("documents") != -1:
                passed_version_url = ("http://billstatus.ls.state.ms.us/" +
                                      passed_version)
                title = "As Passed the " + chamber
                bill.add_version_link(
                    title,
                    passed_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                passed_pdf_url = re.sub("html?", "pdf", passed_version_url)
                bill.add_version_link(
                    title,
                    passed_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            asg_version = details_root.xpath("string(//ASG_OTHER)").replace(
                "../../../../", "")
            if asg_version.find("documents") != -1:
                asg_version_url = "http://billstatus.ls.state.ms.us/" + asg_version
                bill.add_version_link(
                    "Approved by the Governor",
                    asg_version_url,
                    on_duplicate="ignore",
                    media_type="text/html",
                )
                asg_pdf_url = re.sub("html?", "pdf", asg_version_url)
                bill.add_version_link(
                    "Approved by the Governor",
                    asg_pdf_url,
                    on_duplicate="ignore",
                    media_type="application/pdf",
                )

            # amendments
            # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
            for amd in details_root.xpath("//AMENDMENTS/*"):
                if amd.tag == "HAM":
                    name = amd.xpath("HAM_DESC[1]/text()")[0]
                    name = append_parens(amd, "HAM_DISP", name)
                    name = append_parens(amd, "HAM_VDESC", name)

                    pdf_url = amd.xpath("string(HAM_PDF"
                                        ")").replace("../", "")

                    html_url = amd.xpath("string(HAM_OTHER"
                                         ")").replace("../", "")
                elif amd.tag == "SAM":
                    name = amd.xpath("SAM_DESC[1]/text()")[0]
                    name = append_parens(amd, "SAM_DISP", name)
                    name = append_parens(amd, "SAM_VDESC", name)

                    pdf_url = amd.xpath("string(SAM_PDF"
                                        ")").replace("../", "")

                    html_url = amd.xpath("string(SAM_OTHER"
                                         ")").replace("../", "")
                elif amd.tag == "AMRPT":
                    name = amd.xpath("AMRPT_DESC[1]/text()")[0]
                    pdf_url = amd.xpath("string(AMRPT_PDF"
                                        ")").replace("../", "")

                    html_url = amd.xpath("string(AMRPT_OTHER"
                                         ")").replace("../", "")

                pdf_url = "http://billstatus.ls.state.ms.us/" + pdf_url
                html_url = "http://billstatus.ls.state.ms.us/" + html_url

                if "adopted" in name.lower(
                ) or "amendment report" in name.lower():
                    bill.add_version_link(
                        name,
                        pdf_url,
                        on_duplicate="ignore",
                        media_type="application/pdf",
                    )
                    bill.add_version_link(name,
                                          html_url,
                                          on_duplicate="ignore",
                                          media_type="text/html")

            # avoid duplicate votes
            seen_votes = set()

            # Actions
            for action in details_root.xpath("//HISTORY/ACTION"):
                # action_num  = action.xpath('string(ACT_NUMBER)').strip()
                # action_num = int(action_num)
                act_vote = action.xpath("string(ACT_VOTE)").replace(
                    "../../../..", "")
                action_desc = action.xpath("string(ACT_DESC)")
                date, action_desc = action_desc.split(" ", 1)
                date = date + "/" + session[0:4]
                date = datetime.strptime(date, "%m/%d/%Y")

                if action_desc.startswith("(H)"):
                    actor = "lower"
                    action = action_desc[4:]
                elif action_desc.startswith("(S)"):
                    actor = "upper"
                    action = action_desc[4:]
                else:
                    actor = "executive"
                    action = action_desc

                if "Veto" in action and actor == "executive":
                    version_path = details_root.xpath("string(//VETO_OTHER)")
                    version_path = version_path.replace("../../../../", "")
                    version_url = "http://billstatus.ls.state.ms.us/" + version_path
                    bill.add_document_link("Veto", version_url)

                atype = "other"
                for prefix, prefix_type in self._action_types:
                    if action.startswith(prefix):
                        atype = prefix_type
                        break

                bill.add_action(
                    action,
                    self._tz.localize(date),
                    chamber=actor,
                    classification=atype if atype != "other" else None,
                )

                # use committee names as scraped subjects
                subjects = details_root.xpath("//H_NAME/text()")
                subjects += details_root.xpath("//S_NAME/text()")

                for subject in subjects:
                    if subject not in bill.subject:
                        bill.add_subject(subject)

                if act_vote:
                    vote_url = "http://billstatus.ls.state.ms.us%s" % act_vote
                    if vote_url not in seen_votes:
                        seen_votes.add(vote_url)
                        yield from self.scrape_votes(vote_url, action, date,
                                                     actor, bill)

            bill.add_source(bill_details_url)
            yield bill
Ejemplo n.º 8
0
    def scrape_bill(self, session, bill_url):
        """Scrape one Tennessee bill detail page.

        Parses the bill id, companion id, title, subjects, sponsors,
        versions, documents and actions from *bill_url*, then yields
        vote events (via scrape_vote_events) followed by the populated
        Bill object.

        :param session: legislative session identifier
        :param bill_url: absolute URL of the bill detail page
        """
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching (companion) bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id — the * marks the
            # version that is actually moving, which we treat as primary
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        # Classify by id letters; "B" is safe to test first because
        # resolution ids (e.g. HJR/SR) never contain a "B".
        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            # Previously an unrecognized id left bill_type unbound and
            # crashed with NameError at Bill(...); default and log instead.
            self.logger.warning(
                "Unrecognized bill id %s; defaulting type to 'bill'", bill_id)
            bill_type = "bill"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subjects: a comma-separated list precedes the first "-"
        subject_pos = title.find("-")
        if subject_pos == -1:
            # No separator: previously title[:subject_pos - 1] silently
            # became title[:-2] and produced bogus subjects; treat this
            # as "no subject list" instead.
            subjects = []
        else:
            subjects = [s.strip() for s in title[:subject_pos - 1].split(",")]
            subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = (page.xpath('//span[@id="lblCompNumber"]/a')
                            [0].text_content().strip())
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor ("by <name>" with a possible * marker)
        sponsor = (page.xpath("//span[@id='lblBillPrimeSponsor']")
                   [0].text_content().split("by")[-1])
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(sponsor,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link("Current Version",
                              btext.get("href"),
                              media_type="application/pdf")

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_version_link(
                "Amendment " + amendment.text,
                amendment.get("href"),
                media_type="application/pdf",
            )
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(afn.get("alt"),
                                   afn.getparent().get("href"),
                                   on_duplicate="ignore")

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")
                [0].text_content().split("by")[-1])
            secondary_sponsor = (secondary_sponsor.replace("*", "").replace(
                ")", "").strip())
            # Skip blank-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            if page.xpath("//table[@id='gvCoActionHistory']"):
                cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
                actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
Ejemplo n.º 9
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        Yields vote events (from parse_vote) and finally the Bill.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "https://legislature.idaho.gov/legislation/%s/" % session)
        # Per the usage below: table 0 holds the sponsor line ("by ..."),
        # table 1 the title, table 2 the action history rows.
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        # when a distinct short title was passed in, record it as well
        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            # engrossments/texts/amendments are versions; the rest are
            # supporting documents
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name,
                                      url=href,
                                      media_type="application/pdf")
            else:
                bill.add_document_link(note=name,
                                       url=href,
                                       media_type="application/pdf")

        def _split(string):
            # NOTE(review): the character class [,|AND] matches any one of
            # ',', '|', 'A', 'N', 'D' — presumably the intent was to split
            # on "," or the word "AND"; confirm before changing.
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if "COMMITTEE" in sponsors.upper():
                    bill.add_sponsorship(
                        name=sponsors.strip(),
                        entity_type="organization",
                        primary=True,
                        classification="primary",
                    )
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(
                                classification="primary",
                                name=person,
                                entity_type="person",
                                primary=True,
                            )

        actor = chamber
        # action rows omit the date when it repeats; carry the last seen one
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            # dates are "m/d"; append the year taken from the session
            # string (assumes session starts with the 4-digit year — TODO
            # confirm against the scraper's session format)
            date = datetime.datetime.strptime(date + "/" + session[0:4],
                                              "%m/%d/%Y").strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(actor, date, row[2], session,
                                           bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace("\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
Ejemplo n.º 10
0
    def scrape(self, session=None, chambers=None):
        """Scrape Ohio bills for *session*.

        Sessions before the 128th have no data; 128-130 are handled by the
        legacy scraper (old_scrape); 131+ use the undocumented solarapi
        JSON API at search-prod.lis.state.oh.us.  Yields Bill objects and
        vote events.
        """
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info("no session, using %s", session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            # normalize the API's chamber strings to openstates chambers
            chamber_dict = {
                "Senate": "upper",
                "House": "lower",
                "House of Representatives": "lower",
                "house": "lower",
                "senate": "upper",
            }

            # so presumably not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {
                "approved": True,
                "passed": True,
                "adopted": True,
                "true": True,
                "false": False,
                "failed": False,
                True: True,
                False: False,
            }

            # maps solarapi action codes to action classifications;
            # None means "leave unclassified"
            action_dict = {
                "ref_ctte_100": "referral-committee",
                "intro_100": "introduction",
                "intro_101": "introduction",
                "pass_300": "passage",
                "intro_110": "reading-1",
                "refer_210": "referral-committee",
                "crpt_301": None,
                "crpt_317": None,
                "concur_606": "passage",
                "pass_301": "passage",
                "refer_220": "referral-committee",
                "intro_102": ["introduction", "passage"],
                "intro_105": ["introduction", "passage"],
                "intro_ref_ctte_100": "referral-committee",
                "refer_209": None,
                "intro_108": ["introduction", "passage"],
                "intro_103": ["introduction", "passage"],
                "msg_reso_503": "passage",
                "intro_107": ["introduction", "passage"],
                "imm_consid_360": "passage",
                "refer_213": None,
                "adopt_reso_100": "passage",
                "adopt_reso_110": "passage",
                "msg_507": "amendment-passage",
                "confer_713": None,
                "concur_603": None,
                "confer_712": None,
                "msg_506": "amendment-failure",
                "receive_message_100": "passage",
                "motion_920": None,
                "concur_611": None,
                "confer_735": None,
                "third_429": None,
                "final_501": None,
                "concur_608": None,
                "infpass_217": "passage",
            }

            base_url = "https://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(
                session=session)
            legislators = self.get_legislator_ids(first_page)
            # pre-fetch the document collections once; they are attached
            # to individual bills later via add_document
            all_amendments = self.get_other_data_source(
                first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url,
                                                     "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url,
                                                      "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url,
                                                      "analysiss")

            for row in self.get_bill_rows(session):
                # NOTE(review): "spacer" is bound twice; the second
                # assignment overwrites the first (both unused).
                (
                    spacer,
                    number_link,
                    _ga,
                    title,
                    primary_sponsor,
                    status,
                    spacer,
                ) = row.xpath("td")

                # S.R.No.1 -> SR1
                bill_id = number_link.text_content().replace("No.", "")
                bill_id = bill_id.replace(".", "").replace(" ", "")
                # put one space back in between type and number
                bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

                title = title.text_content().strip()
                title = re.sub(r"^Title", "", title)

                chamber = "lower" if "H" in bill_id else "upper"
                classification = "bill" if "B" in bill_id else "resolution"

                if not title and session == "134" and bill_id == "HR 35":
                    # Exception for HR 35 which is a real bill
                    title = "No title provided"
                elif not title:
                    self.warning(f"no title for {bill_id}, skipping")
                    continue
                bill = Bill(
                    bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=classification,
                )
                bill.add_source(number_link.xpath("a/@href")[0])

                # known-broken bills get emitted without detail data
                if (session, bill_id) in BAD_BILLS:
                    self.logger.warning(
                        f"Skipping details for known bad bill {bill_id}")
                    yield bill
                    continue

                # get bill from API
                # NOTE(review): verify=False disables TLS certificate
                # verification for this request — confirm this is still
                # required for this host.
                bill_api_url = (
                    "https://search-prod.lis.state.oh.us/solarapi/v1/"
                    "general_assembly_{}/{}/{}/".format(
                        session,
                        "bills" if "B" in bill_id else "resolutions",
                        bill_id.lower().replace(" ", ""),
                    ))
                data = self.get(bill_api_url, verify=False).json()
                if len(data["items"]) == 0:
                    self.logger.warning(
                        "Data for bill {bill_id} has empty 'items' array,"
                        " cannot process related information".format(
                            bill_id=bill_id.lower().replace(" ", "")))
                    yield bill
                    continue

                # add title if no short title
                if not bill.title:
                    bill.title = data["items"][0]["longtitle"]
                bill.add_title(data["items"][0]["longtitle"], "long title")

                # this stuff is version-specific
                for version in data["items"]:
                    version_name = version["version"]
                    version_link = base_url + version["pdfDownloadLink"]
                    bill.add_version_link(version_name,
                                          version_link,
                                          media_type="application/pdf")

                # we'll use latest bill_version for everything else
                bill_version = data["items"][0]
                bill.add_source(bill_api_url)

                # subjects: each entry may carry a primary and an optional
                # secondary subject
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

                # actions: best-effort — a missing action document is
                # silently skipped
                try:
                    action_doc = self.get(base_url +
                                          bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    # API lists newest first; reverse into chronological order
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning(
                                "Unknown action {desc} with code {code}."
                                " Add it to the action_dict"
                                ".".format(desc=action_desc,
                                           code=action["actioncode"]))
                            action_type = None

                        # localize, then keep only the date portion
                        date = self._tz.localize(
                            datetime.datetime.strptime(action["datetime"],
                                                       "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date,
                                        chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill,
                                  base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill,
                                  base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill,
                                  base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill,
                                  base_url)

                # votes (floor)
                vote_url = base_url + bill_version["votes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not loading; skipping: {}".format(vote_url))
                    yield bill
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                # votes (committee)
                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not loading; skipping: {}".format(vote_url))
                    yield bill
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(
                        data["items"][0]["effective_date"], "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    # NOTE(review): %-m/%-d are glibc strftime extensions
                    # and fail on Windows — confirm deployment platform.
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(
                        effective_action,
                        effective_date,
                        chamber="executive",
                        classification=["became-law"],
                    )

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url + bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url + bill_version["disapprove"][0][
                        "link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError(
                            "Whoa, a disapprove! We've never"
                            " gotten one before."
                            " Go write some code to deal "
                            "with it: {}".format(disapprove_url))

                yield bill
Ejemplo n.º 11
0
    def parse_bill_status_page(self, url, page, list_sponsor, session):
        """Parse an MT bill status page into a Bill.

        list_sponsor is passed in to support proposed bills (aka
        "unintroduced") which have "LC XXXX" bill numbers -
        see 2007 HB 2... weird.

        Returns a (bill, votes) tuple.
        """
        parsed_url = urllib.parse.urlparse(url)
        parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
        if "P_BLTP_BILL_TYP_CD" in parsed_query:
            # normal bill
            bill_id = "{0} {1}".format(parsed_query["P_BLTP_BILL_TYP_CD"],
                                       parsed_query["P_BILL_NO1"])
        elif "P_BILL_DFT_NO5" in parsed_query:
            # proposed bill ("unintroduced"): first two chars are the draft
            # type, the remainder is the zero-padded bill number
            bill_id = "{0} {1}".format(
                parsed_query["P_BILL_DFT_NO5"][0:2],
                parsed_query["P_BILL_DFT_NO5"][2:6].lstrip("0"),
            )

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = page.xpath(xp).pop()
        except IndexError:
            title = page.xpath("//tr[1]/td[2]")[0].text_content()

        # Add bill type.
        _bill_id = bill_id.lower()
        if "b" in _bill_id:
            classification = "bill"
        elif "j" in _bill_id or "jr" in _bill_id:
            classification = "joint resolution"
        elif "cr" in _bill_id:
            classification = "concurrent resolution"
        elif "r" in _bill_id:
            classification = "resolution"
        elif "lc" in _bill_id:
            classification = "proposed bill"

        # chamber
        if _bill_id[0] == "h":
            chamber = "lower"
        elif _bill_id[0] == "s":
            chamber = "upper"
        else:
            # fall back to using the sponsor's chamber
            # used for proposed bills aka unintroduced aka LC bills
            if " HD " in list_sponsor:
                chamber = "lower"
            # BUG FIX: this was a second bare `if`, so its `else` branch
            # clobbered the "lower" chamber set just above whenever the
            # sponsor string contained " HD " but not " SD ".
            elif " SD " in list_sponsor:
                chamber = "upper"
            else:
                # a true fallback: some sponsors are organizations eg "Economic Affairs Interim Committee"
                chamber = "legislature"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=classification,
        )

        self.add_actions(bill, page)
        votes = self.add_votes(bill, page, url)

        tabledata = self._get_tabledata(page)

        # Add sponsor info.
        if "primary sponsor:" in tabledata and tabledata["primary sponsor:"][0]:
            bill.add_sponsorship(
                tabledata["primary sponsor:"][0],
                classification="primary",
                entity_type="person",
                primary=True,
            )
        elif "(" in list_sponsor:
            # use sponsor data from the bill listing, if it contains a party designation eg (R)
            # used for proposed bills aka unintroducd aka LC bills
            # grab everything before " (R) SD 30" in "John Esp (R) SD 30"
            sponsor_name_raw = re.search(r"(.+) \(", list_sponsor)[1]
            sponsor_name_raw = " ".join(sponsor_name_raw.split(
            ))  # eliminate extra whitespace in middle of name parts
            if sponsor_name_raw:
                bill.add_sponsorship(
                    sponsor_name_raw,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        elif "lc" in _bill_id:
            # probably the sponsor is an organization eg a committee, because LC bills can be sponsored by orgs
            # so just use the sponsor as listed from the index page
            if list_sponsor:
                bill.add_sponsorship(
                    list_sponsor,
                    classification="primary",
                    entity_type="organization",
                    primary=True,
                )

        # A various plus fields MT provides.
        plus_fields = [
            "requester",
            ("chapter number:", "chapter"),
            "transmittal date:",
            "drafter",
            "fiscal note probable:",
            "bill draft number:",
            "preintroduction required:",
            "by request of",
            "category:",
        ]

        for x in plus_fields:
            # a tuple is (table label, extras key); a bare string is both,
            # with spaces converted to underscores for the extras key
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(" ", "_")

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            # unwrap single-element lists so extras read cleanly
            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in page.xpath(xp):
            try:
                subj = tr.xpath("td")[0].text_content()
            except IndexError:
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(page, bill)

        return bill, list(votes)
Ejemplo n.º 12
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        # Normalize chamber names to the lower/upper vocabulary; any other
        # value passes through unchanged.
        chamber = {"house": "lower", "senate": "upper"}.get(
            chamber.lower(), chamber)

        # Fetch and parse the bill detail page.
        page = self.lxmlize(bill_detail_url)

        # Skip bills that haven't been transmitted to the other chamber yet.
        transmit_check = self.get_node(
            page,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None
                and "has not been transmitted" in transmit_check.strip()):
            self.logger.debug("Bill has not been transmitted to other chamber "
                              "... skipping {0}".format(bill_detail_url))
            return

        # Basic identifiers.
        bill_id = self.get_node(
            page, '//h1[contains(@class,"card-title float-left mr-4")]/text()')
        self.logger.debug(bill_id)

        description = self.get_node(
            page,
            '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
        )
        if description is not None:
            bill_title = description.strip()
        else:
            # No inline description; chase the "Long Description" link.
            long_desc_url = self.get_node(
                page, '//a[text()[contains(.,"Long Description")]]/@href')
            long_desc_page = self.lxmlize(long_desc_url)
            long_desc_text = self.get_node(
                long_desc_page, "//h1/"
                "following-sibling::p/text()")
            if long_desc_text is None:
                bill_title = "No title found."
                self.logger.warning("No title found for {}.".format(bill_id))
            else:
                bill_title = long_desc_text.strip()
        self.logger.debug(bill_title)

        # The second character of the bill id encodes its type.
        type_codes = {
            "F": "bill",
            "R": "resolution",
            "C": "concurrent resolution",
        }
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=type_codes[bill_id[1].upper()],
        )

        # Add source
        bill.add_source(bill_detail_url)

        for subject in self._subject_mapping[bill_id]:
            bill.add_subject(subject)

        # Get companion bill, if one is linked from the status table.
        companions = page.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                                '/a[starts-with(@href, "?")]/text()')
        companion = self.make_bill_id(companions[0]) if companions else None
        companion_chamber = self.chamber_from_bill(companion)
        if companion is not None:
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, page, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, page, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, page, chamber, version_list_url)

        yield bill
Ejemplo n.º 13
0
    def scrape(self, session=None):
        """Scrape all Indiana bills for *session* via the IGA API.

        Yields Bill objects, plus VoteEvents produced by
        self._process_votes.
        """
        # Maps each bill-id prefix to its OpenStates classification and the
        # URL path segment used to build the public bill page URL.
        self._bill_prefix_map = {
            "HB": {"type": "bill", "url_segment": "bills/house"},
            "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"},
            "HCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "HC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "SB": {"type": "bill", "url_segment": "bills/senate"},
            "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"},
            "SCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
            "SC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
        }

        api_base_url = "https://api.iga.in.gov"

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # we've put up a proxy service at this link
        # using our api key for pdf document access.

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            disp_bill_id = b["displayName"]

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

            title = bill_json["description"]
            if title == "NoneNone":
                title = None
            # sometimes description is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = (
                "lower" if bill_json["originChamber"].lower() == "house" else "upper"
            )
            bill_type = self._bill_prefix_map[bill_prefix]["type"]
            bill = Bill(
                disp_bill_id,
                legislative_session=session,
                chamber=original_chamber,
                title=title,
                classification=bill_type,
            )

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors
            for s in bill_json["authors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="author")
            for s in bill_json["coauthors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
            for s in bill_json["sponsors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
            for s in bill_json["cosponsors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link

            try:
                actions = client.get(
                    "bill_actions", session=session, bill_id=bill_id.lower()
                )
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace("T", " ")
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if "second reading" in d or "reread second time" in d:
                    action_type.append("reading-2")
                    reading = True

                if "third reading" in d or "reread third time" in d:
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                # parenthesized for clarity; `and` already binds tighter than
                # `or`, so behavior is unchanged
                if ("referred" in d and "committee on" in d) or (
                    "reassigned" in d and "committee on" in d
                ):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    # BUG FIX: was `if "fail" or "out of order" in d:` -- the
                    # bare string "fail" is always truthy, so every amendment
                    # action was wrongly tagged as amendment-failure.
                    if "fail" in d or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if "vetoed by the governor" in d:
                    action_type.append("executive-veto")

                if len(action_type) == 0:
                    # calling it other and moving on with a warning
                    self.logger.warning(
                        "Could not recognize an action in '{}'".format(action_desc)
                    )
                    action_type = None

                # renamed from `a` so the action dict loop variable isn't
                # shadowed by the add_action return value
                action_obj = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )
                if committee:
                    action_obj.add_related_entity(committee, entity_type="organization")

            # subjects
            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
            for subject in subjects:
                bill.add_subject(subject)

            # Abstract
            if bill_json["latestVersion"]["digest"]:
                bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")

            # put this behind a flag 2021-03-18 (openstates/issues#291)
            if not SCRAPE_WEB_VERSIONS:
                # votes
                yield from self._process_votes(
                    bill_json["latestVersion"]["rollcalls"],
                    disp_bill_id,
                    original_chamber,
                    session,
                )
                # versions
                self.deal_with_version(
                    bill_json["latestVersion"], bill, bill_id, original_chamber, session
                )
                for version in bill_json["versions"][::-1]:
                    self.deal_with_version(
                        version,
                        bill,
                        bill_id,
                        original_chamber,
                        session,
                    )
            else:
                self.scrape_web_versions(session, bill, bill_id)

            yield bill
Ejemplo n.º 14
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        """Scrape a single Alaska bill detail page.

        Yields the Bill plus any VoteEvents linked from its action journal.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        # Prefer the full title; fall back to the short title, else skip.
        title = doc.xpath('//span[text()="Title"]')[0].getparent()
        short_title = doc.xpath('//span[text()="Short Title "]')[0].getparent()

        if len(title) > 1 and title[1].text:
            title = title[1].text.strip().strip('"')
        elif len(short_title) > 1 and short_title[1].text:
            self.warning("Falling back to short title on {}".format(url))
            title = short_title[1].text.strip().strip('"')
        else:
            self.warning("skipping bill {}, no Title".format(url))
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = (doc.xpath('//span[contains(text(), "Sponsor(S)")]')
                     [0].getparent()[1].text)
        # Checks if there is a Sponsor string before matching
        if spons_str:
            sponsors_match = re.match(r"(SENATOR|REPRESENTATIVE)", spons_str)
            if sponsors_match:
                sponsors = spons_str.split(",")
                sponsor = sponsors[0].strip()

                # First sponsor is primary; the leading word is the title
                # (SENATOR/REPRESENTATIVE), so take the word after it.
                if sponsor:
                    bill.add_sponsorship(
                        sponsors[0].split()[1],
                        entity_type="person",
                        classification="primary",
                        primary=True,
                    )

                for sponsor in sponsors[1:]:
                    sponsor = sponsor.strip()
                    if sponsor:
                        bill.add_sponsorship(
                            sponsor,
                            entity_type="person",
                            classification="cosponsor",
                            primary=False,
                        )

            else:
                # Committee sponsorship
                spons_str = spons_str.strip()

                # BUG FIX: this used re.match, which anchors at the start of
                # the string; a pattern beginning with a space and ending in
                # `$` could therefore never match a committee name, so the
                # "(by request of the governor)" suffix was never applied.
                if re.search(r" BY REQUEST OF THE GOVERNOR$", spons_str):
                    spons_str = re.sub(r" BY REQUEST OF THE GOVERNOR$", "",
                                       spons_str).title()
                    spons_str = spons_str + " Committee (by request of the governor)"

                if spons_str:
                    bill.add_sponsorship(
                        spons_str,
                        entity_type="person",
                        classification="primary",
                        primary=True,
                    )

        # Get actions
        self._current_comm = None
        act_rows = doc.xpath("//div[@id='tab6_4']//tr")[1:]
        for row in act_rows:
            date, journal, action = row.xpath("td")
            action = action.text_content().strip()
            raw_chamber = action[0:3]
            journal_entry_number = journal.text_content()
            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  "%m/%d/%Y")
            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"
            else:
                # BUG FIX: act_chamber was left unbound when an action row
                # carried no chamber marker, raising UnboundLocalError below;
                # fall back to the bill's own chamber.
                act_chamber = chamber

            # Votes: a "Y<number>" tally in the action text signals a roll
            # call linked from the journal cell.
            if re.search(r"Y(\d+)", action):
                vote_href = journal.xpath(".//a/@href")
                if vote_href:
                    vote_href = vote_href[0].replace(" ", "")
                    yield from self.parse_vote(
                        bill,
                        journal_entry_number,
                        action,
                        act_chamber,
                        act_date,
                        vote_href,
                    )

            action, atype = self.clean_action(action)

            # Prefile actions embed their own date; prefer it.
            match = re.search(r"^Prefile released (\d+/\d+/\d+)$", action)
            if match:
                action = "Prefile released"
                act_date = datetime.datetime.strptime(match.group(1),
                                                      "%m/%d/%y")

            bill.add_action(
                action,
                chamber=act_chamber,
                date=act_date.strftime("%Y-%m-%d"),
                classification=atype,
            )

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions - to do
        text_list_url = (
            f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab1_4"
        )
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "/Text/")]'):
            name = link.text_content()
            text_url = link.get("href")
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents - to do
        doc_list_url = (
            f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab5_4"
        )
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        # de-dupe document links by href
        seen = set()
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib["href"]
            if h_name.strip() and h_href not in seen:
                bill.add_document_link(h_name, h_href)
                seen.add(h_href)

        yield bill
Ejemplo n.º 15
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        """Scrape a single South Dakota bill detail page.

        Yields the Bill plus any VoteEvents found in the action table.
        """
        page = self.lxmlize(url)

        # Classify the bill from its id prefix.
        if re.match(r"^(S|H)B ", bill_id):
            btype = ["bill"]
        elif re.match(r"(S|H)C ", bill_id):
            btype = ["commemoration"]
        elif re.match(r"(S|H)JR ", bill_id):
            btype = ["joint resolution"]
        elif re.match(r"(S|H)CR ", bill_id):
            btype = ["concurrent resolution"]
        else:
            btype = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=btype,
        )
        bill.add_source(url)

        # Versions: each row carries a date plus paired HTML and PDF links.
        version_rows = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillVersions"]'
            + "/section/table/tbody/tr"
        )
        assert len(version_rows) > 0
        for row in version_rows:
            (date,) = row.xpath('./td[@data-title="Date"]/text()')
            date = date.strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            (html_note,) = row.xpath('./td[@data-title="HTML"]/a/text()')
            (html_link,) = row.xpath('./td[@data-title="HTML"]/a/@href')
            (pdf_note,) = row.xpath('./td[@data-title="PDF"]/a/text()')
            (pdf_link,) = row.xpath('./td[@data-title="PDF"]/a/@href')

            # both links should describe the same version
            assert html_note == pdf_note
            note = html_note

            bill.add_version_link(
                note,
                html_link,
                date=date,
                media_type="text/html",
                on_duplicate="ignore",
            )
            bill.add_version_link(
                note,
                pdf_link,
                date=date,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # Sponsors: people link to /Legislators/, committees link to the
        # session committee pages; anything else is unexpected.
        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]'
            + '/label[contains(text(), "Sponsors:")]'
            + "/following-sibling::div[1]/p/a"
        )
        for link in sponsor_links:
            if link.attrib["href"].startswith("https://sdlegislature.gov/Legislators/"):
                sponsor_type = "person"
            elif link.attrib["href"].startswith(
                "https://sdlegislature.gov/Legislative_Session/Committees"
            ):
                sponsor_type = "organization"
            else:
                raise ScrapeError(
                    "Found unexpected sponsor, URL: " + link.attrib["href"]
                )
            bill.add_sponsorship(
                link.text,
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )

        actor = chamber
        use_row = False

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows, that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == "":
                self.debug("Skipping action table row that is completely empty")
                continue

            # Skip everything up to and including the header row.
            if "Date" in row.text_content() and "Action" in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith("First read"):
                atypes.append("introduction")
                atypes.append("reading-1")

            if re.match(r"Signed by (?:the\s)*Governor", action, re.IGNORECASE):
                atypes.append("executive-signature")
                actor = "executive"

            match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)", action)
            if match:
                # full-chamber passage has no "committee-" prefix
                if match.group(1) in ["Senate", "House of Representatives"]:
                    first = ""
                else:
                    first = "committee-"
                if match.group(3).lower() == "passed":
                    second = "passage"
                elif match.group(3).lower() == "failed":
                    second = "failure"
                atypes.append("%s%s" % (first, second))

            if "referred to" in action.lower():
                atypes.append("referral-committee")

            if "Motion to amend, Passed Amendment" in action:
                atypes.append("amendment-introduction")
                atypes.append("amendment-passage")
                if row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]'):
                    amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0]
                    version_name = amd.xpath("string(.)")
                    version_url = amd.xpath("@href")[0]
                    if "htm" in version_url:
                        mimetype = "text/html"
                    elif "pdf" in version_url:
                        mimetype = "application/pdf"
                    else:
                        # BUG FIX: mimetype was left unbound for unexpected
                        # extensions, raising UnboundLocalError below; warn
                        # and skip the link instead of crashing.
                        mimetype = None
                        self.warning(
                            "Unknown amendment media type: %s" % version_url)
                    if mimetype:
                        bill.add_version_link(
                            version_name,
                            version_url,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            if "Veto override, Passed" in action:
                atypes.append("veto-override-passage")
            elif "Veto override, Failed" in action:
                atypes.append("veto-override-failure")

            if "Delivered to the Governor" in action:
                atypes.append("executive-receipt")

            # First reading establishes which chamber acts next.
            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == "Senate":
                    actor = "upper"
                else:
                    actor = "lower"

            date = row.xpath("string(td[1])").strip()
            match = re.match(r"\d{2}/\d{2}/\d{4}", date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib["href"])

            if action:
                bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Ejemplo n.º 16
0
    def parse_bill_status_page(self, url, page, session, chamber):
        """Parse a single MT bill status page into a Bill and its votes.

        Parameters:
            url: status-page URL; the bill id is reconstructed from its
                query string (P_BLTP_BILL_TYP_CD + P_BILL_NO1).
            page: parsed lxml document for that URL.
            session: legislative session identifier.
            chamber: originating chamber ("upper"/"lower").

        Returns:
            (bill, votes) where ``bill`` is the populated Bill object and
            ``votes`` is the list produced by ``self.add_votes``.
        """
        # see 2007 HB 2... weird.
        parsed_url = urllib.parse.urlparse(url)
        parsed_query = dict(urllib.parse.parse_qsl(parsed_url.query))
        bill_id = "{0} {1}".format(
            parsed_query["P_BLTP_BILL_TYP_CD"], parsed_query["P_BILL_NO1"]
        )

        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = page.xpath(xp).pop()
        except IndexError:
            # Some pages (e.g. 2007 HB 2) lack the labeled cell; fall back
            # to the first row's second column.
            title = page.xpath("//tr[1]/td[2]")[0].text_content()

        # Classify by substring of the lowercased bill id.  Order matters:
        # "b" must be checked before "r" so HB/SB are bills, and "j" before
        # "cr"/"r" so joint resolutions are not mis-classified.
        _bill_id = bill_id.lower()
        if "b" in _bill_id:
            classification = "bill"
        elif "j" in _bill_id:
            # NOTE: the former `or "jr" in _bill_id` clause was dead code --
            # any id containing "jr" already contains "j".
            classification = "joint resolution"
        elif "cr" in _bill_id:
            classification = "concurrent resolution"
        elif "r" in _bill_id:
            classification = "resolution"
        else:
            # BUG FIX: previously no fallback existed, so an unrecognized
            # bill-type code raised NameError below.  Default to "bill".
            classification = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=classification,
        )

        self.add_actions(bill, page)
        votes = self.add_votes(bill, page, url)

        tabledata = self._get_tabledata(page)

        # Add sponsor info.
        bill.add_sponsorship(
            tabledata["primary sponsor:"][0],
            classification="primary",
            entity_type="person",
            primary=True,
        )

        # Miscellaneous extra fields MT provides.  A tuple maps a table key
        # to a different extras key; a bare string is used for both (with
        # spaces converted to underscores).
        plus_fields = [
            "requester",
            ("chapter number:", "chapter"),
            "transmittal date:",
            "drafter",
            "fiscal note probable:",
            "bill draft number:",
            "preintroduction required:",
            "by request of",
            "category:",
        ]

        for x in plus_fields:
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = key = x
                key = key.replace(" ", "_")

            try:
                val = tabledata[_key]
            except KeyError:
                # Field not present on this page; skip silently.
                continue

            # Unwrap single-element lists for cleaner extras values.
            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects from the Revenue/Approp. table.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        subjects = []
        for tr in page.xpath(xp):
            try:
                subj = tr.xpath("td")[0].text_content()
            except IndexError:
                # Header rows have no <td>; skip them.
                continue
            subjects.append(subj)

        for s in subjects:
            bill.add_subject(s)

        self.add_fiscal_notes(page, bill)

        return bill, list(votes)