Example #1
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info("no session, using %s", session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {
                "Senate": "upper",
                "House": "lower",
                "House of Representatives": "lower",
                "house": "lower",
                "senate": "upper",
            }

            # so presumably not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {
                "approved": True,
                "passed": True,
                "adopted": True,
                "true": True,
                "false": False,
                "failed": False,
                True: True,
                False: False,
            }

            action_dict = {
                "ref_ctte_100": "referral-committee",
                "intro_100": "introduction",
                "intro_101": "introduction",
                "pass_300": "passage",
                "intro_110": "reading-1",
                "refer_210": "referral-committee",
                "crpt_301": None,
                "crpt_317": None,
                "concur_606": "passage",
                "pass_301": "passage",
                "refer_220": "referral-committee",
                "intro_102": ["introduction", "passage"],
                "intro_105": ["introduction", "passage"],
                "intro_ref_ctte_100": "referral-committee",
                "refer_209": None,
                "intro_108": ["introduction", "passage"],
                "intro_103": ["introduction", "passage"],
                "msg_reso_503": "passage",
                "intro_107": ["introduction", "passage"],
                "imm_consid_360": "passage",
                "refer_213": None,
                "adopt_reso_100": "passage",
                "adopt_reso_110": "passage",
                "msg_507": "amendment-passage",
                "confer_713": None,
                "concur_603": None,
                "confer_712": None,
                "msg_506": "amendment-failure",
                "receive_message_100": "passage",
                "motion_920": None,
                "concur_611": None,
                "confer_735": None,
                "third_429": None,
                "final_501": None,
                "concur_608": None,
                "infpass_217": "passage",
            }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(
                session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(
                first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url,
                                                     "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url,
                                                      "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url,
                                                      "analysiss")

            for row in self.get_bill_rows(session):
                (
                    spacer,
                    number_link,
                    _ga,
                    title,
                    primary_sponsor,
                    status,
                    spacer,
                ) = row.xpath("td")

                # S.R.No.1 -> SR1
                bill_id = number_link.text_content().replace("No.", "")
                bill_id = bill_id.replace(".", "").replace(" ", "")
                # put one space back in between type and number
                bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

                title = title.text_content().strip()
                title = re.sub(r"^Title", "", title)

                chamber = "lower" if "H" in bill_id else "upper"
                classification = "bill" if "B" in bill_id else "resolution"

                bill = Bill(
                    bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=classification,
                )
                bill.add_source(number_link.xpath("a/@href")[0])

                # get bill from API
                bill_api_url = (
                    "http://search-prod.lis.state.oh.us/solarapi/v1/"
                    "general_assembly_{}/{}/{}/".format(
                        session,
                        "bills" if "B" in bill_id else "resolutions",
                        bill_id.lower().replace(" ", ""),
                    ))
                data = self.get(bill_api_url).json()
                if len(data["items"]) == 0:
                    self.logger.warning(
                        "Data for bill {bill_id} has empty 'items' array,"
                        " cannot process related information".format(
                            bill_id=bill_id.lower().replace(" ", "")))
                    yield bill
                    continue

                # add title if no short title
                if not bill.title:
                    bill.title = data["items"][0]["longtitle"]
                bill.add_title(data["items"][0]["longtitle"], "long title")

                # this stuff is version-specific
                for version in data["items"]:
                    version_name = version["version"]
                    version_link = base_url + version["pdfDownloadLink"]
                    bill.add_version_link(version_name,
                                          version_link,
                                          media_type="application/pdf")

                # we'll use latest bill_version for everything else
                bill_version = data["items"][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

                try:
                    action_doc = self.get(base_url +
                                          bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning(
                                "Unknown action {desc} with code {code}."
                                " Add it to the action_dict"
                                ".".format(desc=action_desc,
                                           code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(
                            datetime.datetime.strptime(action["datetime"],
                                                       "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date,
                                        chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill,
                                  base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill,
                                  base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill,
                                  base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill,
                                  base_url)

                # votes
                vote_url = base_url + bill_version["votes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not loading; skipping: {}".format(vote_url))
                    yield bill
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning(
                        "Vote page not loading; skipping: {}".format(vote_url))
                    yield bill
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(
                    votes,
                    vote_url,
                    base_url,
                    bill,
                    legislators,
                    chamber_dict,
                    vote_results,
                )

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(
                        data["items"][0]["effective_date"], "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(
                        effective_action,
                        effective_date,
                        chamber="executive",
                        classification=["became-law"],
                    )

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url + bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url + bill_version["disapprove"][0][
                        "link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError(
                            "Whoa, a disapprove! We've never"
                            " gotten one before."
                            " Go write some code to deal "
                            "with it: {}".format(disapprove_url))

                yield bill
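
The examples in this collection all follow the same basic pattern: construct a Bill, attach sources, sponsorships, actions, and version links, then yield it. Below is a minimal sketch of that pattern using only the calls that appear in these examples; the identifier, URLs, dates, and sponsor name are placeholder values, and the import path is an assumption (older scrapers imported Bill from pupa.scrape instead).

# Minimal sketch of the shared Bill-building pattern (placeholder data only).
from openstates.scrape import Bill  # assumed import path; varies by framework version

def scrape_example(session):
    bill = Bill(
        "HB 1",  # placeholder identifier
        legislative_session=session,
        chamber="lower",
        title="A placeholder title",
        classification="bill",
    )
    bill.add_source("https://example.org/bills/hb1")  # placeholder URL
    bill.add_sponsorship(
        "Jane Doe",  # placeholder sponsor
        classification="primary",
        entity_type="person",
        primary=True,
    )
    bill.add_action(
        "Introduced",
        "2024-01-01",  # placeholder date
        chamber="lower",
        classification="introduction",
    )
    bill.add_version_link(
        "Introduced",
        "https://example.org/bills/hb1.pdf",  # placeholder URL
        media_type="application/pdf",
        on_duplicate="ignore",
    )
    yield bill
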
Example #2
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        chamber = "lower" if chamber.lower() == "house" else chamber
        chamber = "upper" if chamber.lower() == "senate" else chamber

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Check if bill hasn't been transmitted to the other chamber yet
        transmit_check = self.get_node(
            doc,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None
                and "has not been transmitted" in transmit_check.strip()):
            self.logger.debug("Bill has not been transmitted to other chamber "
                              "... skipping {0}".format(bill_detail_url))
            return

        # Get the basic parts of the bill
        bill_id = self.get_node(
            doc, '//h1[contains(@class,"card-title float-left mr-4")]/text()')
        self.logger.debug(bill_id)
        bill_title_text = self.get_node(
            doc,
            '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
        )
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            long_desc_url = self.get_node(
                doc, '//a[text()[contains(.,"Long Description")]]/@href')
            long_desc_page = self.lxmlize(long_desc_url)
            long_desc_text = self.get_node(
                long_desc_page, "//h1/"
                "following-sibling::p/text()")
            if long_desc_text is not None:
                bill_title = long_desc_text.strip()
            else:
                bill_title = "No title found."
                self.logger.warning("No title found for {}.".format(bill_id))
        self.logger.debug(bill_title)
        bill_type = {
            "F": "bill",
            "R": "resolution",
            "C": "concurrent resolution"
        }[bill_id[1].upper()]
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        # Add source
        bill.add_source(bill_detail_url)

        for subject in self._subject_mapping[bill_id]:
            bill.add_subject(subject)

        # Get companion bill.
        companion = doc.xpath('//table[@class="status_info"]//tr[1]/td[2]'
                              '/a[starts-with(@href, "?")]/text()')
        companion = self.make_bill_id(
            companion[0]) if len(companion) > 0 else None
        if companion is not None:
            companion_chamber = self.chamber_from_bill(companion)
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        yield bill
Example #3
    def scrape_bill(self, chamber, session, bill_id):
        # there will be a space in bill_id if we're doing a one-off bill scrape
        # convert HB 102 into H102
        if " " in bill_id:
            bill_id = bill_id[0] + bill_id.split(" ")[-1]

        # if chamber comes in as House/Senate convert to lower/upper
        if chamber == "Senate":
            chamber = "upper"
        elif chamber == "House":
            chamber = "lower"

        bill_detail_url = (
            "http://www.ncleg.net/gascripts/"
            "BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all"
        ) % (session, bill_id)

        # parse the bill data page, finding the latest html text
        data = self.get(bill_detail_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
        if "Joint Resolution" in title_div_txt:
            bill_type = "joint resolution"
            bill_id = bill_id[0] + "JR " + bill_id[1:]
        elif "Resolution" in title_div_txt:
            bill_type = "resolution"
            bill_id = bill_id[0] + "R " + bill_id[1:]
        elif "Bill" in title_div_txt:
            bill_type = "bill"
            bill_id = bill_id[0] + "B " + bill_id[1:]

        bill_title = doc.xpath("//main//div[@class='col-12'][1]")[0]
        bill_title = bill_title.text_content().strip()

        # For special cases where bill title is blank, a new title is created using Bill ID
        if not bill_title:
            bill_title = bill_id.replace(" ", "")

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=bill_title,
            chamber=chamber,
            classification=bill_type,
        )
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to current version)
        if chamber == "lower":
            link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
        else:
            link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
        for vlink in doc.xpath(link_xpath)[1:]:
            # get the name from the PDF link...
            version_name = vlink.text.replace("\xa0", " ")
            version_url = vlink.attrib["href"]

            media_type = "text/html"
            if version_url.lower().endswith(".pdf"):
                media_type = "application/pdf"

            bill.add_version_link(
                version_name, version_url, media_type=media_type, on_duplicate="ignore"
            )

        # rows with 'Adopted' in the text and an amendment link; skip failed amendments
        for row in doc.xpath(
            '//div[@class="card-body"]/div[contains(., "Adopted")'
            ' and contains(@class,"row")]//a[@title="Amendment"]'
        ):
            version_url = row.xpath("@href")[0]
            version_name = row.xpath("string(.)").strip()
            bill.add_version_link(
                version_name,
                version_url,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        # sponsors
        spon_row = doc.xpath(
            '//div[contains(text(), "Sponsors")]/following-sibling::div'
        )[0]
        # first sponsors are primary, until we see (Primary)
        spon_type = "primary"
        spon_lines = spon_row.text_content().replace("\r\n", ";").replace("\n", ";")
        for leg in spon_lines.split(";"):
            name = leg.replace("\xa0", " ").strip()
            if name.startswith("(Primary)") or name.endswith("(Primary)"):
                name = name.replace("(Primary)", "").strip()
                spon_type = "cosponsor"
            if not name:
                continue
            bill.add_sponsorship(
                name,
                classification=spon_type,
                entity_type="person",
                primary=(spon_type == "primary"),
            )

        # keywords
        kw_row = doc.xpath(
            '//div[contains(text(), "Keywords:")]/following-sibling::div'
        )[0]
        for subject in kw_row.text_content().split(", "):
            bill.add_subject(subject)

        # actions
        action_tr_xpath = (
            '//h6[contains(text(), "History")]'
            '/ancestor::div[contains(@class, "gray-card")]'
            '//div[contains(@class, "card-body")]'
            '/div[@class="row"]'
        )

        # skip two header rows
        for row in doc.xpath(action_tr_xpath):
            cols = row.xpath("div")
            act_date = cols[1].text
            actor = cols[3].text or ""
            # if text is blank, try diving in
            action = (cols[5].text or "").strip() or cols[5].text_content().strip()

            if act_date is None:
                search_action_date = action.split()
                for act in search_action_date:
                    try:
                        if "/" in act:
                            act_date = dt.datetime.strptime(act, "%m/%d/%Y").strftime(
                                "%Y-%m-%d"
                            )
                    except ValueError:
                        raise Exception("No Action Date Provided")
            else:
                act_date = dt.datetime.strptime(act_date, "%m/%d/%Y").strftime(
                    "%Y-%m-%d"
                )

            if actor == "Senate":
                actor = "upper"
            elif actor == "House":
                actor = "lower"
            else:
                actor = "executive"

            for pattern, atype in self._action_classifiers.items():
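                # first matching prefix wins (break keeps its atype); the for/else falls back to None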
                if action.startswith(pattern):
                    break
            else:
                atype = None
            if act_date is not None:
                bill.add_action(action, act_date, chamber=actor, classification=atype)

        # TODO: Fix vote scraper
        for row in doc.xpath("//h6[@id='vote-header']"):
            yield from self.scrape_votes(bill, doc)

        # For archived votes
        if session in ["1997", "1999"]:
            yield from self.add_archived_votes(bill, bill_id)

        yield bill
Example #4
    def scrape(self, session=None):
        for category in self._categories:
            leg_listing_url = (self._API_BASE_URL +
                               f"BulkData/{category['categoryId']}/{session}")
            resp = self.post(leg_listing_url,
                             headers=self._headers,
                             verify=False)
            resp.raise_for_status()
            leg_listing = resp.json()

            for leg in leg_listing:

                bill = Bill(
                    leg["legislationNumber"],
                    legislative_session=session,
                    title=leg["title"],
                    classification=category["name"],
                )
                bill.add_source(leg_listing_url)
                bill_url = (
                    f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
                )
                bill.add_source(bill_url)

                if leg["lawNumber"]:
                    bill.extras["lawNumber"] = leg["lawNumber"]

                # Actions
                for hist in leg["legislationHistory"]:
                    hist_date = datetime.datetime.strptime(
                        hist["actionDate"], "%b %d, %Y")
                    hist_date = self._TZ.localize(hist_date)
                    hist_action = hist["actionDescription"]
                    if hist_action.split()[0] in [
                            "OtherAmendment", "OtherMotion"
                    ]:
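                        # strip the leading "Other" (first five characters), leaving e.g. "Amendment ..."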
                        hist_action = hist_action[5:]
                    hist_class = self.classify_action(hist_action)

                    if "mayor" in hist_action.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"
                    bill.add_action(hist_action,
                                    hist_date,
                                    classification=hist_class,
                                    chamber=actor)

                    # Documents with download links
                    if hist["downloadURL"] and ("download"
                                                in hist["downloadURL"]):
                        download = hist["downloadURL"]
                        if not download.startswith("http"):
                            download = "https://lims.dccouncil.us/" + download

                        mimetype = ("application/pdf"
                                    if download.endswith("pdf") else None)
                        is_version = False
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in download.lower():
                                is_version = True
                                doc_type = vt

                        if "amendment" in download.lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                hist["actionDescription"],
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                # Grabs Legislation details
                leg_details_url = (
                    self._API_BASE_URL +
                    f"LegislationDetails/{leg['legislationNumber']}")
                details_resp = self.get(leg_details_url,
                                        headers=self._headers,
                                        verify=False)
                details_resp.raise_for_status()
                leg_details = details_resp.json()

                # Sponsors
                for i in leg_details["introducers"]:
                    name = i["memberName"]
                    bill.add_sponsorship(
                        name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                # Co-sponsor
                if leg_details["coSponsors"]:
                    for cs in leg_details["coSponsors"]:
                        name = i["memberName"]
                        bill.add_sponsorship(
                            name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=False,
                        )

                # Committee Hearing Doc
                for commHearing in leg_details["committeeHearing"]:
                    if commHearing["hearingRecord"]:
                        bill.add_document_link(
                            commHearing["hearingType"],
                            commHearing["hearingRecord"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                for committeeMarkup in leg_details["committeeMarkup"]:
                    if committeeMarkup["committeeReport"]:
                        bill.add_document_link(
                            "Committee Markup",
                            committeeMarkup["committeeReport"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                # Actions and Votes
                if leg_details["actions"]:
                    # To prevent duplicate votes
                    vote_ids = []
                    for act in leg_details["actions"]:
                        action_name = act["action"]
                        action_date = datetime.datetime.strptime(
                            act["actionDate"][:10], "%Y-%m-%d")
                        action_date = self._TZ.localize(action_date)

                        if action_name.split()[0] == "Other":
                            action_name = " ".join(action_name.split()[1:])

                        if "mayor" in action_name.lower():
                            actor = "executive"
                        else:
                            actor = "legislature"

                        # Documents and Versions
                        if act["attachment"]:
                            mimetype = ("application/pdf"
                                        if act["attachment"].endswith("pdf")
                                        else None)
                            is_version = False
                            # figure out if it's a version from type/name
                            possible_version_types = [
                                "SignedAct",
                                "Introduction",
                                "Enrollment",
                                "Engrossment",
                            ]
                            for vt in possible_version_types:
                                if vt.lower() in act["attachment"].lower():
                                    is_version = True
                                    doc_type = vt

                            if "amendment" in act["attachment"].lower():
                                doc_type = "Amendment"

                            if is_version:
                                bill.add_version_link(
                                    doc_type,
                                    act["attachment"],
                                    media_type=mimetype,
                                    on_duplicate="ignore",
                                )
                            else:
                                bill.add_document_link(
                                    doc_type,
                                    act["attachment"],
                                    media_type=mimetype,
                                    on_duplicate="ignore",
                                )

                        # Votes
                        if act["voteDetails"]:
                            result = act["voteDetails"]["voteResult"]
                            if result:
                                status = self._vote_statuses[result.lower()]
                                id_text = (str(leg["legislationNumber"]) +
                                           "-" + action_name + "-" + result)
                                if id_text not in vote_ids:
                                    vote_ids.append(id_text)
                                    action_class = self.classify_action(
                                        action_name)
                                    v = VoteEvent(
                                        identifier=id_text,
                                        chamber=actor,
                                        start_date=action_date,
                                        motion_text=action_name,
                                        result=status,
                                        classification=action_class,
                                        bill=bill,
                                    )
                                    v.add_source(leg_listing_url)

                                    yes_count = no_count = absent_count = abstain_count = other_count = 0
                                    for leg_vote in act["voteDetails"][
                                            "votes"]:
                                        mem_name = leg_vote["councilMember"]
                                        if leg_vote["vote"] == "Yes":
                                            yes_count += 1
                                            v.yes(mem_name)
                                        elif leg_vote["vote"] == "No":
                                            no_count += 1
                                            v.no(mem_name)
                                        elif leg_vote["vote"] == "Absent":
                                            absent_count += 1
                                            v.vote("absent", mem_name)
                                        elif leg_vote["vote"] == "Recused":
                                            v.vote("abstain", mem_name)
                                            abstain_count += 1
                                        elif leg_vote["vote"] == "Present":
                                            v.vote("other", mem_name)
                                            other_count += 1
                                        else:
                                            # In case anything new pops up
                                            other_count += 1
                                            v.vote("other", mem_name)

                                    v.set_count("yes", yes_count)
                                    v.set_count("no", no_count)
                                    v.set_count("absent", absent_count)
                                    v.set_count("abstain", abstain_count)
                                    v.set_count("other", other_count)
                                    yield v

                yield bill
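
The vote handling in this example reduces to constructing a VoteEvent, recording each member's vote, and setting the totals. A minimal sketch of just that step follows; it reuses names from the surrounding loop (bill, actor, action_date, action_name, status, leg_listing_url), the identifier and member names are placeholders, and VoteEvent is assumed to be the class this scraper already imports.

# Minimal VoteEvent sketch (placeholder identifier and member names).
v = VoteEvent(
    identifier="B24-0001-Final Reading-APPROVED",  # placeholder id_text
    chamber=actor,
    start_date=action_date,
    motion_text=action_name,
    result=status,  # whatever self._vote_statuses maps to, as above
    classification="passage",  # placeholder classification
    bill=bill,
)
v.add_source(leg_listing_url)
v.yes("Councilmember A")  # placeholder member names
v.no("Councilmember B")
v.vote("absent", "Councilmember C")
v.set_count("yes", 1)
v.set_count("no", 1)
v.set_count("absent", 1)
yield v
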
Example #5
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "https://legislature.idaho.gov/legislation/%s/" % session
        )
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(
                    note=name, url=href, media_type="application/pdf"
                )

        def _split(string):
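            # crude split of a sponsor list on comma / "AND" separators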
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if "COMMITTEE" in sponsors.upper():
                    bill.add_sponsorship(
                        name=sponsors.strip(),
                        entity_type="organization",
                        primary=True,
                        classification="primary",
                    )
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(
                                classification="primary",
                                name=person,
                                entity_type="person",
                                primary=True,
                            )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(
                date + "/" + session[0:4], "%m/%d/%Y"
            ).strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(
                    actor, date, row[2], session, bill_id, chamber, url
                )
                # bill.add_vote_event(vote)
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u"\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
Example #6
    def scrape_chamber(self, chamber, session):
        if int(session) < 2017:
            legacy = NHLegacyBillScraper(self.metadata, self.datadir)
            yield from legacy.scrape(chamber, session)
            # This throws an error because object_count isn't being properly incremented,
            # even though it saves fine. So fake the output_names
            self.output_names = ["1"]
            return

        # bill basics
        self.bills = {}  # LSR->Bill
        self.bills_by_id = {}  # need a second table to attach votes
        self.versions_by_lsr = {}  # mapping of bill ID to lsr
        self.amendments_by_lsr = {}

        # pre load the mapping table of LSR -> version id
        self.scrape_version_ids()
        self.scrape_amendments()

        last_line = []
        for line in (self.get(
                f"http://www.gencourt.state.nh.us/dynamicdatadump/LSRs.txt?x={self.cachebreaker}"
        ).content.decode("utf-8").split("\n")):
            line = line.split("|")
            if len(line) < 1:
                continue

            if len(line) < 36:
                if len(last_line + line[1:]) == 36:
                    # combine two lines for processing
                    # (skip an empty entry at beginning of second line)
                    line = last_line + line[1:]
                    self.warning("used bad line")
                else:
                    # skip this line, maybe we'll use it later
                    self.warning("bad line: %s" % "|".join(line))
                    last_line = line
                    continue
            session_yr = line[0]
            lsr = line[1]
            title = line[2]
            body = line[3]
            # type_num = line[4]
            expanded_bill_id = line[9]
            bill_id = line[10]

            if body == body_code[chamber] and session_yr == session:
                if expanded_bill_id.startswith("CACR"):
                    bill_type = "constitutional amendment"
                elif expanded_bill_id.startswith("PET"):
                    bill_type = "petition"
                elif expanded_bill_id.startswith("AR") and bill_id.startswith(
                        "CACR"):
                    bill_type = "constitutional amendment"
                elif expanded_bill_id.startswith(
                        "SSSB") or expanded_bill_id.startswith("SSHB"):
                    # special session house/senate bills
                    bill_type = "bill"
                else:
                    bill_type = bill_type_map[expanded_bill_id.split(" ")[0][1:]]

                if title.startswith("("):
                    title = title.split(")", 1)[1].strip()

                self.bills[lsr] = Bill(
                    legislative_session=session,
                    chamber=chamber,
                    identifier=bill_id,
                    title=title,
                    classification=bill_type,
                )

                # check to see if resolution, process versions by getting lsr off link on the bill source page
                if re.match(r"^.R\d+", bill_id):
                    # ex: HR 1 is lsr=847 but version id=838
                    resolution_url = (
                        "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/bill_status.aspx?"
                        + "lsr={}&sy={}&txtsessionyear={}".format(
                            lsr, session, session))
                    resolution_page = self.get(
                        resolution_url,
                        allow_redirects=True).content.decode("utf-8")
                    page = lxml.html.fromstring(resolution_page)
                    version_href = page.xpath("//a[2]/@href")[1]
                    true_version = re.search(r"id=(\d+)&", version_href)[1]
                    self.versions_by_lsr[lsr] = true_version

                # http://www.gencourt.state.nh.us/bill_status/billText.aspx?sy=2017&id=95&txtFormat=html
                # or if 2022 bills
                # http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/billText.aspx?id=1410&txtFormat=html&sy=2022
                if lsr in self.versions_by_lsr:
                    version_id = self.versions_by_lsr[lsr]
                    version_url = (
                        "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/"
                        "billText.aspx?sy={}&id={}&txtFormat=html".format(
                            session, version_id))

                    pdf_version_url = (
                        "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/"
                        "billText.aspx?sy={}&id={}&txtFormat=pdf&v=current".
                        format(session, version_id))
                    latest_version_name = "latest version"
                    self.bills[lsr].add_version_link(
                        note=latest_version_name,
                        url=version_url,
                        media_type="text/html",
                    )
                    self.bills[lsr].add_version_link(
                        note=latest_version_name,
                        url=pdf_version_url,
                        media_type="application/pdf",
                    )

                # http://gencourt.state.nh.us/bill_status/billtext.aspx?sy=2017&txtFormat=amend&id=2017-0464S
                if lsr in self.amendments_by_lsr:
                    amendment_id = self.amendments_by_lsr[lsr]
                    amendment_url = (
                        "http://www.gencourt.state.nh.us/bill_status/legacy/bs2016/"
                        "billText.aspx?sy={}&id={}&txtFormat=amend".format(
                            session, amendment_id))
                    amendment_name = "Amendment #{}".format(amendment_id)

                    self.bills[lsr].add_version_link(
                        note=amendment_name,
                        url=amendment_url,
                        media_type="application/pdf",
                    )

                self.bills_by_id[bill_id] = self.bills[lsr]

        # load legislators
        self.legislators = {}
        for line in (self.get(
                "http://www.gencourt.state.nh.us/dynamicdatadump/legislators.txt?x={}"
                .format(
                    self.cachebreaker)).content.decode("utf-8").split("\n")):
            if len(line) < 2:
                continue

            line = line.split("|")
            employee_num = line[0].replace("\ufeff", "")

            # first, last, middle
            if len(line) > 2:
                name = "%s %s %s" % (line[2], line[3], line[1])
            else:
                name = "%s %s" % (line[2], line[1])

            self.legislators[employee_num] = {"name": name, "seat": line[5]}
            # body = line[4]

        # sponsors
        for line in (self.get(
                f"http://www.gencourt.state.nh.us/dynamicdatadump/LsrSponsors.txt?x={self.cachebreaker}"
        ).content.decode("utf-8").split("\n")):
            if len(line) < 1:
                continue

            session_yr, lsr, _seq, employee, primary = line.strip().split("|")
            lsr = lsr.zfill(4)
            if session_yr == session and lsr in self.bills:
                sp_type = "primary" if primary == "1" else "cosponsor"
                try:
                    # Removes extra spaces in names
                    sponsor_name = self.legislators[employee]["name"].strip()
                    sponsor_name = " ".join(sponsor_name.split())
                    self.bills[lsr].add_sponsorship(
                        classification=sp_type,
                        name=sponsor_name,
                        entity_type="person",
                        primary=True if sp_type == "primary" else False,
                    )
                    self.bills[lsr].extras = {
                        "_code": self.legislators[employee]["seat"]
                    }
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)

        # actions
        for line in (self.get(
                f"http://www.gencourt.state.nh.us/dynamicdatadump/Docket.txt?x={self.cachebreaker}"
        ).content.decode("utf-8").split("\n")):
            if len(line) < 1:
                continue
            # a few blank/irregular lines, irritating
            if "|" not in line:
                continue

            (session_yr, lsr, timestamp, bill_id, body, action,
             _) = line.split("|")

            if session_yr == session and lsr in self.bills:
                actor = "lower" if body == "H" else "upper"
                time = dt.datetime.strptime(timestamp, "%m/%d/%Y %H:%M:%S %p")
                action = action.strip()
                atype = classify_action(action)
                self.bills[lsr].add_action(
                    chamber=actor,
                    description=action,
                    date=time.strftime("%Y-%m-%d"),
                    classification=atype,
                )
                amendment_id = extract_amendment_id(action)
                if amendment_id:
                    self.bills[lsr].add_document_link(
                        note="amendment %s" % amendment_id,
                        url=AMENDMENT_URL % amendment_id,
                        on_duplicate="ignore",
                    )

        yield from self.scrape_votes(session)

        # save all bills
        for bill in self.bills:
            # bill.add_source(zip_url)
            self.add_source(self.bills[bill], bill, session)
            yield self.bills[bill]
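
All of the New Hampshire data above comes from pipe-delimited dump files. A minimal sketch of that parsing pattern follows; the URL is the one used above minus the cache-breaker parameter, the field positions are taken from the LSR handling above, and the short-line guard is a simplified stand-in for the line-joining logic in the example.

# Minimal sketch of the pipe-delimited dump parsing used above.
resp = self.get("http://www.gencourt.state.nh.us/dynamicdatadump/LSRs.txt")
for line in resp.content.decode("utf-8").split("\n"):
    if "|" not in line:  # skip blank/irregular lines
        continue
    fields = line.split("|")
    if len(fields) < 11:  # truncated line; the example above joins these instead
        continue
    session_yr, lsr, title, body = fields[0], fields[1], fields[2], fields[3]
    bill_id = fields[10]
    # ...filter by session/chamber and build the Bill as in the example above
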
Example #7
    def scrape_bill_list(self, url):
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            (bill_id, ) = bill_info.xpath("td[1]/font/input/@value")
            (sponsor, ) = bill_info.xpath("td[2]/font/input/@value")
            (subject, ) = bill_info.xpath("td[3]//text()")
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if "B" in bill_id:
                bill_type = "bill"
            elif "JR" in bill_id:
                bill_type = "joint resolution"
            elif "R" in bill_id:
                bill_type = "resolution"
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title="",
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ("http://alisondb.legislature.state.al.us/Alison/"
                        "SESSBillStatusResult.aspx?BILL={}".format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning(
                    "Bill {} has no webpage, and will be skipped".format(
                        bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            title = ""
            if bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
                title = (bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]')
                         [0].text_content().strip())
            if not title:
                title = "[No title given by state]"
            bill.title = title
            session = "2021FS" if self.session == "2021s1" else self.session

            version_url_base = (
                "http://alisondb.legislature.state.al.us/ALISON/"
                "SearchableInstruments/{0}/PrintFiles/{1}-".format(
                    session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + "int.pdf"
                elif version == "Engrossed":
                    version_url = version_url_base + "eng.pdf"
                elif version == "Enrolled":
                    version_url = version_url_base + "enr.pdf"
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )

            # Fiscal notes exist, but I can't figure out how to build their URL
            fiscal_notes = bill_doc.xpath(
                '//table[@class="box_fiscalnote"]')[1:]
            for fiscal_note in fiscal_notes:
                pass

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath("td[1]")[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath("td[2]/font/text()")[0], self.DATE_FORMAT)
                bir_type = bir.xpath("td[1]/font/text()")[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type,
                    bir.xpath("td[3]/font/text()")[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification="other",
                )

                try:
                    (bir_vote_id, ) = bir.xpath("td[4]/font/input/@value")
                except ValueError:
                    bir_vote_id = ""

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text,
                    )

            actions = bill_doc.xpath(
                '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath("td[1]/font/text()")[0].encode(
                        "ascii", "ignore").strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath("td[1]/font/text()")[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath("td[2]/font/text()")

                possible_amendment = action.xpath("td[3]/font/u/text()")
                if (len(possible_amendment) > 0
                        and not possible_amendment[0].strip() == ""):
                    (amendment, ) = possible_amendment
                else:
                    amendment = None

                (action_text, ) = action.xpath("td[4]/font/text()")

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = (re.search(
                        r".*? referred to the .*? committee on (.*?)$",
                        action_text).group(1).strip())
                except AttributeError:
                    action_committee = ""

                if action_date is not None and action_text.strip():
                    act = bill.add_action(
                        action_text,
                        TIMEZONE.localize(action_date),
                        chamber=actor,
                        classification=action_type,
                    )
                    if action_committee:
                        act.add_related_entity(action_committee,
                                               entity_type="organization")

                    try:
                        vote_button = action.xpath("td[9]//text()")[0].strip()
                    except IndexError:
                        vote_button = ""

                    if vote_button.startswith("Roll "):
                        vote_id = vote_button.split(" ")[-1]

                        yield from self.scrape_vote(
                            bill=bill,
                            vote_chamber=action_chamber,
                            bill_id=bill_id,
                            vote_id=vote_id,
                            vote_date=TIMEZONE.localize(action_date),
                            action_text=action_text,
                        )

                if amendment:
                    session = "2021FS" if self.session == "2021s1" else self.session
                    amend_url = (
                        "http://alisondb.legislature.state.al.us/ALISON/"
                        "SearchableInstruments/{0}/PrintFiles/{1}.pdf".format(
                            session, amendment))

                    amend_name = "Amd/Sub {}".format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            yield bill
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Create the Bill, add the information obtained from the provided
        bill_detail_url, and then yield the bill object.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if "INVALID BILL NUMBER" in page:
            self.warning("INVALID BILL %s" % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        bill_type = bill_div.xpath("span/text()")[0]

        if "General Bill" in bill_type:
            bill_type = "bill"
        elif "Concurrent Resolution" in bill_type:
            bill_type = "concurrent resolution"
        elif "Joint Resolution" in bill_type:
            bill_type = "joint resolution"
        elif "Resolution" in bill_type:
            bill_type = "resolution"
        else:
            raise ValueError("unknown bill type: %s" % bill_type)

        # this is fragile, but less fragile than it was
        b = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = b.getnext().tail.strip()

        bill = Bill(
            bill_id,
            legislative_session=session,  # session name from metadata's `legislative_sessions`
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type,
        )

        subjects = list(self._subjects[bill_id])

        for subject in subjects:
            bill.add_subject(subject)

        # sponsors
        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                primary=True,
                entity_type="person",
            )
        for sponsor in doc.xpath(
                '//a[contains(@href, "committee.php")]/text()'):
            sponsor = sponsor.replace(u"\xa0", " ").strip()
            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                primary=True,
                entity_type="organization",
            )

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_html = self.get(version_url).text
        version_doc = lxml.html.fromstring(version_html)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance

            bill.add_version_link(
                note=version.text,  # Description of the version from the state;
                #   e.g. 'As introduced', 'Amended', etc.
                url=version.get("href"),
                on_duplicate="ignore",
                media_type="text/html",  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath("table/tr"):
            date_td, chamber_td, action_td = row.xpath("td")

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {
                "Senate": "upper",
                "House": "lower",
                None: "legislature"
            }[chamber_td.text]

            action = action_td.text_content()
            action = action.split("(House Journal")[0]
            action = action.split("(Senate Journal")[0].strip()

            atype = action_type(action)

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime("%Y-%m-%d"),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=atype,  # Options explained in the next section
            )

        # votes
        vurl = doc.xpath('//a[text()="View Vote History"]/@href')
        if vurl:
            vurl = vurl[0]
            yield from self.scrape_vote_history(bill, vurl)

        bill.add_source(bill_detail_url)
        yield bill
Example #9
    def scrape(self, session=None):
        if not session:
            session = self.jurisdiction.legislative_sessions[-1]["identifier"]
            self.info("no session specified, using %s", session)
        chamber_types = {
            "H": "lower",
            "S": "upper",
            "G": "executive",
            "C": "legislature",
        }
        session_id = SESSION_SITE_IDS[session]
        self._url_base += session_id + "/"
        bill_url_base = "https://lis.virginia.gov/cgi-bin/"

        self.load_members()
        self.load_sponsors()
        self.load_amendments()
        self.load_history()
        self.load_summaries()
        self.load_votes()
        self.load_bills()

        for bill in self._bills:
            bill = self._bills[bill][0]

            bill_id = bill["bill_id"]
            chamber = chamber_types[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            b = Bill(
                bill_id,
                session,
                bill["bill_description"],
                chamber=chamber,
                classification=bill_type,
            )
            bill_url = bill_url_base + f"legp604.exe?{session_id}+sum+{bill_id}"
            b.add_source(bill_url)

            # Long Bill ID needs to have 6 characters to work with vote urls, sponsors, and summaries.
            # Fill in blanks with 0s
            long_bill_id = bill_id
            if len(bill_id) == 3:
                long_bill_id = bill_id[0:2] + "000" + bill_id[-1]
            elif len(bill_id) == 4:
                long_bill_id = bill_id[0:2] + "00" + bill_id[-2:]
            elif len(bill_id) == 5:
                long_bill_id = bill_id[0:2] + "0" + bill_id[-3:]

            # Sponsors
            for spon in self._sponsors[long_bill_id]:
                sponsor_type = spon["patron_type"]
                if sponsor_type.endswith("Chief Patron"):
                    sponsor_type = "primary"
                else:
                    sponsor_type = "cosponsor"
                b.add_sponsorship(
                    spon["member_name"],
                    classification=sponsor_type,
                    entity_type="person",
                    primary=sponsor_type == "primary",
                )

            # Summary
            summary_texts = self._summaries[long_bill_id]
            for sum_text in summary_texts:
                b.add_abstract(sum_text["summary_text"],
                               sum_text["summary_type"])

            # Amendment docs
            amendments = self._amendments[bill_id]
            for amend in amendments:
                doc_link = (
                    bill_url_base +
                    f"legp604.exe?{session_id}+amd+{amend['txt_docid']}")
                b.add_document_link("Amendment: " + amend["txt_docid"],
                                    doc_link,
                                    media_type="text/html")

            # Action text is used to improve version text
            actions_text = []
            # History and then votes
            for hist in self._history[bill_id]:
                action = hist["history_description"]
                action_date = hist["history_date"]
                date = datetime.datetime.strptime(action_date,
                                                  "%m/%d/%y").date()
                chamber = chamber_types[action[0]]
                vote_id = hist["history_refid"]
                cleaned_action = action[2:]
                actions_text.append(cleaned_action)

                # categorize actions
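                # for/else: the loop breaks on the first matching pattern;
                # the else branch runs only if no pattern matched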
                for pattern, atype in ACTION_CLASSIFIERS:
                    if re.match(pattern, cleaned_action):
                        break
                else:
                    atype = None

                if atype != SKIP:
                    b.add_action(cleaned_action,
                                 date,
                                 chamber=chamber,
                                 classification=atype)

                if len(vote_id) > 0:
                    total_yes = 0
                    total_no = 0
                    total_not_voting = 0
                    total_abstain = 0
                    for v in self._votes[vote_id]:
                        if v["vote_result"] == "yes":
                            total_yes += 1
                        elif v["vote_result"] == "no":
                            total_no += 1
                        elif v["vote_result"] == "not voting":
                            total_not_voting += 1
                        elif v["vote_result"] == "abstain":
                            total_abstain += 1
                    vote = VoteEvent(
                        identifier=vote_id,
                        start_date=date,
                        chamber=chamber,
                        motion_text=cleaned_action,
                        result="pass" if total_yes > total_no else "fail",
                        classification="passage",
                        bill=b,
                    )
                    vote.set_count("yes", total_yes)
                    vote.set_count("no", total_no)
                    vote.set_count("not voting", total_not_voting)
                    vote.set_count("abstain", total_abstain)

                    vote_url = (
                        bill_url_base +
                        f"legp604.exe?{session_id}+vot+{vote_id}+{long_bill_id}"
                    )
                    vote.add_source(vote_url)
                    for v in self._votes[vote_id]:
                        vote.vote(v["vote_result"], v["member_id"])
                    yield vote

            # Versions
            for version in bill["text_docs"]:
                # Skip blank doc_abbr values, since not every bill has multiple versions
                if len(version["doc_abbr"]) > 0:
                    version_url = (
                        bill_url_base +
                        f"legp604.exe?{session_id}+ful+{version['doc_abbr']}")
                    version_date = datetime.datetime.strptime(
                        version["doc_date"], "%m/%d/%y").date()
                    version_text = version["doc_abbr"]
                    for act in actions_text:
                        if version_text in act:
                            version_text = act
                    b.add_version_link(
                        version_text,
                        version_url,
                        date=version_date,
                        media_type="text/html",
                        on_duplicate="ignore",
                    )

            yield b
Example #10
def test_set_bill_obj():
    ve = toy_vote_event()
    b = Bill("HB 1", legislative_session="2009", title="fake bill")
    ve.set_bill(b)
    assert ve.bill == b._id
    def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
        page = self.lxmlize(bill_url)
        author = self.get_one_xpath(page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

        def sbp(x):
            return self.scrape_bare_page(
                page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib["href"]
            )
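        # sbp("Authors") follows the page link whose text contains the given
        # label and scrapes that bare page (the result is iterated below)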

        authors = [x.text for x in sbp("Authors")]

        try:
            digests = sbp("Digests")
        except IndexError:
            digests = []

        try:
            versions = sbp("Text")
        except IndexError:
            versions = []

        try:
            amendments = sbp("Amendments")
        except IndexError:
            amendments = []

        title = page.xpath("//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
        title = title.replace("\u00a0\u00a0", " ")
        actions = page.xpath(
            "//div[@id='ctl00_PageBody_PanelBillInfo']/"
            "/table[@style='font-size:small']/tr"
        )

        bill_id = page.xpath("//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

        bill_type = self._bill_types[bill_abbreviation[1:]]
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(bill_url)

        authors.remove(author)
        bill.add_sponsorship(
            author, classification="primary", entity_type="person", primary=True
        )
        for author in authors:
            bill.add_sponsorship(
                author, classification="cosponsor", entity_type="person", primary=False
            )

        for digest in digests:
            bill.add_document_link(
                note=digest.text,
                url=digest.attrib["href"],
                media_type="application/pdf",
            )

        for version in versions:
            bill.add_version_link(
                note=version.text,
                url=version.attrib["href"],
                media_type="application/pdf",
            )

        for amendment in amendments:
            if "href" in amendment.attrib:
                bill.add_version_link(
                    note=amendment.text,
                    url=amendment.attrib["href"],
                    media_type="application/pdf",
                )

        flags = {
            "prefiled": ["filing"],
            "referred to the committee": ["referral-committee"],
            "sent to the house": ["passage"],
            "ordered returned to the house": ["passage"],
            "ordered to the senate": ["passage"],
            "signed by the governor": ["executive-signature"],
            "sent to the governor": ["executive-receipt"],
            "becomes Act": ["became-law"],
            "vetoed by the governor": ["executive-veto"],
        }

        try:
            votes_link = page.xpath("//a[text() = 'Votes']")[0]
            yield from self.scrape_votes(bill, votes_link.attrib["href"])
        except IndexError:
            # Some bills don't have any votes
            pass

        for action in actions:
            date, chamber, page, text = [x.text for x in action.xpath(".//td")]
            session_year = self.jurisdiction.legislative_sessions[-1]["start_date"][0:4]
            # Session is April -> June. Prefiles look like they're in
            # January at earliest.
            date += "/{}".format(session_year)
            date = dt.datetime.strptime(date, "%m/%d/%Y")
            chamber = self._chambers[chamber]

            cat = []
            for flag in flags:
                if flag in text.lower():
                    cat += flags[flag]

            bill.add_action(
                description=text,
                date=date.strftime("%Y-%m-%d"),
                chamber=chamber,
                classification=cat,
            )

        yield bill
Example #12
def test_set_bill_obj_no_extra_args():
    ve = toy_vote_event()
    b = Bill("HB 1", legislative_session="2009", title="fake bill")
    with pytest.raises(ValueError):
        ve.set_bill(b, chamber="lower")
Example #13
    def scrape(self, session=None):
        self._bill_prefix_map = {
            "HB": {"type": "bill", "url_segment": "bills/house"},
            "HR": {"type": "resolution", "url_segment": "resolutions/house/simple"},
            "HCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "HC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "SB": {"type": "bill", "url_segment": "bills/senate"},
            "SR": {"type": "resolution", "url_segment": "resolutions/senate/simple"},
            "SCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
            "SC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
        }

        api_base_url = "https://api.iga.in.gov"

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # we've put up a proxy service at this link
        # using our api key for pdf document access.

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]
            disp_bill_id = b["displayName"]

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

            title = bill_json["description"]
            if title == "NoneNone":
                title = None
            # sometimes description is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = (
                "lower" if bill_json["originChamber"].lower() == "house" else "upper"
            )
            bill_type = self._bill_prefix_map[bill_prefix]["type"]
            bill = Bill(
                disp_bill_id,
                legislative_session=session,
                chamber=original_chamber,
                title=title,
                classification=bill_type,
            )

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors
            for s in bill_json["authors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="author")
            for s in bill_json["coauthors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
            for s in bill_json["sponsors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
            for s in bill_json["cosponsors"]:
                self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link

            try:
                actions = client.get(
                    "bill_actions", session=session, bill_id=bill_id.lower()
                )
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace("T", " ")
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]
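                # illustrative example: "2021-02-03T00:00:00" -> "2021-02-03 00:00:00" -> "2021-02-03"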

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if "second reading" in d or "reread second time" in d:
                    action_type.append("reading-2")
                    reading = True

                if "third reading" in d or "reread third time" in d:
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if ("referred" in d or "reassigned" in d) and "committee on" in d:
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    if "fail" or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if "vetoed by the governor" in d:
                    action_type.append("executive-veto")

                if len(action_type) == 0:
                    # calling it other and moving on with a warning
                    self.logger.warning(
                        "Could not recognize an action in '{}'".format(action_desc)
                    )
                    action_type = None

                a = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )
                if committee:
                    a.add_related_entity(committee, entity_type="organization")

            # subjects
            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
            for subject in subjects:
                bill.add_subject(subject)

            # Abstract
            if bill_json["latestVersion"]["digest"]:
                bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")

            # put this behind a flag 2021-03-18 (openstates/issues#291)
            if not SCRAPE_WEB_VERSIONS:
                # votes
                yield from self._process_votes(
                    bill_json["latestVersion"]["rollcalls"],
                    disp_bill_id,
                    original_chamber,
                    session,
                )
                # versions
                self.deal_with_version(
                    bill_json["latestVersion"], bill, bill_id, original_chamber, session
                )
                for version in bill_json["versions"][::-1]:
                    self.deal_with_version(
                        version,
                        bill,
                        bill_id,
                        original_chamber,
                        session,
                    )
            else:
                self.scrape_web_versions(session, bill, bill_id)

            yield bill
Example #14
    def old_scrape(self, session=None):
        status_report_url = (
            "https://www.legislature.ohio.gov/legislation/status-reports")

        # ssl verification off due to Ohio not correctly implementing SSL
        if not session:
            session = self.latest_session()
            self.info("no session, using %s", session)

        doc = self.get(status_report_url).text
        doc = lxml.html.fromstring(doc)
        doc.make_links_absolute(status_report_url)
        xpath = "//div[contains(text(),'{}')]/following-sibling::table"
        status_table = doc.xpath(xpath.format(session))[0]
        status_links = status_table.xpath(
            ".//a[contains(text(),'Excel')]/@href")

        for url in status_links:

            try:
                fname, resp = self.urlretrieve(url)
            except scrapelib.HTTPError as report:
                self.logger.warning("Missing report {}".format(report))
                continue

            sh = xlrd.open_workbook(fname).sheet_by_index(0)

            # once workbook is open, we can remove tempfile
            os.remove(fname)
            for rownum in range(1, sh.nrows):
                bill_id = sh.cell(rownum, 0).value

                bill_type = "resolution" if "R" in bill_id else "bill"
                chamber = "lower" if "H" in bill_id else "upper"

                bill_title = str(sh.cell(rownum, 3).value)

                bill = Bill(
                    bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=bill_title,
                    classification=bill_type,
                )
                bill.add_source(url)
                bill.add_sponsor("primary", str(sh.cell(rownum, 1).value))

                # add cosponsor
                if sh.cell(rownum, 2).value:
                    bill.add_sponsor("cosponsor",
                                     str(sh.cell(rownum, 2).value))

                actor = ""

                # Actions start column after bill title
                for colnum in range(4, sh.ncols - 1):
                    action = str(sh.cell(0, colnum).value)
                    cell = sh.cell(rownum, colnum)
                    date = cell.value

                    if len(action) != 0:
                        if action.split()[0] == "House":
                            actor = "lower"
                        elif action.split()[0] == "Senate":
                            actor = "upper"
                        elif action.split()[-1] == "Governor":
                            actor = "executive"
                        elif action.split()[0] == "Gov.":
                            actor = "executive"
                        elif action.split()[-1] == "Gov.":
                            actor = "executive"

                    if action in ("House Intro. Date", "Senate Intro. Date"):
                        atype = ["bill:introduced"]
                        action = action.replace("Intro. Date", "Introduced")
                    elif action == "3rd Consideration":
                        atype = ["bill:reading:3", "bill:passed"]
                    elif action == "Sent to Gov.":
                        atype = ["governor:received"]
                    elif action == "Signed By Governor":
                        atype = ["governor:signed"]
                    else:
                        atype = ["other"]

                    if type(date) == float:
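                        # xlrd returns Excel serial dates as floats; illustrative example:
                        # 43831.0 -> (2020, 1, 1, 0, 0, 0) -> "2020-01-01"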
                        date = str(xlrd.xldate_as_tuple(date, 0))
                        date = datetime.datetime.strptime(
                            date, "(%Y, %m, %d, %H, %M, %S)")
                        date = self._tz.localize(date)
                        date = "{:%Y-%m-%d}".format(date)
                        bill.add_action(actor, action, date, type=atype)

                for idx, char in enumerate(bill_id):
                    try:
                        int(char)
                    except ValueError:
                        continue

                    underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                    break
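                # illustrative example: bill_id "HB123" -> underscore_bill "HB_123"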

                yield from self.scrape_votes_old(bill, underscore_bill,
                                                 session)
                self.scrape_versions_old(bill, underscore_bill, session)
                yield bill
Example #15
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)
        page.make_links_absolute(self.BASE_URL)

        if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()')[
                0
            ].strip()
        elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
        else:
            self.warning("No bill id for {}".format(url))
            return
        title = page.xpath(
            '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
        )[0].strip()

        if "B" in bill_id:
            _type = ["bill"]
        elif "J" in bill_id:
            _type = ["joint resolution"]
        elif "HS" in bill_id or "SS" in bill_id:
            _type = ["resolution"]
        else:
            raise ValueError("unknown bill type " + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=_type,
        )
        bill.add_source(url)

        self.scrape_bill_subjects(bill, page)
        self.scrape_bill_sponsors(bill, page)
        self.scrape_bill_actions(bill, page)

        # fiscal note
        if page.xpath('//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
            fiscal_note = page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
            )[0]
            fiscal_url = fiscal_note.get("href")
            fiscal_title = fiscal_note.text_content()
            bill.add_document_link(
                fiscal_title, fiscal_url, media_type="application/pdf"
            )

        # effective date, where available
        if page.xpath('//div[contains(text(), "Effective Date(s)")]'):
            eff_date = page.xpath(
                '//div[contains(text(), "Effective Date(s)")]/text()'
            )[0].strip()
            eff_date = eff_date.replace("Effective Date(s):", "").strip()
            # this can contain multiple dates, eg "July 1, 2020, July 1, 2022"
            bill.extras["date_effective"] = eff_date

        # yield from self.parse_bill_votes_new(doc, bill)
        yield bill
    def scrape_bill(
        self,
        session,
        chamber,
        bill_id,
        title,
        url,
        strip_sponsors=re.compile(r"\s*\(.{,50}\)\s*").sub,
    ):
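        # strip_sponsors removes a parenthetical note from a name; illustrative
        # example: strip_sponsors("", "Smith (Mr. Speaker)") -> "Smith"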

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill.subject = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version_link(**version)

        self.scrape_amendments(page, bill)

        # Resolution pages have different html.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath("td/strong/text()")
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, "").strip()
            values[heading] = value

        # summary was always same as title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors("", values.get("LEAD SPONSOR:", ""))
        if primary:
            bill.add_sponsorship(
                name=primary,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # Add cosponsors.
        if values.get("SPONSORS:"):
            sponsors = strip_sponsors("", values["SPONSORS:"])
            sponsors = re.split(r", (?![A-Z]\.)", sponsors)
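            # split on commas that are not followed by an initial; illustrative
            # example: "Smith, Jones, D. Hall" -> ["Smith", "Jones, D. Hall"]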
            for name in sponsors:
                name = name.strip(", \n\r")
                if name:
                    # Fix name splitting bug where "Neale, D. Hall"
                    match = re.search(r"(.+?), ([DM]\. Hall)", name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsorship(
                                name=name,
                                classification="cosponsor",
                                entity_type="person",
                                primary=False,
                            )
                    else:
                        bill.add_sponsorship(
                            name=name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=False,
                        )

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            yield from self.scrape_house_vote(bill, link.attrib["href"])

        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath("td")
            if len(tds) < 3:
                continue

            chamber_letter = tds[0].text_content()
            chamber = {"S": "upper", "H": "lower"}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith("passed senate"):
                for href in tds[1].xpath("a/@href"):
                    yield from self.scrape_senate_vote(bill, href, date)

            attrs = dict(chamber=chamber,
                         description=action,
                         date=date.strftime("%Y-%m-%d"))
            temp = self.categorizer.categorize(action)
            related_entities = []
            for key, values in temp.items():
                if key != "classification":
                    for value in values:
                        related_entities.append({"type": key, "name": value})
            attrs.update(classification=temp["classification"],
                         related_entities=related_entities)
            bill.add_action(**attrs)

        yield bill
    def scrape(self, session=None):
        HTML_TAGS_RE = r"<.*?>"

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)["data"] or []

        bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)["data"] or [])

        resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
            year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)["data"] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info["BillNumber"].startswith("J.R.H."):
                bill_type = "joint resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("J.R.S."):
                bill_type = "joint resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.R."):
                bill_type = "resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.R."):
                bill_type = "resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("PR."):
                bill_type = "constitutional amendment"
                if info["Body"] == "H":
                    bill_chamber = "lower"
                elif info["Body"] == "S":
                    bill_chamber = "upper"
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info["BillNumber"].startswith("H."):
                bill_type = "bill"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S."):
                bill_type = "bill"
                bill_chamber = "upper"

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info["BillNumber"]))

            bill_id = info["BillNumber"].replace(".", "").replace(" ", "")
            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)
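            # illustrative example: "H.C.R. 123" -> "HCR123" -> "HCR 123"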

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info["Title"],
                classification=bill_type,
            )
            if "resolution" in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
                year_slug, info["BillNumber"])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                "following-sibling::dd[1]/ul/li")
            sponsor_type = "primary"
            for sponsor in sponsors:
                if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                    sponsor_type = "cosponsor"
                    continue

                sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                    "Rep.", "").replace("Sen.", "").strip())
                if sponsor_name and not (sponsor_name[:5] == "Less"
                                         and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type="person",
                        primary=(sponsor_type == "primary"),
                    )

            # Capture bill text versions
            # Warning: there's a TODO in VT's source code saying 'move this to
            # where it used to be', so check both the old and the new locations
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                "following-sibling::dd[1]/ul/li/a |"
                '//ul[@class="bill-path"]//a')

            for version in versions:
                if version.xpath("text()"):
                    bill.add_version_link(
                        note=version.xpath("text()")[0],
                        url=version.xpath("@href")[0].replace(" ", "%20"),
                        media_type="application/pdf",
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode("utf-8"),
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(
                    info["BillNumber"]))
                yield bill
                continue

            # Capture actions
            actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
                year_slug, internal_bill_id)
            actions_json = self.get(actions_url)

            # Checks if page actually has json posted
            if "json" in actions_json.headers.get("Content-Type"):
                actions = json.loads(actions_json.text)["data"]
                # Checks to see if any data is actually there
                if actions == "":
                    continue
            else:
                continue
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action["FullStatus"]:
                    actor = "executive"
                elif action["ChamberCode"] == "H":
                    actor = "lower"
                elif action["ChamberCode"] == "S":
                    actor = "upper"
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action["FullStatus"]:
                    # assert chambers_passed == set("HS")
                    action_type = "executive-signature"
                elif "Vetoed by the Governor" in action["FullStatus"]:
                    action_type = "executive-veto"
                elif ("Read first time" in action["FullStatus"]
                      or "Read 1st time" in action["FullStatus"]):
                    action_type = "introduction"
                elif "Reported favorably" in action["FullStatus"]:
                    action_type = "committee-passage-favorable"
                elif actor == "lower" and any(
                        x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("H")
                elif actor == "upper" and any(
                        x.lower().startswith(" aspassed")
                        or x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("S")
                else:
                    action_type = None

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.511
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0209", "/2019")

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                    date=datetime.datetime.strftime(
                        datetime.datetime.strptime(action["StatusDate"],
                                                   "%m/%d/%Y"),
                        "%Y-%m-%d",
                    ),
                    chamber=actor,
                    classification=action_type,
                )

            # Capture votes
            votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)["data"]
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote["VoteHeaderID"]
                roll_call_url = ("http://legislature.vermont.gov/bill/"
                                 "loadBillRollCallDetails/{0}/{1}".format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)["data"]

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name,
                     _district) = member["MemberName"].split(" of ")
                    member_name = member_name.strip()

                    if member["MemberVote"] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member["MemberVote"] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote["FullStatus"]
                        # seems like we've seen both
                        or "Governor overridden" in vote["FullStatus"] or
                        "Governor overriden" in vote["FullStatus"]):
                    did_pass = True
                elif ("Failed -- " in vote["FullStatus"] or
                      "Veto of the Governor sustained" in vote["FullStatus"]):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear: " +
                                         vote["FullStatus"])

                # Check vote counts
                yea_count = int(
                    re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
                nay_count = int(
                    re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

                vote_start_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                motion_text = re.sub(HTML_TAGS_RE, "",
                                     vote["FullStatus"]).strip()
                vote_identifier = (vote["StatusDate"] + "--" + motion_text +
                                   "--" + roll_call_url)
                vote_to_add = VoteEvent(
                    identifier=vote_identifier,
                    bill=bill,
                    chamber=("lower"
                             if vote["ChamberCode"] == "H" else "upper"),
                    start_date=vote_start_date,
                    motion_text=motion_text,
                    result="pass" if did_pass else "fail",
                    classification="passage",
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count("yes", yea_count)
                vote_to_add.set_count("no", nay_count)
                vote_to_add.set_count("not voting", len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote("not voting", member)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Example #18
    def scrape_chamber(self, chamber, session):
        chamber_letter = "S" if chamber == "upper" else "H"
        bill_type_map = {
            "B": "bill",
            "CR": "concurrent resolution",
            "JM": "joint memorial",
            "JR": "joint resolution",
            "M": "memorial",
            "R": "resolution",
        }

        # used for faking sources
        session_year = session[2:]

        self._init_mdb(session)

        # read in sponsor & subject mappings
        sponsor_map = {}
        for sponsor in self.access_to_csv("tblSponsors"):
            sponsor_map[sponsor["SponsorCode"]] = sponsor["FullName"]

        # McSorley resigned so they removed him from the API
        # but he is still attached to some bills
        # Gonzales switched from being in the House to the Senate
        # but was still showing as a sponsor
        sponsor_map["SMCSO"] = "Cisco McSorley"
        sponsor_map["SGONZ"] = "Roberto J. Gonzales"

        subject_map = {}
        for subject in self.access_to_csv("TblSubjects"):
            subject_map[subject["SubjectCode"]] = subject["Subject"]

        # get all bills into this dict, fill in action/docs before saving
        bills = {}
        for data in [
                row for row in self.access_to_csv("Legislation")
                if row["BillID"].startswith(chamber_letter)
        ]:
            # use their BillID for the key but build our own for storage
            bill_key = data["BillID"].replace(" ", "")

            # remove spaces for consistency
            bill_id = "{}{}{}".format(data["Chamber"], data["LegType"],
                                      data["LegNo"]).replace(" ", "")
            bill_type = bill_type_map[data["LegType"]]
            bills[bill_key] = bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=data["Title"],
                classification=bill_type,
            )

            # fake a source
            data["SessionYear"] = session_year
            data.update({
                x: data[x].strip()
                for x in ["Chamber", "LegType", "LegNo", "SessionYear"]
            })

            bill.add_source(
                "http://www.nmlegis.gov/Legislation/Legislation?chamber="
                "{Chamber}&legType={LegType}&legNo={LegNo}"
                "&year={SessionYear}".format(**data))
            bill.add_sponsorship(
                sponsor_map[data["SponsorCode"]],
                classification="primary",
                entity_type="person",
                primary=True,
            )
            for sponsor_code in [
                    "SponsorCode2",
                    "SponsorCode3",
                    "SponsorCode4",
                    "SponsorCode5",
            ]:
                if data[sponsor_code] and data[sponsor_code] not in ("NONE",
                                                                     "X", ""):
                    bill.add_sponsorship(
                        sponsor_map[data[sponsor_code]],
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

            # maybe use data['emergency'] data['passed'] data['signed'] as well
            for subject_code in [
                    "SubjectCode1", "SubjectCode2", "SubjectCode3"
            ]:
                if data[subject_code]:
                    bill.add_subject(subject_map[data[subject_code]])

        # bills and actions come from other tables
        self.scrape_actions(chamber_letter, bills)
        self.scrape_documents(session, "bills", chamber, bills)
        self.scrape_documents(session, "resolutions", chamber, bills)
        self.scrape_documents(session, "memorials", chamber, bills)
        self.check_other_documents(session, chamber, bills)

        yield from bills.values()
Example #19
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[:4],
            bill_id.replace(" ", "-"),
        )
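        # assuming a session identifier like "2021-2022", this builds e.g.
        # http://legislature.mi.gov/doc.aspx?2021-HB-4001 for bill_id "HB 4001"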
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if (
            "Page Not Found" in html
            or "The bill you are looking for is not available yet" in html
        ):
            url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
                session[-4:],
                bill_id.replace(" ", "-"),
            )
            html = self.get(url).text
            if (
                "Page Not Found" in html
                or "The bill you are looking for is not available yet" in html
            ):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute("http://legislature.mi.gov")

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[
            0
        ].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(" ")[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u"\xa0", " ")
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = (
                    "primary"
                    if sponsor.tail and "primary" in sponsor.tail
                    else "cosponsor"
                )
            else:
                classification = "primary"
            bill.add_sponsorship(
                name=name.strip(),
                chamber=chamber,
                entity_type="person",
                primary=classification == "primary",
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath("td")  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            try:
                date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
            except ValueError:
                self.warning(
                    "{} has action with invalid date. Skipping Action".format(bill_id)
                )
                continue
            # instead of trusting upper/lower case, use journal for actor
            actor = "upper" if "SJ" in journal else "lower"
            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(
                r"WITH SUBSTITUTE\s+([\w\-\d]+)", action, re.IGNORECASE
            )
            if submatch and tds[2].xpath("a"):
                version_url = tds[2].xpath("a/@href")[0]
                version_name = tds[2].xpath("a/text()")[0].strip()
                version_name = "Substitute {}".format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith(".pdf"):
                    mimetype = "application/pdf"
                elif version_url.lower().endswith(".htm"):
                    mimetype = "text/html"
                else:
                    # fall back to a generic type so mimetype is always bound
                    mimetype = "application/octet-stream"
                bill.add_version_link(version_name, version_url, media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath("a/@href")
                if journal_link:
                    objectname = journal_link[0].rsplit("=", 1)[-1]
                    chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                    vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                        session,
                        chamber_name,
                        objectname,
                    )
                    results = self.parse_roll_call(vote_url, rc_num, session)

                    if results is not None:
                        vote_passed = len(results["yes"]) > len(results["no"])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result="pass" if vote_passed else "fail",
                            classification="passage",
                        )

                        # check the expected counts vs actual
                        count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["yes"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d"
                                % (bill_id, action, count, len(results["yes"]))
                            )
                        count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["no"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d"
                                % (bill_id, action, count, len(results["no"]))
                            )

                        vote.set_count("yes", len(results["yes"]))
                        vote.set_count("no", len(results["no"]))
                        vote.set_count("other", len(results["other"]))
                        possible_vote_results = ["yes", "no", "other"]
                        for pvr in possible_vote_results:
                            for name in results[pvr]:
                                if session == "2017-2018":
                                    names = name.split("\t")
                                    for n in names:
                                        vote.vote(pvr, n.strip())
                                else:
                                    # Prevents voter names like "House Bill No. 4451, entitled" and other sentences
                                    if len(name.split()) < 5:
                                        vote.vote(pvr, name.strip())
                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" % (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith(".pdf"):
                    mimetype = "application/pdf"
                elif url.endswith(".htm"):
                    mimetype = "text/html"
                else:
                    # fall back to a generic type so mimetype is always bound
                    mimetype = "application/octet-stream"
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
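
The Michigan scraper above picks a media type from the file extension in two places; here is a minimal standalone sketch of that mapping, with an assumed generic fallback for other extensions (the fallback is not part of the original code):

    def guess_media_type(url):
        """Map a document URL to a MIME type by extension (fallback is assumed)."""
        lowered = url.lower()
        if lowered.endswith(".pdf"):
            return "application/pdf"
        if lowered.endswith((".htm", ".html")):
            return "text/html"
        return "application/octet-stream"

    print(guess_media_type("http://legislature.mi.gov/documents/bill.pdf"))  # application/pdf
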
Example #20
    def scrape_bill(self, row, session):
        bill_id = row["LegislationDisplayCode"]

        amendment = None
        substitute = None

        if bill_id.count(" ") > 1:
            if " w/ " in bill_id:
                self.info("Found amended bill `{}`".format(bill_id))
                bill_id, amendment = bill_id.split(" w/ ")
            if " -" in bill_id:
                self.info("Found amended bill `{}`".format(bill_id))
                bill_id, amendment = bill_id.split(" -")
            # A bill can _both_ be amended and be substituted
            if " for " in bill_id:
                self.info(
                    "Found substitute to use instead: `{}`".format(bill_id))
                substitute, bill_id = bill_id.split(" for ")
            if amendment is None and substitute is None:
                raise ValueError("unknown bill_id format: " + bill_id)

        bill_type = self.classify_bill(bill_id)
        chamber = "upper" if bill_id.startswith("S") else "lower"

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=chamber,
            title=row["LongTitle"],
            classification=bill_type,
        )
        if row["Synopsis"]:
            bill.add_abstract(row["Synopsis"], "synopsis")
        if row["ShortTitle"]:
            bill.add_title(row["ShortTitle"], "short title")
        if row["SponsorPersonId"]:
            self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"],
                                              "primary")
        if substitute:
            bill.extras["substitute"] = substitute
        if amendment:
            bill.extras["amendment"] = amendment

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
            row["LegislationId"])
        bill.add_source(html_url, note="text/html")

        html = self.lxmlize(html_url)

        additional_sponsors = html.xpath(
            '//label[text()="Additional Sponsor(s):"]'
            "/following-sibling::div/a/@href")
        for sponsor_url in additional_sponsors:
            sponsor_id = sponsor_url.replace(
                "https://legis.delaware.gov/LegislatorDetail?"
                "personId=", "")
            self.add_sponsor_by_legislator_id(bill, sponsor_id, "primary")

        cosponsors = html.xpath('//label[text()="Co-Sponsor(s):"]/'
                                "following-sibling::div/a/@href")
        for sponsor_url in cosponsors:
            sponsor_id = sponsor_url.replace(
                "https://legis.delaware.gov/LegislatorDetail?"
                "personId=", "")
            self.add_sponsor_by_legislator_id(bill, sponsor_id, "cosponsor")

        versions = html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            version_name = "Bill Text"
            bill.add_version_link(version_name,
                                  version_url,
                                  media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row["LegislationId"])

        if row["HasAmendments"] is True:
            self.scrape_amendments(bill, row["LegislationId"])

        yield from self.scrape_votes(bill, row["LegislationId"], session)

        yield bill
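
A hedged walk-through of how a Delaware-style display code decomposes under the splits above; the sample string is invented and only meant to show the order of the " w/ " and " for " splits:

    bill_id = "SS 1 for SB 50 w/ SA 2"
    amendment = substitute = None
    if " w/ " in bill_id:
        bill_id, amendment = bill_id.split(" w/ ")
    if " for " in bill_id:
        substitute, bill_id = bill_id.split(" for ")
    print(bill_id, amendment, substitute)  # SB 50 SA 2 SS 1
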
Example #21
    def scrape_chamber(self, chamber, session):

        # Pull the session metadata so we can get the
        # slug for the API Request
        meta = next(each for each in self.jurisdiction.legislative_sessions
                    if each["identifier"] == session)
        if meta["classification"] == "special":
            list_slug = self.special_slugs[session]
        else:
            list_slug = "li"

        list_url = "http://www.kslegislature.org/{}" "/api/v11/rev-1/bill_status"
        list_url = list_url.format(list_slug)

        chamber_name = "Senate" if chamber == "upper" else "House"
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(list_url).text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]

        # there are duplicates
        seen_ids = set()

        for bill_data in bills:

            bill_id = bill_data["BILLNO"]

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue
            # filter duplicates
            if bill_id in seen_ids:
                continue

            seen_ids.add(bill_id)

            if "CR" in bill_id:
                btype = "concurrent resolution"
            elif "R" in bill_id:
                btype = "resolution"
            elif "B" in bill_id:
                btype = "bill"

            title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

            # main
            bill = Bill(bill_id,
                        session,
                        title,
                        chamber=chamber,
                        classification=btype)
            bill.extras = {"status": bill_data["STATUS"]}

            bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())

            if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill.title:
                bill.add_title(bill_data["LONGTITLE"])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data["ORIGINAL_SPONSOR"]:
                primary_sponsor = self.clean_sponsor_name(primary_sponsor)
                bill.add_sponsorship(
                    name=primary_sponsor,
                    entity_type="organization"
                    if "committee" in primary_sponsor.lower() else "person",
                    primary=True,
                    classification="original sponsor",
                )
            for sponsor in bill_data["SPONSOR_NAMES"]:
                if sponsor in bill_data["ORIGINAL_SPONSOR"]:
                    continue
                sponsor = self.clean_sponsor_name(sponsor)
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="organization"
                    if "committee" in sponsor.lower() else "person",
                    primary=False,
                    classification="cosponsor",
                )

            # history is backwards
            for event in reversed(bill_data["HISTORY"]):
                actor = "upper" if event["chamber"] == "Senate" else "lower"

                date = event["session_date"]
                # append committee names if present
                if "committee_names" in event:
                    action = (event["status"] + " " +
                              " and ".join(event["committee_names"]))
                else:
                    action = event["status"]

                if event["action_code"] not in ksapi.action_codes:
                    self.warning(
                        "unknown action code on %s: %s %s" %
                        (bill_id, event["action_code"], event["status"]))
                    atype = None
                else:
                    atype = ksapi.action_codes[event["action_code"]]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions']`,
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            yield from self.scrape_html(bill, session)

            yield bill
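
The Kansas sponsor loops above infer the entity type from the sponsor name; a tiny sketch of that rule in isolation (the sample names are made up):

    def entity_type_for(name):
        # committees are recorded as organizations, everything else as a person
        return "organization" if "committee" in name.lower() else "person"

    print(entity_type_for("Committee on Ways and Means"))  # organization
    print(entity_type_for("Jane Doe"))                     # person
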
Example #22
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        hist_url = (
            f"https://www.legis.iowa.gov/legislation/billTracking/"
            f"billHistory?billName={bill_id}&ga={session_id}"
        )
        req_session = requests.Session()
        req = req_session.get(hist_url)
        if req.status_code == 500:
            self.warning("500 error on {}, skipping".format(hist_url))
            return

        page = lxml.html.fromstring(req.text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath(
            'string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[2])'
        ).strip()

        if title == "":
            # Sometimes the title is moved, see
            # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
            title = page.xpath(
                'string(//div[@id="content"]/div[@class=' '"divideVert"]/div[4]/div[2])'
            ).strip()
            if title == "":
                self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
                return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if "HR" in bill_id or "SR" in bill_id:
            bill_type = ["resolution"]
        elif "HJR" in bill_id or "SJR" in bill_id:
            bill_type = ["joint resolution"]
        elif "HCR" in bill_id or "SCR" in bill_id:
            bill_type = ["concurrent resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = (
            "https://www.legis.iowa.gov/docs/"
            "publications/LG{}/{}/attachments/{}.html"
        )
        version_pdf_url_template = (
            "https://www.legis.iowa.gov/docs/" "publications/LG{}/{}/{}.pdf"
        )

        # get pieces of version_link
        vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
        if vpieces:
            for version in vpieces:
                version_name = version.text
                version_abbrev = version.xpath("string(@value)")

                # Get HTML document of bill version.
                version_html_url = version_html_url_template.format(
                    version_abbrev.upper(), session_id, bill_id.replace(" ", "")
                )

                bill.add_version_link(
                    note=version_name, url=version_html_url, media_type="text/html"
                )

                # Get PDF document of bill version.
                version_pdf_url = version_pdf_url_template.format(
                    version_abbrev.upper(), session_id, bill_id.replace(" ", "")
                )

                if "Marked Up" in version_name:
                    version_pdf_url = sidebar.xpath(
                        "//iframe[@id='bbContextDoc']/@src"
                    )[0]

                bill.add_version_link(
                    note=version_name, url=version_pdf_url, media_type="application/pdf"
                )

        sponsors_str = page.xpath(
            'string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[1])'
        ).strip()

        if re.search("^By ", sponsors_str):
            sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
        # some bills list their sponsors in a different format
        else:
            sponsors = re.findall(
                r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)", sponsors_str
            )

        for sponsor in sponsors:
            sponsor = sponsor.replace(" and", "").strip(" .,")

            # a few sponsors get mangled by our regex
            sponsor = {
                "Means": "Ways & Means",
                "Iowa": "Economic Growth/Rebuild Iowa",
                "Safety": "Public Safety",
                "Resources": "Human Resources",
                "Affairs": "Veterans Affairs",
                "Protection": "Environmental Protection",
                "Government": "State Government",
                "Boef": "De Boef",
            }.get(sponsor, sponsor)

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        for tr in page.xpath(
            "//table[contains(@class, 'billActionTable')][1]/tbody/tr"
        ):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[3])").strip()
            action = re.sub(r"\s+", " ", action)

            # Capture any amendment links.
            links = [version["links"] for version in bill.versions]
            version_urls = [link["url"] for link in [i for sub in links for i in sub]]
            if "amendment" in action.lower():
                for anchor in tr.xpath(".//a[1]"):
                    if "-" in anchor.text:
                        # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                        amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                        amd_id = anchor.text.replace("-", "").strip()
                        amd_url = amd_pattern.format(session_id, amd_id)
                        amd_name = "Amendment {}".format(anchor.text.strip())

                        if amd_url not in version_urls:
                            bill.add_version_link(
                                note=amd_name, url=amd_url, media_type="application/pdf"
                            )
                            version_urls.append(amd_url)
                        else:
                            self.info("Already Added {}, skipping".format(amd_url))

            if "S.J." in action or "SCS" in action:
                actor = "upper"
            elif "H.J." in action or "HCS" in action:
                actor = "lower"
            else:
                actor = "legislature"

            action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

            if action.startswith("Introduced"):
                atype = ["introduction"]
                if ", referred to" in action:
                    atype.append("referral-committee")
            elif action.startswith("Read first time"):
                atype = "reading-1"
            elif action.startswith("Referred to"):
                atype = "referral-committee"
            elif action.startswith("Sent to Governor"):
                atype = "executive-receipt"
            elif action.startswith("Reported Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Vetoed by Governor"):
                atype = "executive-veto"
            elif action.startswith("Item veto"):
                atype = "executive-veto-line-item"
            elif re.match(r"Passed (House|Senate)", action):
                atype = "passage"
            elif re.match(r"Amendment (S|H)-\d+ filed", action):
                atype = ["amendment-introduction"]
                if ", adopted" in action:
                    atype.append("amendment-passage")
            elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted", action):
                atype = "amendment-passage"
            elif re.match(r"Amendment (S|H)-\d+ lost", action):
                atype = "amendment-failure"
            elif action.startswith("Resolution filed"):
                atype = "introduction"
            elif action.startswith("Resolution adopted"):
                atype = "passage"
            elif action.startswith("Committee report") and action.endswith("passage."):
                atype = "committee-passage"
            elif action.startswith("Withdrawn"):
                atype = "withdrawal"
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search(r"END OF \d+ ACTIONS", action):
                continue

            if "$history" not in action:
                bill.add_action(
                    description=action, date=date, chamber=actor, classification=atype
                )

        self.scrape_subjects(bill, bill_id, session, req_session)

        yield bill
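
The Iowa sponsor string usually reads like "By Smith, Jones and Brown"; a small demo of the primary parsing branch above (the names are invented, and only the "By " format is shown):

    import re

    sponsors_str = "By Smith, Jones and Brown"
    sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
    sponsors = [s.replace(" and", "").strip(" .,") for s in sponsors]
    print(sponsors)  # ['Smith', 'Jones', 'Brown']
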
Example #23
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta["Report Title"].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            b.add_abstract(meta["Description"], "description")
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
        companion = meta["Companion"].strip()
        if companion:
            b.add_related_bill(
                identifier=companion.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
        if bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        ):
            prior = bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
            )[-1]
            if "carried over" in prior.lower():
                b.add_related_bill(
                    identifier=bill_id.replace(u"\xa0", " "),
                    legislative_session=prior_session,
                    relation_type="companion",
                )
        for sponsor in meta["Introducer(s)"]:
            if "(Introduced by request of another party)" in sponsor:
                sponsor = sponsor.replace(
                    " (Introduced by request of another party)", "")
            if sponsor != "":
                b.add_sponsorship(sponsor, "primary", "person", True)

        if "gm" in bill_id.lower():
            b.add_sponsorship("governor", "primary", "person", True)

        self.parse_bill_versions_table(b, versions)
        self.parse_testimony(b, bill_page)
        self.parse_cmte_reports(b, bill_page)

        yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                                 session, url, chamber)
        yield b
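
The Hawaii scraper links carried-over bills to the previous year's session; a one-liner illustration of the label it builds (the session value is a hypothetical example):

    session = "2021"
    prior_session = "{} Regular Session".format(int(session[:4]) - 1)
    print(prior_session)  # 2020 Regular Session
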
Example #24
    def scrape_archive_bills(self, session):
        session_abr = session[0:2]
        url = f"https://www.ilga.gov/legislation/legisnet{session_abr}/{session_abr}gatoc.html"
        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        bill_numbers_sections = doc.xpath("//table//a/@href")

        # Contains multiple bills
        for bill_numbers_section_url in bill_numbers_sections:
            bill_section_html = self.get(bill_numbers_section_url).text
            bill_section_doc = lxml.html.fromstring(bill_section_html)
            bill_section_doc.make_links_absolute(bill_numbers_section_url)

            if "/sb" in bill_numbers_section_url or "/sr" in bill_numbers_section_url:
                chamber = "upper"
            else:
                chamber = "lower"

            bills_urls = bill_section_doc.xpath("//blockquote/a/@href")

            # Actual Bill Pages
            for bill_url in bills_urls:

                bill_html = self.get(bill_url).text
                bill_doc = lxml.html.fromstring(bill_html)
                bill_doc.make_links_absolute(bill_url)

                sponsors = bill_doc.xpath('//pre/a[contains(@href, "sponsor")]')

                bill_id = bill_doc.xpath('//font[contains (., "Status of")]')
                if len(bill_id) < 1:
                    bill_id = bill_doc.xpath('//font[contains (., "Summary of")]')
                bill_id = bill_id[0].text_content().split()[-1]

                if "JRCA" in bill_id:
                    classification = "constitutional amendment"
                elif "JR" in bill_id:
                    classification = "joint resolution"
                elif "R" in bill_id:
                    classification = "resolution"
                else:
                    classification = "bill"

                if "status" in bill_url:
                    # Currently on status page, but need info for summary page
                    summary_page_url = bill_doc.xpath(
                        '//a[contains (., "Bill Summary")]/@href'
                    )[0]
                    summary_page_html = self.get(summary_page_url).text
                    summary_page_doc = lxml.html.fromstring(summary_page_html)
                    summary_page_doc.make_links_absolute(summary_page_url)
                else:
                    # Currently on summary page, but need info for status page
                    summary_page_doc = bill_doc
                    summary_page_url = bill_url
                    bill_url = bill_doc.xpath('//a[contains (., "Bill Status")]/@href')[
                        0
                    ]
                    bill_html = self.get(bill_url).text
                    bill_doc = lxml.html.fromstring(bill_html)
                    bill_doc.make_links_absolute(bill_url)

                summary_text = (
                    summary_page_doc.xpath("//pre")[0].text_content().splitlines()
                )
                for x in range(len(summary_text)):
                    line = summary_text[x]
                    if "Short description:" in line:
                        bill_title = summary_text[x + 1]

                bill = Bill(
                    bill_id,
                    legislative_session=session,
                    title=bill_title,
                    chamber=chamber,
                    classification=classification,
                )
                bill.add_source(summary_page_url)
                bill.add_source(url)

                # Sponsors
                for sponsor in sponsors:
                    if sponsor.text_content():
                        bill.add_sponsorship(
                            name=sponsor.text_content(),
                            classification="cosponsor",
                            entity_type="person",
                            primary=False,
                        )

                # Bill version
                version_url = bill_doc.xpath('//a[contains (., "Full Text")]/@href')[0]
                bill.add_version_link(bill_id, version_url, media_type="text/html")

                # Actions
                bill_text = bill_doc.xpath("//pre")
                if bill_text:
                    bill_text = bill_text[0].text_content().splitlines()
                    for x in range(len(bill_text)):
                        line = bill_text[x].split()
                        # Regex is looking for this format: JAN-11-2001 or 99-02-17
                        if line and (
                            re.match(r"\D\D\D-\d\d-\d\d\d\d", line[0])
                            or re.match(r"\d\d-\d\d-\d\d", line[0])
                        ):
                            if session in ["91st", "90th"]:
                                action_date = datetime.datetime.strptime(
                                    line[0], "%y-%m-%d"
                                )
                            else:
                                action_date = datetime.datetime.strptime(
                                    line[0], "%b-%d-%Y"
                                )

                            action_date = central.localize(action_date)
                            action_date = action_date.isoformat()

                            action = " ".join(line[2:])
                            if line[1] == "S":
                                action_chamber = "upper"
                            else:
                                action_chamber = "lower"

                            for pattern, atype in _archived_action_classifiers.items():
                                if action.startswith(pattern):
                                    break
                            else:
                                atype = None
                            bill.add_action(
                                action,
                                action_date,
                                chamber=action_chamber,
                                classification=atype,
                            )

                yield bill
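
The archived Illinois pages mix two action-date formats, matched by the two regexes above; a quick check of how each parses under the strptime formats used (the dates are examples):

    import datetime

    print(datetime.datetime.strptime("99-02-17", "%y-%m-%d").date())     # 1999-02-17
    print(datetime.datetime.strptime("JAN-11-2001", "%b-%d-%Y").date())  # 2001-01-11
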
Example #25
    def scrape_bill(self, chamber, session, bill_id, url):
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if ".B. " in bill_id:
            bill_type = "bill"
        elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
            bill_type = "resolution"
        elif ".C.R. " in bill_id:
            bill_type = "concurrent resolution"
        elif ".J.R. " in bill_id:
            bill_type = "joint resolution"
        else:
            # default so bill_type is always bound
            bill_type = "bill"

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)

        primary_info = page.xpath('//div[@id="billsponsordiv"]')
        for info in primary_info:
            try:
                (title, name) = [
                    x.strip() for x in info.xpath(".//text()") if x.strip()
                ]
            except ValueError:
                self.warning(
                    "Could not find sponsor's name for {}".format(bill_id))
                continue
            assert title == "Bill Sponsor:"
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            assert floor_info[0] == "Floor Sponsor:"
            floor_sponsor = floor_info[1].replace("Sen. ",
                                                  "").replace("Rep. ", "")
            bill.add_sponsorship(
                floor_sponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
        else:
            raise AssertionError("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]')

        for version in versions:

            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            url = version.get("href")
            if not url:
                url = version.xpath("following-sibling::a[1]/@href")[0]

            bill.add_version_link(version.xpath("text()")[0].strip(),
                                  url,
                                  media_type="application/pdf")

        for related in page.xpath(
                '//b[text()="Related Documents "]/following-sibling::ul/li/'
                'a[contains(@class,"nlink")]'):
            href = related.xpath("@href")[0]
            if ".fn.pdf" in href:
                bill.add_document_link("Fiscal Note",
                                       href,
                                       media_type="application/pdf")
            else:
                text = related.xpath("text()")[0]
                bill.add_document_link(text,
                                       href,
                                       media_type="application/pdf")

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill.subject = subjects

        if page.xpath('//div[@id="billStatus"]//table'):
            status_table = page.xpath('//div[@id="billStatus"]//table')[0]
            yield from self.parse_status(bill, status_table, chamber)

        yield bill
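
The Utah bill-id clean-up strips substitute flags and collapses whitespace; a minimal sketch with a stand-in flag list (the real SUB_BLACKLIST is defined elsewhere):

    import re

    bill_id = "H.B. 12 S1"
    for flag in ("S1", "S2", "S3"):  # stand-in for SUB_BLACKLIST
        bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()
    print(bill_id)  # H.B. 12
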
Example #26
    def scrape_bill(self, chamber, session, doc_type, url, bill_type=None):
        try:
            html = self.get(url).text
            doc = lxml.html.fromstring(html)
            doc.make_links_absolute(url)
        except scrapelib.HTTPError as e:
            assert (
                "500" in e.args[0]
            ), "Unexpected error when accessing page: {}".format(e)
            self.warning("500 error for bill page; skipping bill")
            return

        # bill id, title, summary
        bill_num = re.findall(r"DocNum=(\d+)", url)[0]
        bill_type = bill_type or DOC_TYPES[doc_type[1:]]
        bill_id = doc_type + bill_num

        title = doc.xpath(
            '//span[text()="Short Description:"]/following-sibling::span[1]/' "text()"
        )[0].strip()
        # 1. Find the heading with "Synopsis As Introduced" for text.
        # 2. Go to the next heading.
        # 3. Backtrack and grab everything to, but not including, #1.
        # 4. Grab text of all, including nested, nodes.
        summary_nodes = doc.xpath(
            '//span[text()="Synopsis As Introduced"]/following-sibling::span[contains(@class, "heading2")]/'
            'preceding-sibling::*[preceding-sibling::span[text()="Synopsis As Introduced"]]//'
            "text()"
        )
        summary = "\n".join([node.strip() for node in summary_nodes])

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            title=title,
            classification=bill_type,
            chamber=chamber,
        )

        bill.add_abstract(summary, note="")

        bill.add_source(url)
        # sponsors
        sponsor_list = build_sponsor_list(doc.xpath('//a[contains(@class, "content")]'))
        # don't add just yet; we can make them better using action data

        committee_actors = {}

        # actions
        action_tds = doc.xpath('//a[@name="actions"]/following-sibling::table[1]/td')
        for date, actor, action_elem in group(action_tds, 3):
            date = datetime.datetime.strptime(date.text_content().strip(), "%m/%d/%Y")
            date = self.localize(date).date()
            actor = actor.text_content()
            if actor == "House":
                actor_id = {"classification": "lower"}
            elif actor == "Senate":
                actor_id = {"classification": "upper"}

            action = action_elem.text_content()
            classification, related_orgs = _categorize_action(action)

            # if related_orgs and any(c.startswith("committee") for c in classification):
            #     ((name, source),) = [
            #         (a.text, a.get("href"))
            #         for a in action_elem.xpath("a")
            #         if "committee" in a.get("href")
            #     ]
            #     source = canonicalize_url(source)
            #     actor_id = {"sources__url": source, "classification": "committee"}
            #     committee_actors[source] = name

            bill.add_action(
                action,
                date,
                organization=actor_id,
                classification=classification,
                related_entities=related_orgs,
            )

            if action.lower().find("sponsor") != -1:
                self.refine_sponsor_list(actor, action, sponsor_list, bill_id)

        # now add sponsors
        for spontype, sponsor, chamber, official_type in sponsor_list:
            if official_type == "primary":
                primary = True
            else:
                primary = False
            if chamber:
                bill.add_sponsorship(
                    sponsor, spontype, "person", primary=primary, chamber=chamber
                )
            else:
                bill.add_sponsorship(sponsor, spontype, "person", primary=primary)

        # versions
        version_url = doc.xpath('//a[text()="Full Text"]/@href')[0]
        self.scrape_documents(bill, version_url)
        yield bill

        votes_url = doc.xpath('//a[text()="Votes"]/@href')[0]
        yield from self.scrape_votes(session, bill, votes_url, committee_actors)
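
The action table above is walked three cells at a time with a `group` helper; a minimal sketch of such a helper, assuming it behaves like the standard "grouper" idiom (the real implementation may differ):

    def group(items, n):
        # yield successive n-sized tuples from a flat sequence
        return zip(*[iter(items)] * n)

    print(list(group([1, 2, 3, 4, 5, 6], 3)))  # [(1, 2, 3), (4, 5, 6)]
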
Example #27
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        archive_year = int(session[0:4])
        not_archive_year = archive_year >= 2009

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id
            if bill_id.strip() == "SB77" and session == "20052006":
                continue

            fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                self.warning("Bad bill id/chamber pair: {}".format(bill))
                continue

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()
            summary = ""

            # Get digest text (aka "summary") from the latest version.
            if bill.versions and not_archive_year:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

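            # Note: `version` below is whatever entry the versions loop above
            # ended on, so authors are taken from the latest bill version.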
            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            self.warning(msg % args)

                    # keep a list (not an iterator) so it can be reused below
                    committees = [c for c in committees if c]
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}
                    if not_archive_year:
                        assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            source_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

            # Votes for non archived years
            if archive_year > 2009:
                for vote_num, vote in enumerate(bill.votes):
                    if vote.vote_result == "(PASS)":
                        result = True
                    else:
                        result = False

                    if not vote.location:
                        continue

                    full_loc = vote.location.description
                    first_part = full_loc.split(" ")[0].lower()
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    else:
                        # raise ScrapeError("Bad location: %s" % full_loc) # To uncomment
                        continue

                    if vote.motion:
                        motion = vote.motion.motion_text or ""
                    else:
                        motion = ""

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = []

                    motion = motion.strip()
                    motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.compile(r"^(Senate|Assembly) ",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ",
                                    "", motion)
                    motion = re.sub(r" \(\w+\)$", "", motion)
                    motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                    motion)
                    motion = re.sub(
                        r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                        r"Urgency Clause$",
                        "(Urgency Clause)",
                        motion,
                    )
                    motion = re.sub(r"\s+", " ", motion)

                    if not motion:
                        self.warning("Got blank motion on vote for %s" %
                                     bill_id)
                        continue

                    # XXX this is responsible for all the CA 'committee' votes, not
                    # sure if that's a feature or bug, so I'm leaving it as is...
                    # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                    # org = {
                    # 'name': vote_location,
                    # 'classification': vote_classification
                    # }

                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=self._tz.localize(vote.vote_date_time),
                        result="pass" if result else "fail",
                        classification=vtype,
                        # organization=org,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.extras = {"threshold": vote.threshold}

                    fsvote.add_source(source_url)
                    fsvote.pupa_id = source_url + "#" + str(vote_num)

                    rc = {"yes": [], "no": [], "other": []}
                    for record in vote.votes:
                        if record.vote_code == "AYE":
                            rc["yes"].append(record.legislator_name)
                        elif record.vote_code.startswith("NO"):
                            rc["no"].append(record.legislator_name)
                        else:
                            rc["other"].append(record.legislator_name)

                    # Handle duplicate votes
                    for key in rc.keys():
                        rc[key] = list(set(rc[key]))

                    for key, voters in rc.items():
                        for voter in voters:
                            fsvote.vote(key, voter)
                        # Set counts by summed votes for accuracy
                        fsvote.set_count(key, len(voters))

                    yield fsvote
            if len(bill.votes) > 0 and archive_year <= 2009:
                vote_page_url = (
                    "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
                )
                vote_page_url += (
                    f"bill_id={session}{bill.session_num}{fsbill.identifier}")

                # parse the bill data page, finding the latest html text
                data = self.get(vote_page_url).content
                doc = html.fromstring(data)
                doc.make_links_absolute(vote_page_url)
                num_of_votes = len(doc.xpath("//div[@class='status']"))
                for vote_section in range(1, num_of_votes + 1):
                    lines = doc.xpath(
                        f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                    )
                    date, result, motion, vtype, location = "", "", "", "", ""
                    votes = {}
                    for line in lines:
                        line = line.text_content().split()
                        if line[0] == "Date":
                            date = line[1]
                            date = datetime.datetime.strptime(date, "%m/%d/%y")
                            date = self._tz.localize(date)
                        elif line[0] == "Result":
                            result = "pass" if "PASS" in line[1] else "fail"
                        elif line[0] == "Motion":
                            motion = " ".join(line[1:])
                        elif line[0] == "Location":
                            location = " ".join(line[1:])
                        elif len(line) > 1:
                            if line[0] == "Ayes" and line[1] != "Count":
                                votes["yes"] = line[1:]
                            elif line[0] == "Noes" and line[1] != "Count":
                                votes["no"] = line[1:]
                            elif line[0] == "NVR" and line[1] != "Count":
                                votes["not voting"] = line[1:]
                    # Determine chamber based on location
                    first_part = location.split(" ")[0].lower()
                    vote_chamber = ""
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"
                    if len(motion) > 0:
                        fsvote = VoteEvent(
                            motion_text=motion,
                            start_date=date,
                            result=result,
                            classification=vtype,
                            chamber=vote_chamber,
                            bill=fsbill,
                        )
                        fsvote.add_source(vote_page_url)
                        fsvote.pupa_id = vote_page_url + "#" + str(
                            vote_section)

                        for how_voted, voters in votes.items():
                            for voter in voters:
                                voter = voter.replace(",", "")
                                fsvote.vote(how_voted, voter)
                        yield fsvote

            yield fsbill
            self.session.expire_all()
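
As an aside, the roll-call handling above deduplicates voter names per option and then derives the counts from the deduplicated lists rather than trusting the page's reported totals. A minimal standalone sketch of that pattern follows; the tally_roll_call helper and its (vote_code, name) input are assumptions for illustration, not part of the scraper.

# Minimal sketch of the dedup-then-count tallying used above.
# `records` is a hypothetical iterable of (vote_code, legislator_name) pairs.
def tally_roll_call(records):
    rc = {"yes": [], "no": [], "other": []}
    for vote_code, name in records:
        if vote_code == "AYE":
            rc["yes"].append(name)
        elif vote_code.startswith("NO"):
            rc["no"].append(name)
        else:
            rc["other"].append(name)
    # Collapse duplicates first so counts always match the stored voter lists.
    return {key: sorted(set(names)) for key, names in rc.items()}

# Counts would then be len(names) per key, mirroring fsvote.set_count above.
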
Example #28
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.get(url).json()
        api_id = page["BillId"]

        if re.match(r"^(S|H)B ", bill_id):
            btype = ["bill"]
        elif re.match(r"(S|H)C ", bill_id):
            btype = ["commemoration"]
        elif re.match(r"(S|H)JR ", bill_id):
            btype = ["joint resolution"]
        elif re.match(r"(S|H)CR ", bill_id):
            btype = ["concurrent resolution"]
        else:
            btype = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=btype,
        )
        bill.add_source(url)

        version_rows = page["Documents"]
        assert len(version_rows) > 0
        for version in version_rows:
            date = version["DocumentDate"]
            if date:
                match = re.match(r"\d{4}-\d{2}-\d{2}", date)
                date = datetime.datetime.strptime(match.group(0),
                                                  "%Y-%m-%d").date()

                html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{version['DocumentId']}"
                pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{version['DocumentId']}.pdf"

                note = version["BillVersion"]
                bill.add_version_link(
                    note,
                    html_link,
                    date=date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
                bill.add_version_link(
                    note,
                    pdf_link,
                    date=date,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )
            else:
                self.warning("Version listed but no date or documents")

        sponsors = page["BillSponsor"]
        if sponsors:
            for sponsor in sponsors:
                sponsor_type = "person"
                member = sponsor["Member"]
                # first and last name are available, but UniqueName matches the
                # old link text; this could change later

                bill.add_sponsorship(
                    member["UniqueName"],
                    classification="primary",
                    primary=True,
                    entity_type=sponsor_type,
                )
        else:
            sponsor_type = "organization"
            committee_sponsor = re.search(r">(.*)</a>",
                                          page["BillCommitteeSponsor"])[1]
            bill.add_sponsorship(
                committee_sponsor,
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )

        for keyword in page["Keywords"]:
            bill.add_subject(keyword["Keyword"]["Keyword"])

        actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
        yield from self.scrape_action(bill, actions_url, chamber)

        yield bill
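
The prefix checks at the top of this example map a bill identifier to an openstates classification list. A small self-contained sketch of that mapping; the classify_bill_id helper name is an assumption for illustration.

import re

# Hypothetical helper mirroring the prefix checks in the example above.
def classify_bill_id(bill_id):
    if re.match(r"^(S|H)B ", bill_id):
        return ["bill"]
    elif re.match(r"(S|H)C ", bill_id):
        return ["commemoration"]
    elif re.match(r"(S|H)JR ", bill_id):
        return ["joint resolution"]
    elif re.match(r"(S|H)CR ", bill_id):
        return ["concurrent resolution"]
    return ["bill"]

# classify_bill_id("SJR 1") -> ["joint resolution"]
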
Example #29
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
        if not title:
            self.warning("blank bill on %s - skipping", url)
            return

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()
            if "author not found" in name.lower():
                continue

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsorship(
                    name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
            else:
                bill.add_sponsorship(name,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs["committees"]:
                related_entities.append({"type": "committee", "name": item})
            for item in attrs["legislators"]:
                related_entities.append({"type": "legislator", "name": item})
            bill.add_action(
                description=action,
                date=date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=attrs["classification"],
                related_entities=related_entities,
            )

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if version_url in version_urls:
                self.warning("Skipping duplicate version URL.")
                continue
            else:
                version_urls.append(version_url)

            if link.text is None:
                self.warning("Skipping unnamed version.")
                continue

            name = link.text.strip()

            if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url,
                         re.IGNORECASE):
                bill.add_document_link(note=name,
                                       url=version_url,
                                       media_type="application/pdf")
                continue

            bill.add_version_link(note=name,
                                  url=version_url,
                                  media_type="application/pdf")

        self.scrape_amendments(bill, page)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                yield from self.scrape_votes(
                    bill, self.urlescape(link.attrib["href"]))

        # If the bill has no actions and no versions, it's a bogus bill on
        # their website, which appears to happen occasionally. Skip.
        has_no_title = bill.title == "Short Title Not Found."
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
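
The version table handling in this example skips duplicate URLs and routes committee reports (and scheduled CCRs) to document links instead of version links. A condensed sketch of that routing, with hypothetical add_document / add_version callables standing in for bill.add_document_link / bill.add_version_link:

import re

def route_version_links(links, add_document, add_version):
    # `links` is a hypothetical list of (name, url) pairs scraped from the table.
    seen = set()
    for name, url in links:
        if url in seen:
            continue  # skip duplicate version URLs
        seen.add(url)
        if re.search("COMMITTEE REPORTS|SCHEDULED CCR", url, re.IGNORECASE):
            add_document(note=name, url=url, media_type="application/pdf")
        else:
            add_version(note=name, url=url, media_type="application/pdf")
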
Example #30
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                return

        title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        version_ct = self.parse_versions(page, bill)

        if version_ct < 1:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

        for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
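
The fiscal-note loop in this example derives each document's media type from its URL via get_media_type. That helper's behavior isn't shown here; a rough stand-in based on the standard library's mimetypes module might look like this (an assumption, not the openstates implementation):

import mimetypes

# Hypothetical stand-in for get_media_type: guess from the URL's extension.
def guess_media_type(url, default="application/pdf"):
    guessed, _ = mimetypes.guess_type(url)
    return guessed or default

# guess_media_type("https://example.org/LM.pdf") -> "application/pdf"
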