Ejemplo n.º 1
0
    def process_page(self):
        chamber = "upper" if self.input.identifier.startswith("S") else "lower"
        short_title = self.get_column_div("Summary").text
        long_title = CSS("#title").match_one(self.root).text

        if "*" in self.input.identifier:
            stars = re.search(r"\*+", self.input.identifier).group()
            if (
                self.input.session in CARRYOVERS
                and stars in CARRYOVERS[self.input.session]
            ):
                self.input.identifier = re.sub(
                    r"\*+",
                    "-" + CARRYOVERS[self.input.session][stars],
                    self.input.identifier,
                )
            else:
                self.logger.error(
                    f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
                )
                return

        bill = Bill(
            identifier=self.input.identifier,
            legislative_session=self.input.session,
            title=short_title,
            chamber=chamber,
        )
        bill.subject = self.input.subjects
        # use the pretty source URL
        bill.add_source(self.input.source_url)
        bill.add_title(long_title)

        try:
            sponsors = self.get_column_div("Primary Sponsor")
            self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
        except SelectorError:
            pass
        try:
            cosponsors = self.get_column_div("Co-Sponsor")
            self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
        except SelectorError:
            pass
        # TODO: figure out cosponsor div name, can't find any as of Feb 2021
        self.add_actions(bill, chamber)

        bdr = extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        text_url = self.source.url.replace("Overview", "Text")
        yield BillTabText(bill, source=text_url)
Ejemplo n.º 2
0
def test_whitespace_is_stripped():
    s = Scraper(juris, "/tmp/")
    b = Bill(" HB 11", "2020", " a short title     ")
    b.subject = [" one", "two ", "   three "]
    b.add_source("https://example.com/     ")

    s.save_object(b)

    # the simple cases, and nested lists / objects
    assert b.identifier == "HB 11"
    assert b.title == "a short title"
    assert b.sources[0]["url"] == "https://example.com/"
    assert b.subject == ["one", "two", "three"]
Ejemplo n.º 3
0
    def scrape_bill(self, session, session_slug, chamber, url):
        page = lxml.html.fromstring(self.get(url).text)
        bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

        # bill data gets filled in from another call
        bill_data_base = (
            "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
            "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}")
        bill_data_url = bill_data_base.format(session_slug, internal_id,
                                              time.time() * 1000)

        bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

        short_title = self.get_header_field(bill_page, "Summary:").text
        short_title = short_title.replace("\u00a0", " ")

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber,
        )

        long_title = self.get_header_field(bill_page, "Title:").text
        if long_title is not None:
            bill.add_abstract(long_title, "Summary")

        sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
        if sponsor_div is not None:
            self.add_sponsors(sponsor_div, bill, "primary")

        cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
        if cosponsor_div is not None:
            self.add_sponsors(cosponsor_div, bill, "cosponsor")

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        bill.subject = list(set(self.subject_mapping[bill_no]))

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        bill.extras["NV_ID"] = internal_id

        bill.add_source(url)
        yield bill
Ejemplo n.º 4
0
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            if not bill_id.startswith(("S", "H")):
                continue

            # create a bill
            desc = bill.xpath("text()")[0].strip()
            chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            bill = Bill(
                bill_id,
                self.kwargs["session"],
                desc,
                chamber=chamber,
                classification=bill_type,
            )

            bill_url = link.get("href")
            sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
                self.kwargs["session_id"], bill_id.replace(" ", ""))

            list(
                self.scrape_page_items(BillSponsorPage,
                                       url=sponsor_url,
                                       obj=bill))
            yield from self.scrape_page_items(BillDetailPage,
                                              url=bill_url,
                                              obj=bill)
            bill.subject = self.kwargs["subjects"][bill_id]
            bill.add_source(bill_url)
            yield bill

        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(BillListPage,
                                              url=next_url[0],
                                              **self.kwargs)
Ejemplo n.º 5
0
    def process_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.input["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.subjects[subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        sponsor = re.sub(r",\s+(Jr|Sr)\.", r" \1.", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            sp_type = "organization" if "committee" in sp.lower() else "person"
            bill.add_sponsorship(sp, "primary", sp_type, True)

        return BillDetail(bill)
Ejemplo n.º 6
0
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.kwargs["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.kwargs["subjects"][subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            bill.add_sponsorship(sp, "primary", "person", True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
Ejemplo n.º 7
0
    def scrape_bill_list(self, chamber, session, url):
        if "joint_resolution" in url:
            bill_type = "joint resolution"
        elif "resolution" in url:
            bill_type = "resolution"
        elif "bill" in url:
            bill_type = "bill"

        try:
            data = self.get(url).text
        except scrapelib.HTTPError:
            self.warning("skipping URL %s" % url)
            return
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(url)
        bill_list = doc.xpath('//ul[@class="infoLinks"]/li/div[@class="row-fluid"]')
        for b in bill_list:
            bill_url = b.xpath('./div[@class="span3"]/a/@href')[0]
            bill_id = bill_url.rsplit("/", 1)[-1]
            bill_id = bill_id.upper()

            title = (
                b.xpath('./div[@class="span6"]/text()')[0]
                .replace(" - Relating to: ", "")
                .strip()
            )

            bill = Bill(
                bill_id,
                legislative_session=session,
                title=title,
                chamber=chamber,
                classification=bill_type,
            )
            bill.subject = list(set(self.subjects[bill_id]))
            yield from self.scrape_bill_history(bill, bill_url, chamber)

            yield bill
Ejemplo n.º 8
0
    def _parse_senate_billpage(self, bill_url, year):
        bill_page = self.lxmlize(bill_url)

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = bill_page.xpath('//*[@id="lblBillNum"]')[0].text_content()
        bill_title = bill_page.xpath(
            '//*[@id="lblBillTitle"]')[0].text_content()
        bill_desc = bill_page.xpath(
            '//*[@id="lblBriefDesc"]')[0].text_content()
        # bill_lr = bill_page.xpath('//*[@id="lblLRNum"]')[0].text_content()

        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bid == "XXXXXX":
            self.info("Skipping Junk Bill")
            return

        bill = Bill(
            bill_id,
            title=bill_desc,
            chamber="upper",
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_abstract(bill_desc, note="abstract")
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor
        try:
            sponsor = bill_page.xpath('//a[@id="hlSponsor"]')[0]
        except IndexError:
            sponsor = bill_page.xpath('//span[@id="lSponsor"]')[0]

        bill_sponsor = sponsor.text_content()
        # bill_sponsor_link = sponsor.attrib.get('href')
        bill.add_sponsorship(bill_sponsor,
                             entity_type="person",
                             classification="primary",
                             primary=True)

        # cosponsors show up on their own page, if they exist
        cosponsor_tag = bill_page.xpath('//a[@id="hlCoSponsors"]')
        if len(cosponsor_tag) > 0 and cosponsor_tag[0].attrib.get("href"):
            self._parse_senate_cosponsors(bill,
                                          cosponsor_tag[0].attrib["href"])

        # get the actions
        action_url = bill_page.xpath('//a[@id="hlAllActions"]')
        if len(action_url) > 0:
            action_url = action_url[0].attrib["href"]
            self._parse_senate_actions(bill, action_url)

        # stored on a separate page
        versions_url = bill_page.xpath('//a[@id="hlFullBillText"]')
        if len(versions_url) > 0 and versions_url[0].attrib.get("href"):
            self._parse_senate_bill_versions(bill,
                                             versions_url[0].attrib["href"])

        amendment_links = bill_page.xpath(
            '//a[contains(@href,"ShowAmendment.asp")]')
        for link in amendment_links:
            link_text = link.xpath("string(.)").strip()
            if "adopted" in link_text.lower():
                link_url = link.xpath("@href")[0]
                bill.add_version_link(
                    link_text,
                    link_url,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )

        yield bill
Ejemplo n.º 9
0
    def scrape_bills(self, chamber, session, subjects):
        idex = bill_start_numbers(session)[chamber]
        FROM = "ctl00$rilinContent$txtBillFrom"
        TO = "ctl00$rilinContent$txtBillTo"
        YEAR = "ctl00$rilinContent$cbYear"
        blocks = "FOO"  # Ugh.
        while len(blocks) > 0:
            default_headers = get_default_headers(SEARCH_URL)
            default_headers[FROM] = idex
            default_headers[TO] = idex + MAXQUERY
            default_headers[YEAR] = session
            idex += MAXQUERY
            blocks = self.parse_results_page(
                self.post(SEARCH_URL, data=default_headers).text)
            blocks = blocks[1:-1]
            blocks = self.digest_results_page(blocks)

            for block in blocks:
                bill = blocks[block]
                subs = []
                try:
                    subs = subjects[bill["bill_id"]]
                except KeyError:
                    pass

                title = bill["title"][len("ENTITLED, "):]
                billid = bill["bill_id"]
                try:
                    subs = subjects[bill["bill_id"]]
                except KeyError:
                    subs = []

                for b in BILL_NAME_TRANSLATIONS:
                    if billid[:len(b)] == b:
                        billid = (BILL_NAME_TRANSLATIONS[b] +
                                  billid[len(b) + 1:].split()[0])

                b = Bill(
                    billid,
                    title=title,
                    chamber=chamber,
                    legislative_session=session,
                    classification=self.get_type_by_name(bill["bill_id"]),
                )
                b.subject = subs

                # keep bill ID around
                self._bill_id_by_type[(chamber,
                                       re.findall(r"\d+", billid)[0])] = billid

                self.process_actions(bill["actions"], b)
                sponsors = bill["sponsors"][len("BY"):].strip()
                sponsors = sponsors.split(",")
                sponsors = [s.strip() for s in sponsors]

                for href in bill["bill_id_hrefs"]:
                    b.add_version_link(href.text,
                                       href.attrib["href"],
                                       media_type="application/pdf")

                for sponsor in sponsors:
                    b.add_sponsorship(
                        sponsor,
                        entity_type="person",
                        classification="primary",
                        primary=True,
                    )

                b.add_source(SEARCH_URL)
                yield b
Ejemplo n.º 10
0
    def scrape_bill_list(self, url):
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            (bill_id, ) = bill_info.xpath("td[1]/font/input/@value")
            (sponsor, ) = bill_info.xpath("td[2]/font/input/@value")
            (subject, ) = bill_info.xpath("td[3]//text()")
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if "B" in bill_id:
                bill_type = "bill"
            elif "JR" in bill_id:
                bill_type = "joint resolution"
            elif "R" in bill_id:
                bill_type = "resolution"
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title="",
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ("http://alisondb.legislature.state.al.us/Alison/"
                        "SESSBillStatusResult.aspx?BILL={}".format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning(
                    "Bill {} has no webpage, and will be skipped".format(
                        bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            if bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
                title = (bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]')
                         [0].text_content().strip())
            if not title:
                title = "[No title given by state]"
            bill.title = title
            session = "2022FS" if self.session == "2022s1" else self.session

            version_url_base = (
                "http://alisondb.legislature.state.al.us/ALISON/"
                "SearchableInstruments/{0}/PrintFiles/{1}-".format(
                    session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + "int.pdf"
                elif version == "Engrossed":
                    version_url = version_url_base + "eng.pdf"
                elif version == "Enrolled":
                    version_url = version_url_base + "enr.pdf"
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )

            # Fiscal notes exist, but I can't figure out how to build their URL
            fiscal_notes = bill_doc.xpath(
                '//table[@class="box_fiscalnote"]')[1:]
            for fiscal_note in fiscal_notes:
                pass

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath("td[1]")[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath("td[2]/font/text()")[0], self.DATE_FORMAT)
                bir_type = bir.xpath("td[1]/font/text()")[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type,
                    bir.xpath("td[3]/font/text()")[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification="other",
                )

                try:
                    (bir_vote_id, ) = bir.xpath("td[4]/font/input/@value")
                except ValueError:
                    bir_vote_id = ""

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text,
                    )

            actions = bill_doc.xpath(
                '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath("td[1]/font/text()")[0].encode(
                        "ascii", "ignore").strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath("td[1]/font/text()")[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath("td[2]/font/text()")

                possible_amendment = action.xpath("td[3]/font/u/text()")
                if (len(possible_amendment) > 0
                        and not possible_amendment[0].strip() == ""):
                    (amendment, ) = possible_amendment
                else:
                    amendment = None

                (action_text, ) = action.xpath("td[4]/font/text()")

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = (re.search(
                        r".*? referred to the .*? committee on (.*?)$",
                        action_text).group(1).strip())
                except AttributeError:
                    action_committee = ""

                if action_date is not None and action_text.strip():
                    act = bill.add_action(
                        action_text,
                        TIMEZONE.localize(action_date),
                        chamber=actor,
                        classification=action_type,
                    )
                    if action_committee:
                        act.add_related_entity(action_committee,
                                               entity_type="organization")

                    try:
                        vote_button = action.xpath("td[9]//text()")[0].strip()
                    except IndexError:
                        vote_button = ""

                    if vote_button.startswith("Roll "):
                        vote_id = vote_button.split(" ")[-1]

                        yield from self.scrape_vote(
                            bill=bill,
                            vote_chamber=action_chamber,
                            bill_id=bill_id,
                            vote_id=vote_id,
                            vote_date=TIMEZONE.localize(action_date),
                            action_text=action_text,
                        )

                if amendment:
                    session = "2021FS" if self.session == "2021s1" else self.session
                    amend_url = (
                        "http://alisondb.legislature.state.al.us/ALISON/"
                        "SearchableInstruments/{0}/PrintFiles/{1}.pdf".format(
                            session, amendment))

                    amend_name = "Amd/Sub {}".format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type="application/pdf",
                        on_duplicate="ignore",
                    )

            yield bill
Ejemplo n.º 11
0
    def scrape_bill(self, chamber, session, bill_id, url):
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if ".B. " in bill_id:
            bill_type = "bill"
        elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
            bill_type = "resolution"
        elif ".C.R. " in bill_id:
            bill_type = "concurrent resolution"
        elif ".J.R. " in bill_id:
            bill_type = "joint resolution"

        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip().replace(".", "")

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)

        primary_info = page.xpath('//div[@id="billsponsordiv"]')
        for info in primary_info:
            try:
                (title, name) = [
                    x.strip() for x in info.xpath(".//text()") if x.strip()
                ]
            except ValueError:
                self.warning(
                    "Could not find sponsor's name for {}".format(bill_id))
                continue
            assert title == "Bill Sponsor:"
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)
        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            assert floor_info[0] == "Floor Sponsor:"
            floor_sponsor = floor_info[1].replace("Sen. ",
                                                  "").replace("Rep. ", "")
            bill.add_sponsorship(
                floor_sponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
        else:
            self.warning("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]')

        for version in versions:

            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            url = version.get("href")
            if not url:
                url = version.xpath("following-sibling::a[1]/@href")[0]

            bill.add_version_link(version.xpath("text()")[0].strip(),
                                  url,
                                  media_type="application/pdf")

        for related in page.xpath(
                '//b[text()="Related Documents "]/following-sibling::ul/li/'
                'a[contains(@class,"nlink")]'):
            href = related.xpath("@href")[0]
            if ".fn.pdf" in href:
                bill.add_document_link("Fiscal Note",
                                       href,
                                       media_type="application/pdf")
            else:
                text = related.xpath("text()")[0]
                bill.add_document_link(text,
                                       href,
                                       media_type="application/pdf")

        subjects = []
        for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
            subjects.append(link.text.strip())
        bill.subject = subjects

        if page.xpath('//div[@id="billStatus"]//table'):
            status_table = page.xpath('//div[@id="billStatus"]//table')[0]
            yield from self.parse_status(bill, status_table, chamber)

        yield bill
Ejemplo n.º 12
0
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = "%s/GetLegislation?biennium=%s&billNumber" "=%s" % (
            self._base_url,
            self.biennium,
            bill_num,
        )

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        xml_chamber = xpath(page, "string(wa:OriginalAgency)")
        chamber = self._chamber_map[xml_chamber]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page, "string(wa:ShortLegislationType/wa:LongLegislationType)"
        )
        bill_type = bill_type.lower()

        if bill_type == "gubernatorial appointment":
            return

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=[bill_type],
        )
        fake_source = (
            "http://apps.leg.wa.gov/billinfo/"
            "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4])
        )

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(
                    note=version["note"],
                    url=version["url"],
                    media_type=version["media_type"],
                )
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(
                    note=document["note"],
                    url=document["url"],
                    media_type=document["media_type"],
                )
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, chamber, fake_source)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Ejemplo n.º 13
0
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(jurisdiction_id="jid",
                                        name="House",
                                        classification="lower")
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )

    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    bill = ScrapeBill("HB 1",
                      "1900",
                      "Axe & Tack Tax Act",
                      classification="tax bill",
                      chamber="lower")
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee",
                          "1900-04-04",
                          chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship("Jane Smith",
                         classification="lead sponsor",
                         entity_type="person",
                         primary=True)
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.pdf",
                           media_type="application/pdf")
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.html",
                           media_type="text/html")
    bill.add_version_link("Fiscal Note",
                          "http://example.com/v/1",
                          media_type="text/html")
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
Ejemplo n.º 14
0
    def scrape_bill(
        self,
        session,
        chamber,
        bill_id,
        title,
        url,
        strip_sponsors=re.compile(r"\s*\(.{,50}\)\s*").sub,
    ):

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)

        xpath = '//strong[contains(., "SUBJECT")]/../' "following-sibling::td/a/text()"
        bill.subject = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version_link(**version)

        self.scrape_amendments(page, bill)

        # Resolution pages have different html.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath("td/strong/text()")
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, "").strip()
            values[heading] = value

        # summary was always same as title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors("", values.get("LEAD SPONSOR:", ""))
        if primary:
            bill.add_sponsorship(
                name=primary,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # Add cosponsors.
        if values.get("SPONSORS:"):
            sponsors = strip_sponsors("", values["SPONSORS:"])
            sponsors = re.split(r", (?![A-Z]\.)", sponsors)
            for name in sponsors:
                name = name.strip(", \n\r")
                if name:
                    # Fix name splitting bug where "Neale, D. Hall"
                    match = re.search(r"(.+?), ([DM]\. Hall)", name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsorship(
                                name=name,
                                classification="cosponsor",
                                entity_type="person",
                                primary=False,
                            )
                    else:
                        bill.add_sponsorship(
                            name=name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=False,
                        )

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            yield from self.scrape_house_vote(bill, link.attrib["href"])

        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath("td")
            if len(tds) < 3:
                continue

            chamber_letter = tds[0].text_content()
            chamber = {"S": "upper", "H": "lower"}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith("passed senate"):
                for href in tds[1].xpath("a/@href"):
                    yield from self.scrape_senate_vote(bill, href, date)

            attrs = dict(chamber=chamber,
                         description=action,
                         date=date.strftime("%Y-%m-%d"))
            temp = self.categorizer.categorize(action)
            related_entities = []
            for key, values in temp.items():
                if key != "classification":
                    for value in values:
                        related_entities.append({"type": key, "name": value})
            attrs.update(classification=temp["classification"],
                         related_entities=related_entities)
            bill.add_action(**attrs)

        yield bill
Ejemplo n.º 15
0
    def _parse_house_bill(self, url, session):
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = "%s/%s" % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace("Bill.aspx", "BillContent.aspx")
        url = url.replace("&code=R", "&code=R&style=new")

        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath("//table/tr")
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == "Co-Sponsor:":
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == "LR Number:"
        # bill_lr = lr_label_tag[1].text_content()

        lastActionOffset = 0
        if (table_rows[4 + cosponsorOffset][0].text_content().strip() ==
                "Governor Action:"):
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == "Bill String:"
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        triplet = bill_id[:3]

        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[3:].strip())
        else:
            bill_number = int(bill_id[3:])

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = "Appropriations Bill"
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title))
                return

        bill = Bill(
            bill_id,
            chamber="lower",
            title=bill_desc,
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note="official")

        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        # try:
        #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
        # except IndexError:
        #     return
        bill.add_sponsorship(bill_sponsor,
                             entity_type="person",
                             classification="primary",
                             primary=True)

        # check for cosponsors
        (sponsors_url,
         ) = bill_page.xpath("//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        (actions_link,
         ) = bill_page.xpath("//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill versions
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = "%s%s" % (self._house_base_url,
                                 doc_tag[0].attrib["href"])
            bill.add_document_link(doc, text_url, media_type="text/html")

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == "PDF":
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_version_link(
                    version,
                    vurl.attrib["href"],
                    media_type=mimetype,
                    on_duplicate="ignore",
                )

        # house bill versions
        # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow
        version_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Text")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath(
                    './/div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                if ".pdf" in path:
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_version_link(version,
                                      path,
                                      media_type=mimetype,
                                      on_duplicate="ignore")

        # house bill summaries
        # everything between the row containing "Bill Summary" in an h2
        # and the next div.DocHeaderRow
        summary_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Summary")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )

        # if there are no amedments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[h2[contains(text(),"Bill Summary")]]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            if version:
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                summary_name = "Bill Summary ({})".format(version)
                if ".pdf" in path:
                    mimetype = "application/pdf"
                else:
                    mimetype = "text/html"
                bill.add_document_link(summary_name,
                                       path,
                                       media_type=mimetype,
                                       on_duplicate="ignore")

        # house bill amendments
        amendment_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Amendment")]]/'
            'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            version = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip(
                )
            path = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip(
                )
            summary_name = "Amendment {}".format(version)

            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = "{} (Defeated)".format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = "{} (Adopted)".format(summary_name)

            distributed_icon = row.xpath(
                './/img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = "{} (Distributed)".format(summary_name)

            if ".pdf" in path:
                mimetype = "application/pdf"
            else:
                mimetype = "text/html"
            bill.add_version_link(summary_name,
                                  path,
                                  media_type=mimetype,
                                  on_duplicate="ignore")

        yield bill
Ejemplo n.º 16
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
        if not title:
            self.warning("blank bill on %s - skipping", url)
            return

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()
            if "author not found" in name.lower():
                continue

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsorship(
                    name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
            else:
                bill.add_sponsorship(name,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs["committees"]:
                related_entities.append({"type": "committee", "name": item})
            for item in attrs["legislators"]:
                related_entities.append({"type": "legislator", "name": item})
            bill.add_action(
                description=action,
                date=date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=attrs["classification"],
                related_entities=related_entities,
            )

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if version_url in version_urls:
                self.warning("Skipping duplicate version URL.")
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url,
                         re.IGNORECASE):
                bill.add_document_link(note=name,
                                       url=version_url,
                                       media_type="application/pdf")
                continue

            bill.add_version_link(note=name,
                                  url=version_url,
                                  media_type="application/pdf")

        self.scrape_amendments(bill, page)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                yield from self.scrape_votes(
                    bill, self.urlescape(link.attrib["href"]))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = bill.title == "Short Title Not Found."
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Ejemplo n.º 17
0
    def scrape_bill(self, chamber, session, bill_id):
        # try and get bill for the first year of the session biennium
        url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
            session[:4],
            bill_id.replace(" ", "-"),
        )
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if (
            "Page Not Found" in html
            or "The bill you are looking for is not available yet" in html
        ):
            url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
                session[-4:],
                bill_id.replace(" ", "-"),
            )
            html = self.get(url).text
            if (
                "Page Not Found" in html
                or "The bill you are looking for is not available yet" in html
            ):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute("http://legislature.mi.gov")

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[
            0
        ].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(" ")[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u"\xa0", " ")
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            if len(sponsors) > 1:
                classification = (
                    "primary"
                    if sponsor.tail and "primary" in sponsor.tail
                    else "cosponsor"
                )
            else:
                classification = "primary"
            bill.add_sponsorship(
                name=name.strip(),
                chamber=chamber,
                entity_type="person",
                primary=classification == "primary",
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath("td")  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            try:
                date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%y"))
            except ValueError:
                try:
                    date = TIMEZONE.localize(
                        datetime.datetime.strptime(date, "%m/%d/%Y")
                    )
                except ValueError:
                    self.warning(
                        "{} has action with invalid date. Skipping Action".format(
                            bill_id
                        )
                    )
                    continue
            # use journal for actor
            # then fall back to upper/lower case
            # Journal entries are often posted with 'Expected Soon' as the cite,
            # then changed to the journal entry.
            if "SJ" in journal.upper():
                actor = "upper"
            elif "HJ" in journal.upper():
                actor = "lower"
            elif action.split()[0].islower():
                actor = "lower"
            elif action.split()[0].isupper():
                actor = "upper"
            else:
                actor = "legislature"

            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(
                r"WITH SUBSTITUTE\s+([\w\-\d]+)", action, re.IGNORECASE
            )
            if submatch and tds[2].xpath("a"):
                version_url = tds[2].xpath("a/@href")[0]
                version_name = tds[2].xpath("a/text()")[0].strip()
                version_name = "Substitute {}".format(version_name)
                self.info("Found Substitute {}".format(version_url))
                if version_url.lower().endswith(".pdf"):
                    mimetype = "application/pdf"
                elif version_url.lower().endswith(".htm"):
                    mimetype = "text/html"
                bill.add_version_link(version_name, version_url, media_type=mimetype)

            # check if action mentions a vote
            rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath("a/@href")
                if journal_link:
                    objectname = journal_link[0].rsplit("=", 1)[-1]
                    chamber_name = {"upper": "Senate", "lower": "House"}[actor]
                    vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                        session,
                        chamber_name,
                        objectname,
                    )
                    results = self.parse_roll_call(vote_url, rc_num, session)

                    if results is not None:
                        vote_passed = len(results["yes"]) > len(results["no"])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result="pass" if vote_passed else "fail",
                            classification="passage",
                        )

                        # check the expected counts vs actual
                        count = re.search(r"YEAS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["yes"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d"
                                % (bill_id, action, count, len(results["yes"]))
                            )
                        count = re.search(r"NAYS (\d+)", action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results["no"]):
                            self.warning(
                                "vote count mismatch for %s %s, %d != %d"
                                % (bill_id, action, count, len(results["no"]))
                            )

                        vote.set_count("yes", len(results["yes"]))
                        vote.set_count("no", len(results["no"]))
                        vote.set_count("other", len(results["other"]))
                        possible_vote_results = ["yes", "no", "other"]
                        for pvr in possible_vote_results:
                            for name in results[pvr]:
                                if session == "2017-2018":
                                    names = name.split("\t")
                                    for n in names:
                                        vote.vote(pvr, name.strip())
                                else:
                                    # Prevents voter names like "House Bill No. 4451, entitled" and other sentences
                                    if len(name.split()) < 5:
                                        vote.vote(pvr, name.strip())
                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" % (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                if url.endswith(".pdf"):
                    mimetype = "application/pdf"
                elif url.endswith(".htm"):
                    mimetype = "text/html"
                bill.add_version_link(name, url, media_type=mimetype)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill
Ejemplo n.º 18
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        withdrawn = False

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                withdrawn = True

        if withdrawn:
            title = "Withdrawn."
        else:
            title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        self.parse_versions(page, bill)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

        # only grab links in the first table, because proposed amendments have sponsors that are not bill sponsors.
        for link in page.xpath(
            "//div[contains(@class,'bill-table')][1]//td/span/a[contains(@href, 'Legislator-Profile')]"
        ):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath("//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        if self.parse_bill_field(page, "Summary of Original Version") != "":
            summary = (
                self.parse_bill_field(page, "Summary of Original Version")
                .text_content()
                .strip()
            )
            bill.add_abstract(summary, note="Summary of Original Version")

        if withdrawn:
            action = self.parse_bill_field(page, "Last Action").text_content().strip()
            wd_date = re.findall(r"\d{2}\/\d{2}\/\d+", action)[0]
            wd_date = dateutil.parser.parse(wd_date).date()
            bill.add_action(
                action, wd_date, chamber=chamber, classification="withdrawal"
            )

        yield bill
Ejemplo n.º 19
0
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        archive_year = int(session[0:4])
        not_archive_year = archive_year >= 2009

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id
            if bill_id.strip() == "SB77" and session == "20052006":
                continue

            fsbill = Bill(bill_id, bill_session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()
            summary = ""

            # Get digest test (aka "summary") from latest version.
            if bill.versions and not_archive_year:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            self.warning(msg % args)

                    committees = filter(None, committees)
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}
                    if not_archive_year:
                        assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            source_url = (
                "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
            )
            source_url += f"bill_id={session}{bill.session_num}{fsbill.identifier}"

            # Votes for non archived years
            if archive_year > 2009:
                for vote_num, vote in enumerate(bill.votes):
                    if vote.vote_result == "(PASS)":
                        result = True
                    else:
                        result = False

                    if not vote.location:
                        continue

                    full_loc = vote.location.description
                    first_part = full_loc.split(" ")[0].lower()
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"
                        # vote_location = ' '.join(full_loc.split(' ')[1:])
                    else:
                        # raise ScrapeError("Bad location: %s" % full_loc) # To uncomment
                        continue

                    if vote.motion:
                        motion = vote.motion.motion_text or ""
                    else:
                        motion = ""

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"

                    motion = motion.strip()
                    motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.compile(r"^(Senate|Assembly) ",
                                        re.IGNORECASE).sub("", motion)
                    motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ",
                                    "", motion)
                    motion = re.sub(r" \(\w+\)$", "", motion)
                    motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                    motion)
                    motion = re.sub(
                        r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                        r"Urgency Clause$",
                        "(Urgency Clause)",
                        motion,
                    )
                    motion = re.sub(r"\s+", " ", motion)

                    if not motion:
                        self.warning("Got blank motion on vote for %s" %
                                     bill_id)
                        continue

                    # XXX this is responsible for all the CA 'committee' votes, not
                    # sure if that's a feature or bug, so I'm leaving it as is...
                    # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                    # org = {
                    # 'name': vote_location,
                    # 'classification': vote_classification
                    # }

                    fsvote = VoteEvent(
                        motion_text=motion,
                        start_date=self._tz.localize(vote.vote_date_time),
                        result="pass" if result else "fail",
                        classification=vtype,
                        # organization=org,
                        chamber=vote_chamber,
                        bill=fsbill,
                    )
                    fsvote.extras = {"threshold": vote.threshold}

                    fsvote.add_source(source_url)
                    fsvote.pupa_id = source_url + "#" + str(vote_num)

                    rc = {"yes": [], "no": [], "other": []}
                    for record in vote.votes:
                        if record.vote_code == "AYE":
                            rc["yes"].append(record.legislator_name)
                        elif record.vote_code.startswith("NO"):
                            rc["no"].append(record.legislator_name)
                        else:
                            rc["other"].append(record.legislator_name)

                    # Handle duplicate votes
                    for key in rc.keys():
                        rc[key] = list(set(rc[key]))

                    for key, voters in rc.items():
                        for voter in voters:
                            fsvote.vote(key, voter)
                        # Set counts by summed votes for accuracy
                        fsvote.set_count(key, len(voters))

                    yield fsvote
            if len(bill.votes) > 0 and archive_year <= 2009:
                vote_page_url = (
                    "http://leginfo.legislature.ca.gov/faces/billVotesClient.xhtml?"
                )
                vote_page_url += (
                    f"bill_id={session}{bill.session_num}{fsbill.identifier}")

                # parse the bill data page, finding the latest html text
                data = self.get(vote_page_url).content
                doc = html.fromstring(data)
                doc.make_links_absolute(vote_page_url)
                num_of_votes = len(doc.xpath("//div[@class='status']"))
                for vote_section in range(1, num_of_votes + 1):
                    lines = doc.xpath(
                        f"//div[@class='status'][{vote_section}]//div[@class='statusRow']"
                    )
                    date, result, motion, vtype, location = "", "", "", "", ""
                    votes = {}
                    for line in lines:
                        line = line.text_content().split()
                        if line[0] == "Date":
                            date = line[1]
                            date = datetime.datetime.strptime(date, "%m/%d/%y")
                            date = self._tz.localize(date)
                        elif line[0] == "Result":
                            result = "pass" if "PASS" in line[1] else "fail"
                        elif line[0] == "Motion":
                            motion = " ".join(line[1:])
                        elif line[0] == "Location":
                            location = " ".join(line[1:])
                        elif len(line) > 1:
                            if line[0] == "Ayes" and line[1] != "Count":
                                votes["yes"] = line[1:]
                            elif line[0] == "Noes" and line[1] != "Count":
                                votes["no"] = line[1:]
                            elif line[0] == "NVR" and line[1] != "Count":
                                votes["not voting"] = line[1:]
                    # Determine chamber based on location
                    first_part = location.split(" ")[0].lower()
                    vote_chamber = ""
                    if first_part in ["asm", "assembly"]:
                        vote_chamber = "lower"
                    elif first_part.startswith("sen"):
                        vote_chamber = "upper"

                    if "Third Reading" in motion or "3rd Reading" in motion:
                        vtype = "passage"
                    elif "Do Pass" in motion:
                        vtype = "passage"
                    else:
                        vtype = "other"
                    if len(motion) > 0:
                        fsvote = VoteEvent(
                            motion_text=motion,
                            start_date=date,
                            result=result,
                            classification=vtype,
                            chamber=vote_chamber,
                            bill=fsbill,
                        )
                        fsvote.add_source(vote_page_url)
                        fsvote.pupa_id = vote_page_url + "#" + str(
                            vote_section)

                        for how_voted, voters in votes.items():
                            for voter in voters:
                                voter = voter.replace(",", "")
                                fsvote.vote(how_voted, voter)
                        yield fsvote

            yield fsbill
            self.session.expire_all()
Ejemplo n.º 20
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(
                page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                return

        title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        version_ct = self.parse_versions(page, bill)

        if version_ct < 1:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath(
                "//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
Ejemplo n.º 21
0
    def scrape_actions(self, session, href):
        page = self.lxmlize(href)

        (bid,) = page.xpath('//h1[@id="page-title"]/text()')
        bid = re.sub(r"^Bill Actions for ", "", bid)
        subjects = self.subjects.get(bid, [])

        # some pages say "Measure Number Breakdown", others "Bill..."
        table = page.xpath("//table[contains(@summary, 'Number Breakdown')]")
        table = table[0]
        ttrows = page.xpath("//div[@id='application']/p")
        descr = ttrows[-2]

        title = re.sub(r"\s+", " ", descr.text_content()).strip()
        ttrows = ttrows[:-1]

        chamber = {"H": "lower", "S": "upper"}[bid[0]]

        type_ = bid[1:3]
        bill_type = "bill"
        if type_.startswith("B"):
            bill_type = "bill"

        if type_.startswith("R"):
            bill_type = "resolution"

        if type_ == "CR":
            bill_type = "concurrent resolution"

        bill = Bill(
            bid,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = subjects
        bill.add_source(href)

        for row in ttrows:
            if isinstance(row, lxml.html.HtmlComment):
                continue  # ignore HTML comments, no text_content()
            sponsors = row.text_content().strip()
            sinf = re.match(
                r"(?i)introduced by( (rep\.|sen\.))? (?P<sponsors>.*)", sponsors
            )
            if sinf:
                sponsors = sinf.groupdict()
                for sponsor in [x.strip() for x in sponsors["sponsors"].split(",")]:
                    bill.add_sponsorship(
                        sponsor,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

        dt = None
        oldchamber = "other"
        for row in table.xpath(".//tr"):
            if row.text_content().strip() == "":
                continue

            if "Meeting Description" in [x.strip() for x in row.xpath(".//th/text()")]:
                continue

            row = row.xpath("./*")
            row = [x.text_content().strip() for x in row]

            if len(row) > 3:
                row = row[:3]

            date, chamber, action = row

            try:
                chamber = {"House": "lower", "Senate": "upper"}[chamber]
                oldchamber = chamber
            except KeyError:
                chamber = oldchamber

            if date != "":
                dt = datetime.strptime("%s %s" % (date, self.year), "%m/%d %Y")

            classif = self.categorizer.categorize(action)

            bill.add_action(
                chamber=chamber,
                description=action,
                date=dt.strftime("%Y-%m-%d"),
                classification=classif["classification"],
            )

        version_url = page.xpath("//a[contains(text(), 'Versions')]")
        if len(version_url) == 1:
            href = version_url[0].attrib["href"]
            bill = self.scrape_versions(bill, href)

        yield bill