Beispiel #1
0
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).content
        page = lxml.html.fromstring(html)
        page.make_links_absolute(self.BASE_URL)

        if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()')[
                0
            ].strip()
        elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
            bill_id = page.xpath('//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
        else:
            self.warning("No bill id for {}".format(url))
            return
        title = page.xpath(
            '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
        )[0].strip()

        if "B" in bill_id:
            _type = ["bill"]
        elif "J" in bill_id:
            _type = ["joint resolution"]
        elif "HS" in bill_id:
            _type = ["resolution"]
        else:
            raise ValueError("unknown bill type " + bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=_type,
        )
        bill.add_source(url)

        self.scrape_bill_subjects(bill, page)
        self.scrape_bill_sponsors(bill, page)
        self.scrape_bill_actions(bill, page)

        # fiscal note
        if page.xpath('//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
            fiscal_note = page.xpath(
                '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
            )[0]
            fiscal_url = fiscal_note.get("href")
            fiscal_title = fiscal_note.text_content()
            bill.add_document_link(
                fiscal_title, fiscal_url, media_type="application/pdf",
            )

        # effective date, where available
        if page.xpath('//div[contains(text(), "Effective Date(s)")]'):
            eff_date = page.xpath('//div[contains(text(), "Effective Date(s)")]/text()')[0].strip()
            eff_date = eff_date.replace('Effective Date(s):', '').strip()
            # this can contain multiple dates, eg "July 1, 2020, July 1, 2022"
            bill.extras['date_effective'] = eff_date

        # yield from self.parse_bill_votes_new(doc, bill)
        yield bill
Beispiel #2
0
    def scrape_bill(self, chamber, session, url):
        html = self.get(url).text
        page = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = page.xpath(
            '//span[@id="ctl00_CPHBody_txtTitulo"]/text()')[0].strip()
        bill_id = page.xpath(
            '//span[@id="ctl00_CPHBody_txt_Medida"]/text()')[0].strip()

        bill_type = self.classify_bill_type(bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        start_year = session[0:4]
        self.scrape_author_table(start_year, bill, bill_id)

        # action table contains votes, hence the yield
        yield from self.scrape_action_table(chamber, bill, page, url)

        bill.add_source(url)
        yield bill
Beispiel #3
0
    def parse_bill(self, chamber, session, special, link):
        bill_num = link.text.strip()
        type_abbr = re.search("type=(B|R|)", link.attrib["href"]).group(1)

        if type_abbr == "B":
            btype = ["bill"]
        elif type_abbr == "R":
            btype = ["resolution"]

        bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

        url = utils.info_url(chamber, session, special, type_abbr, bill_num)
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        xpath = "/".join(
            [
                '//div[contains(@class, "BillInfo-ShortTitle")]',
                'div[@class="BillInfo-Section-Data"]',
            ]
        )

        if page.xpath(xpath):
            title = page.xpath(xpath).pop().text_content().strip()
        else:
            self.warning("Skipping {} {}, No title found".format(bill_id, url))
            return

        bill = Bill(
            bill_id,
            legislative_session=session,
            title=title,
            chamber=chamber,
            classification=btype,
        )
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(
            bill,
            chamber,
            utils.history_url(chamber, session, special, type_abbr, bill_num),
        )

        # only fetch votes if votes were seen in history
        # if vote_count:
        yield from self.parse_votes(
            bill, utils.vote_url(chamber, session, special, type_abbr, bill_num)
        )

        # Dedupe sources.
        sources = bill.sources
        for source in sources:
            if 1 < sources.count(source):
                sources.remove(source)

        yield bill
Beispiel #4
0
    def parse_bill(self, url):
        xml = self.get(url).content
        xml = ET.fromstring(xml)

        bill_num = self.get_xpath(xml, "bill/billNumber")
        bill_type = self.get_xpath(xml, "bill/billType")

        bill_id = "{} {}".format(bill_type, bill_num)

        chamber_name = self.get_xpath(xml, "bill/originChamber")
        chamber = self.chambers[chamber_name]

        title = self.get_xpath(xml, "bill/title")

        classification = self.classifications[bill_type]

        session = self.get_xpath(xml, "bill/congress")

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=classification,
        )

        self.scrape_actions(bill, xml)
        self.scrape_amendments(bill, xml, session, chamber, bill_id)
        self.scrape_cbo(bill, xml)
        self.scrape_committee_reports(bill, xml)
        self.scrape_cosponsors(bill, xml)
        self.scrape_laws(bill, xml)
        self.scrape_related_bills(bill, xml)
        self.scrape_sponsors(bill, xml)
        self.scrape_subjects(bill, xml)
        self.scrape_summaries(bill, xml)
        self.scrape_titles(bill, xml)
        self.scrape_versions(bill, xml)

        # https://www.congress.gov/bill/116th-congress/house-bill/1
        xml_url = "https://www.govinfo.gov/bulkdata/BILLSTATUS/{congress}/{type}/BILLSTATUS-{congress}{type}{num}.xml"
        bill.add_source(
            xml_url.format(congress=session,
                           type=bill_type.lower(),
                           num=bill_num))

        cg_url = (
            "https://congress.gov/bill/{congress}th-congress/{chamber}-{type}/{num}"
        )
        bill.add_source(
            cg_url.format(
                congress=session,
                chamber=chamber_name.lower(),
                type=classification.lower(),
                num=bill_num,
            ))

        yield bill
def test_invalid_fields_related_item():
    create_jurisdiction()
    p1 = ScrapeBill("HB 1", "2020", "Title")
    p1.add_source("http://example.com")
    p1 = p1.as_dict()
    p1["sources"][0]["test"] = 3

    with pytest.raises(DataImportError):
        BillImporter("jid").import_data([p1])
Beispiel #6
0
    def scrape_bill(self, bill_page_url):
        bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

        title = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()'
        )
        if title:
            title = title[0]
        else:
            self.warning("Missing bill title {}".format(bill_page_url))
            return False

        bill_no = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()'
        )
        if bill_no:
            bill_no = bill_no[0]
        else:
            bill_no = bill_page.xpath(
                '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()'
            )
            if bill_no:
                bill_no = bill_no[0]
            else:
                self.error("Missing bill number {}".format(bill_page_url))
                return False

        bill = Bill(
            bill_no,
            legislative_session=self.session,
            chamber="legislature",
            title=title,
            classification="bill",
        )

        bill.add_source(bill_page_url)

        self.parse_versions(bill, bill_page, bill_no)
        self.parse_acts(bill, bill_page)

        sponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()'
        )
        if sponsors:
            self.assign_sponsors(bill, sponsors[0], "primary")

        cosponsors = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()'
        )
        if cosponsors:
            self.assign_sponsors(bill, cosponsors[0], "cosponsor")

        self.parse_date_actions(bill, bill_page)
        self.parse_actions(bill, bill_page)

        yield bill
Beispiel #7
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter="|")

        for row in page:
            bill_chamber = {"H": "lower", "S": "upper"}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = {
                "B": "bill",
                "R": "resolution",
                "JR": "joint resolution",
                "CR": "concurrent resolution",
                "MR": "memorial",
                "CMR": "concurrent memorial",
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row[3],
                classification=bill_type,
            )
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(
                    primary,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(" ", "")))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type="application/pdf")

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Beispiel #8
0
 def get_bill(self, bill_id, **kwargs):
     if bill_id == "1":
         assert kwargs == {"extra": "param"}
         raise self.ContinueScraping
     else:
         assert bill_id == "2"
         assert kwargs == {}
         b = Bill("1", self.legislative_session, "title")
         b.add_source("http://example.com")
         return b
def toy_bill():
    b = Bill(
        identifier="HB 2017",
        legislative_session="2012A",
        title="A bill for an act to raise the cookie budget by 200%",
        from_organization="Foo Senate",
        classification="bill",
    )
    b.add_source("http://uri.example.com/", note="foo")
    return b
Beispiel #10
0
    def scrape_bill_info(self, session, chambers):
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        data = self.get(info_url)
        page = open_csv(data)

        chamber_map = {"H": "lower", "S": "upper"}

        for row in page:
            if row["sess_year"] != session:
                continue
            bill_id = row["bill_num"]
            chamber = chamber_map[bill_id[0]]

            if chamber not in chambers:
                continue

            if re.match(r"^(S|H)J", bill_id):
                bill_type = "joint resolution"
            elif re.match(r"^(S|H)R", bill_id):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title=row["bill_title"],
                classification=bill_type,
                chamber=chamber,
            )
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                introducer = string.capwords(
                    introducer.decode("utf-8").replace("Rep. ", "").replace("Sen. ", "")
                )
                if "Dist." in introducer:
                    introducer = " ".join(introducer.split()[:-2])
                bill.add_sponsorship(
                    name=introducer,
                    classification="primary",
                    primary=True,
                    entity_type="person",
                )

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning("no such bill: " + bill_id)
                pass
Beispiel #11
0
    def _recursively_process_bills(
        self, request_session, chamber, session, first_item=1
    ):
        """
        Once a search has been initiated, this function will save a
        Bill object for every Paper from the given chamber
        """

        url = "http://legislature.maine.gov/LawMakerWeb/searchresults.asp"
        r = request_session.get(url, params={"StartWith": first_item})
        r.raise_for_status()

        bills = lxml.html.fromstring(r.text).xpath("//tr/td/b/a")
        seen = set()
        if bills:
            for bill in bills:
                bill_id_slug = bill.xpath("./@href")[0]
                if bill_id_slug == "summary.asp?ID=280068396":
                    continue
                bill_url = "http://legislature.maine.gov/LawMakerWeb/{}".format(
                    bill_id_slug
                )
                bill_id = bill.text[:2] + " " + bill.text[2:]

                if (
                    session in BLACKLISTED_BILL_IDS
                    and bill_id in BLACKLISTED_BILL_IDS[session]
                ):
                    continue

                # avoid duplicates
                if bill_id in seen:
                    continue
                seen.add(bill_id)

                bill = Bill(
                    identifier=bill_id,
                    legislative_session=session,
                    title="",
                    chamber=chamber,
                )
                bill.add_source(bill_url)

                yield from self.scrape_bill(bill, chamber)
                yield bill

            # Make a recursive call to this function, for the next page
            PAGE_SIZE = 25
            yield from self._recursively_process_bills(
                request_session=request_session,
                chamber=chamber,
                session=session,
                first_item=first_item + PAGE_SIZE,
            )
Beispiel #12
0
    def process_page(self):
        chamber = "upper" if self.input.identifier.startswith("S") else "lower"
        short_title = self.get_column_div("Summary").text
        long_title = CSS("#title").match_one(self.root).text

        if "*" in self.input.identifier:
            stars = re.search(r"\*+", self.input.identifier).group()
            if (
                self.input.session in CARRYOVERS
                and stars in CARRYOVERS[self.input.session]
            ):
                self.input.identifier = re.sub(
                    r"\*+",
                    "-" + CARRYOVERS[self.input.session][stars],
                    self.input.identifier,
                )
            else:
                self.logger.error(
                    f"Unidentified carryover bill {self.input.identifier}. Update CARRYOVERS dict in bills.py"
                )
                return

        bill = Bill(
            identifier=self.input.identifier,
            legislative_session=self.input.session,
            title=short_title,
            chamber=chamber,
        )
        bill.subject = self.input.subjects
        # use the pretty source URL
        bill.add_source(self.input.source_url)
        bill.add_title(long_title)

        try:
            sponsors = self.get_column_div("Primary Sponsor")
            self.add_sponsors(bill, CSS("a").match(sponsors), primary=True)
        except SelectorError:
            pass
        try:
            cosponsors = self.get_column_div("Co-Sponsor")
            self.add_sponsors(bill, CSS("a").match(cosponsors), primary=False)
        except SelectorError:
            pass
        # TODO: figure out cosponsor div name, can't find any as of Feb 2021
        self.add_actions(bill, chamber)

        bdr = extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        text_url = self.source.url.replace("Overview", "Text")
        yield BillTabText(bill, source=text_url)
def test_save_object_basics():
    # ensure that save object dumps a file
    s = Scraper(juris, "/tmp/")
    p = Bill("HB 1", "2021", "Test")
    p.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    # ensure object is saved in right place
    filename = "bill_" + p._id + ".json"
    assert filename in s.output_names["bill"]
    json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY)
Beispiel #14
0
def test_whitespace_is_stripped():
    s = Scraper(juris, "/tmp/")
    b = Bill(" HB 11", "2020", " a short title     ")
    b.subject = [" one", "two ", "   three "]
    b.add_source("https://example.com/     ")

    s.save_object(b)

    # the simple cases, and nested lists / objects
    assert b.identifier == "HB 11"
    assert b.title == "a short title"
    assert b.sources[0]["url"] == "https://example.com/"
    assert b.subject == ["one", "two", "three"]
    def scrape_bill(self, session, session_slug, chamber, url):
        page = lxml.html.fromstring(self.get(url).text)
        bill_no = page.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        internal_id = re.search(r"\/Bill\/(\d+)\/Overview", url).group(1)

        # bill data gets filled in from another call
        bill_data_base = (
            "https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/"
            "FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}")
        bill_data_url = bill_data_base.format(session_slug, internal_id,
                                              time.time() * 1000)

        bill_page = lxml.html.fromstring(self.get(bill_data_url).text)

        short_title = self.get_header_field(bill_page, "Summary:").text
        short_title = short_title.replace("\u00a0", " ")

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber,
        )

        long_title = self.get_header_field(bill_page, "Title:").text
        if long_title is not None:
            bill.add_abstract(long_title, "Summary")

        sponsor_div = self.get_header_field(bill_page, "Primary Sponsor")
        if sponsor_div is not None:
            self.add_sponsors(sponsor_div, bill, "primary")

        cosponsor_div = self.get_header_field(bill_page, "Co-Sponsor")
        if cosponsor_div is not None:
            self.add_sponsors(cosponsor_div, bill, "cosponsor")

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        bill.subject = list(set(self.subject_mapping[bill_no]))

        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras["BDR"] = bdr

        bill.extras["NV_ID"] = internal_id

        bill.add_source(url)
        yield bill
def test_save_related():
    s = Scraper(juris, "/tmp/")
    p = Bill("HB 1", "2021", "Test")
    p.add_source("http://example.com")
    o = Bill("HB 2", "2021", "Test")
    o.add_source("http://example.com")
    p._related.append(o)

    with mock.patch("json.dump") as json_dump:
        s.save_object(p)

    assert json_dump.mock_calls == [
        mock.call(p.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(o.as_dict(), mock.ANY, cls=mock.ANY),
    ]
    def handle_page(self):
        bills = self.doc.xpath('//ul[@class="linkSect"]/li')
        for bill in bills:
            link = bill.getchildren()[0]
            bill_id = str(link.text_content())

            if not bill_id.startswith(("S", "H")):
                continue

            # create a bill
            desc = bill.xpath("text()")[0].strip()
            chamber = {"H": "lower", "S": "upper"}[bill_id[0]]
            bill_type = {
                "B": "bill",
                "J": "joint resolution",
                "R": "resolution"
            }[bill_id[1]]
            bill = Bill(
                bill_id,
                self.kwargs["session"],
                desc,
                chamber=chamber,
                classification=bill_type,
            )

            bill_url = link.get("href")
            sponsor_url = BASE_URL + URL_PATTERNS["sponsors"].format(
                self.kwargs["session_id"], bill_id.replace(" ", ""))

            list(
                self.scrape_page_items(BillSponsorPage,
                                       url=sponsor_url,
                                       obj=bill))
            yield from self.scrape_page_items(BillDetailPage,
                                              url=bill_url,
                                              obj=bill)
            bill.subject = self.kwargs["subjects"][bill_id]
            bill.add_source(bill_url)
            yield bill

        next_url = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if next_url:
            yield from self.scrape_page_items(BillListPage,
                                              url=next_url[0],
                                              **self.kwargs)
def test_locked_field_subitem():
    create_jurisdiction()
    bill = ScrapeBill("HB 1", "2020", "Title")
    bill.add_source("https://example.com")
    bi = BillImporter("jid")
    bi.import_data([bill.as_dict()])

    # lock the field
    b = Bill.objects.get()
    b.locked_fields = ["sources"]
    b.save()

    # reimport (without source)
    bill = ScrapeBill("HB 1", "2020", "Title")
    bi = BillImporter("jid")
    bi.import_data([bill.as_dict()])

    b = Bill.objects.get()
    assert b.sources.count() == 1
    assert b.locked_fields == ["sources"]
Beispiel #19
0
    def process_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.input["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.subjects[subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        sponsor = re.sub(r",\s+(Jr|Sr)\.", r" \1.", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            sp_type = "organization" if "committee" in sp.lower() else "person"
            bill.add_sponsorship(sp, "primary", sp_type, True)

        return BillDetail(bill)
Beispiel #20
0
    def scrape_bill(self, chamber, session, bill_id, session_id):
        bill_json_url = (
            "https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&"
            "legislativeBody={}".format(bill_id, session_id, self.chamber_map[chamber])
        )
        response = self.get(bill_json_url, timeout=80)
        page = json.loads(response.content.decode("utf-8"))

        if not page:
            self.warning("null page for %s", bill_id)
            return

        bill_title = page["ShortTitle"]
        bill_id = page["Number"]
        internal_id = page["BillId"]
        bill_type = self.get_bill_type(bill_id)
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        self.scrape_actions(bill, page, chamber)
        self.scrape_versions_and_documents(bill, internal_id)
        self.scrape_sponsors(bill, internal_id)
        self.scrape_subjects(bill, internal_id)
        yield from self.scrape_votes(bill, page)

        bill_url = (
            "https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}".format(
                internal_id, session_id
            )
        )
        bill.add_source(bill_url)

        bill.actions = sorted(bill.actions, key=lambda action: action["date"])

        yield bill
Beispiel #21
0
    def handle_list_item(self, item):
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib["href"] + "/ByCategory"

        if bill_id.startswith(("SB ", "HB ", "SPB ", "HPB ")):
            bill_type = "bill"
        elif bill_id.startswith(("HR ", "SR ")):
            bill_type = "resolution"
        elif bill_id.startswith(("HJR ", "SJR ")):
            bill_type = "joint resolution"
        elif bill_id.startswith(("SCR ", "HCR ")):
            bill_type = "concurrent resolution"
        elif bill_id.startswith(("SM ", "HM ")):
            bill_type = "memorial"
        else:
            raise ValueError("Failed to identify bill type.")

        bill = Bill(
            bill_id,
            self.kwargs["session"],
            title,
            chamber="lower" if bill_id[0] == "H" else "upper",
            classification=bill_type,
        )
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        subj_bill_id = re.sub(r"(H|S)\w+ 0*(\d+)", r"\1\2", bill_id)
        bill.subject = list(self.kwargs["subjects"][subj_bill_id])

        sponsor = re.sub(r"^(?:Rep|Sen)\.\s", "", sponsor)
        for sp in sponsor.split(", "):
            sp = sp.strip()
            bill.add_sponsorship(sp, "primary", "person", True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
    def scrape_prefiles(self, session):
        url = 'https://www.legis.iowa.gov/legislation/billTracking/prefiledBills'
        page = lxml.html.fromstring(self.get(url).content)
        page.make_links_absolute(url)

        for row in page.xpath('//table[contains(@class, "sortable")]/tr[td]'):
            title = row.xpath('td[2]/a/text()')[0].strip()
            url = row.xpath('td[2]/a/@href')[0]

            bill_id = self.extract_doc_id(title)

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber='legislature',
                title=title,
                classification='proposed bill',
            )

            if (row.xpath('td[3]/a')):
                document_url = row.xpath('td[3]/a/@href')[0]
                if '.docx' in document_url:
                    media_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
                elif '.pdf' in document_url:
                    media_type = 'application/pdf'
                bill.add_document_link(
                    note="Backround Statement",
                    url=document_url,
                    media_type=media_type
                )

            bill.add_version_link(
                note="Prefiled",
                url=url,
                media_type="application/pdf"
            )

            bill.add_source(url)

            yield bill
    def scrape_bill(self, session, chamber, bill_url):

        try:
            page = self.lxmlize("{}{}".format(CO_URL_BASE, bill_url))
        except scrapelib.HTTPError as e:
            if e.response.status_code == 503:
                self.error("Skipping %s w/ 503", bill_url)
                return
            else:
                raise

        bill_number = page.xpath(
            '//div[contains(@class,"field-name-field-bill-number")]'
            '//div[contains(@class,"field-item even")][1]/text()'
        )[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])'
        )
        bill_summary = bill_summary.replace("Read More", "").strip()
        bill = Bill(
            bill_number, legislative_session=session, chamber=chamber, title=bill_title
        )
        if bill_summary:
            bill.add_abstract(bill_summary, "summary")
        bill.add_source("{}{}".format(CO_URL_BASE, bill_url))

        self.scrape_sponsors(bill, page)
        self.scrape_actions(bill, page)
        self.scrape_versions(bill, page)
        self.scrape_research_notes(bill, page)
        self.scrape_fiscal_notes(bill, page)
        self.scrape_committee_report(bill, page)
        self.scrape_amendments(bill, page)
        yield bill
        yield from self.scrape_votes(session, bill, page)
Beispiel #24
0
    def parse_bill(self, chamber, session, bill_id, url):
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        if self.parse_bill_field(page, "Last Action") != "":
            last_action = self.parse_bill_field(
                page, "Last Action").xpath("text()")[0]
            if "WITHDRAWN" in last_action.upper():
                self.info("{} Withdrawn, skipping".format(bill_id))
                return

        title = self.parse_bill_field(page, "Title").text_content()

        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        version_ct = self.parse_versions(page, bill)

        if version_ct < 1:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)
        self.parse_proposed_amendments(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            source_url = fiscal_note.attrib["href"]
            mimetype = get_media_type(source_url)

            bill.add_document_link("Fiscal Note",
                                   source_url,
                                   media_type=mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath(
                "//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        if bdr_no != "" and bdr_no.xpath("text()"):
            bdr = bdr_no.xpath("text()")[0].strip()
            bill.extras["BDR"] = bdr

        yield bill
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.get(url).json()
        api_id = page["BillId"]

        if re.match(r"^(S|H)B ", bill_id):
            btype = ["bill"]
        elif re.match(r"(S|H)C ", bill_id):
            btype = ["commemoration"]
        elif re.match(r"(S|H)JR ", bill_id):
            btype = ["joint resolution"]
        elif re.match(r"(S|H)CR ", bill_id):
            btype = ["concurrent resolution"]
        else:
            btype = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=btype,
        )
        bill.add_source(f"https://sdlegislature.gov/Session/Bill/{api_id}")
        bill.add_source(url)

        version_rows = page["Documents"]
        assert len(version_rows) > 0
        for version in version_rows:
            date = version["DocumentDate"]
            if date:
                match = re.match(r"\d{4}-\d{2}-\d{2}", date)
                date = datetime.datetime.strptime(match.group(0),
                                                  "%Y-%m-%d").date()

                html_link = f"https://sdlegislature.gov/Session/Bill/{api_id}/{version['DocumentId']}"
                pdf_link = f"https://mylrc.sdlegislature.gov/api/Documents/{version['DocumentId']}.pdf"

                note = version["BillVersion"]
                bill.add_version_link(
                    note,
                    html_link,
                    date=date,
                    media_type="text/html",
                    on_duplicate="ignore",
                )
                bill.add_version_link(
                    note,
                    pdf_link,
                    date=date,
                    media_type="application/pdf",
                    on_duplicate="ignore",
                )
            else:
                self.warning("Version listed but no date or documents")

        sponsors = page["BillSponsor"]
        if sponsors:
            for sponsor in sponsors:
                sponsor_type = "person"
                member = sponsor["Member"]
                # first and last name are available, but UniqueName is the old link text
                # could change later?

                bill.add_sponsorship(
                    member["UniqueName"],
                    classification="primary",
                    primary=True,
                    entity_type=sponsor_type,
                )
        else:
            sponsor_type = "organization"
            committee_sponsor = re.search(r">(.*)</a>",
                                          page["BillCommitteeSponsor"])[1]
            bill.add_sponsorship(
                committee_sponsor,
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )

        for keyword in page["Keywords"]:
            bill.add_subject(keyword["Keyword"]["Keyword"])

        actions_url = f"https://sdlegislature.gov/api/Bills/ActionLog/{api_id}"
        yield from self.scrape_action(bill, actions_url, chamber)

        yield bill
Beispiel #26
0
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        bill_id_for_url = bill_id.replace(" ", "")
        bill.add_source(
            f"https://capitol.texas.gov/BillLookup/History.aspx?LegSess={session}&Bill={bill_id_for_url}"
        )

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        for version in root.iterfind(
                "billtext/docTypes/bill/versions/version"):
            if not version:
                continue

            note = version.find("versionDescription").text
            html_url = version.find("WebHTMLURL").text
            bill.add_version_link(note=note,
                                  url=html_url,
                                  media_type="text/html")
            pdf_url = version.find("WebPDFURL").text
            bill.add_version_link(note=note,
                                  url=pdf_url,
                                  media_type="application/pdf")

        for analysis in root.iterfind(
                "billtext/docTypes/analysis/versions/version"):
            if not analysis:
                continue

            description = analysis.find("versionDescription").text
            html_url = analysis.find("WebHTMLURL").text
            bill.add_document_link(
                note="Analysis ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        for fiscal_note in root.iterfind(
                "billtext/docTypes/fiscalNote/versions/version"):
            if not fiscal_note:
                continue

            description = fiscal_note.find("versionDescription").text
            html_url = fiscal_note.find("WebHTMLURL").text
            bill.add_document_link(
                note="Fiscal Note ({})".format(description),
                url=html_url,
                media_type="text/html",
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            atype = _categorize_action(desc)

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
    def scrape_bill(self, session, chamber, bill_type, url):
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = "{}{}".format(qs["billtype"], qs["billnumber"])
        versions = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath(
            '//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath(
            '//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        subs = [s.strip() for s in meta["Report Title"].split(";")]
        if "" in subs:
            subs.remove("")
        b = Bill(
            bill_id,
            session,
            meta["Measure Title"],
            chamber=chamber,
            classification=bill_type,
        )
        if meta["Description"]:
            b.add_abstract(meta["Description"], "description")
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = "{} Regular Session".format(str(int(session[:4]) - 1))
        companion = meta["Companion"].strip()
        if companion:
            b.add_related_bill(
                identifier=companion.replace(u"\xa0", " "),
                legislative_session=prior_session,
                relation_type="companion",
            )
        if bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
        ):
            prior = bill_page.xpath(
                "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()"
            )[-1]
            if "carried over" in prior.lower():
                b.add_related_bill(
                    identifier=bill_id.replace(u"\xa0", " "),
                    legislative_session=prior_session,
                    relation_type="companion",
                )
        for sponsor in meta["Introducer(s)"]:
            if "(Introduced by request of another party)" in sponsor:
                sponsor = sponsor.replace(
                    " (Introduced by request of another party)", "")
            b.add_sponsorship(sponsor, "primary", "person", True)

        self.parse_bill_versions_table(b, versions)
        self.parse_testimony(b, bill_page)
        self.parse_cmte_reports(b, bill_page)

        yield from self.parse_bill_actions_table(b, action_table, bill_id,
                                                 session, url, chamber)
        yield b
Beispiel #28
0
    def scrape(self, session=None):
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)
        for category in self._categories:
            leg_listing_url = (
                self._API_BASE_URL + f"BulkData/{category['categoryId']}/{session}"
            )
            resp = requests.post(leg_listing_url, headers=self._headers, verify=False,)
            resp.raise_for_status()
            leg_listing = resp.json()

            for leg in leg_listing:

                bill = Bill(
                    leg["legislationNumber"],
                    legislative_session=session,
                    title=leg["title"],
                    classification=category["name"],
                )
                bill.add_source(leg_listing_url)
                bill_url = (
                    f"https://lims.dccouncil.us/Legislation/{leg['legislationNumber']}"
                )
                bill.add_source(bill_url)

                if leg['lawNumber']:
                    bill.extras['lawNumber'] = leg['lawNumber']

                # Actions
                for hist in leg["legislationHistory"]:
                    hist_date = datetime.datetime.strptime(
                        hist["actionDate"], "%b %d, %Y"
                    )
                    hist_date = self._TZ.localize(hist_date)
                    hist_action = hist["actionDescription"]
                    if hist_action.split()[0] in ["OtherAmendment", "OtherMotion"]:
                        hist_action = hist_action[5:]
                    hist_class = self.classify_action(hist_action)

                    if "mayor" in hist_action.lower():
                        actor = "executive"
                    else:
                        actor = "legislature"
                    bill.add_action(
                        hist_action, hist_date, classification=hist_class, chamber=actor
                    )

                    # Documents with download links
                    if hist["downloadURL"] and ("download" in hist["downloadURL"]):
                        download = hist["downloadURL"]
                        if not download.startswith("http"):
                            download = "https://lims.dccouncil.us/" + download

                        mimetype = (
                            "application/pdf" if download.endswith("pdf") else None
                        )
                        is_version = False
                        # figure out if it's a version from type/name
                        possible_version_types = [
                            "SignedAct",
                            "Introduction",
                            "Enrollment",
                            "Engrossment",
                        ]
                        for vt in possible_version_types:
                            if vt.lower() in download.lower():
                                is_version = True
                                doc_type = vt

                        if "amendment" in download.lower():
                            doc_type = "Amendment"

                        if is_version:
                            bill.add_version_link(
                                doc_type,
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )
                        else:
                            bill.add_document_link(
                                hist["actionDescription"],
                                download,
                                media_type=mimetype,
                                on_duplicate="ignore",
                            )

                # Grabs Legislation details
                leg_details_url = (
                    self._API_BASE_URL
                    + f"LegislationDetails/{leg['legislationNumber']}"
                )
                details_resp = requests.get(
                    leg_details_url, headers=self._headers, verify=False,
                )
                details_resp.raise_for_status()
                leg_details = details_resp.json()

                # Sponsors
                for i in leg_details["introducers"]:
                    name = i["memberName"]
                    bill.add_sponsorship(
                        name,
                        classification="primary",
                        entity_type="person",
                        primary=True,
                    )

                # Co-sponsor
                if leg_details["coSponsors"]:
                    for cs in leg_details["coSponsors"]:
                        name = i["memberName"]
                        bill.add_sponsorship(
                            name,
                            classification="cosponsor",
                            entity_type="person",
                            primary=True,
                        )

                # Committee Hearing Doc
                for commHearing in leg_details["committeeHearing"]:
                    if commHearing["hearingRecord"]:
                        bill.add_document_link(
                            commHearing["hearingType"],
                            commHearing["hearingRecord"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                for committeeMarkup in leg_details["committeeMarkup"]:
                    if committeeMarkup["committeeReport"]:
                        bill.add_document_link(
                            "Committee Markup",
                            committeeMarkup["committeeReport"],
                            media_type="application/pdf",
                            on_duplicate="ignore",
                        )

                # Actions and Votes
                if leg_details["actions"]:
                    # To prevent duplicate votes
                    vote_ids = []
                    for act in leg_details["actions"]:
                        action_name = act["action"]
                        action_date = datetime.datetime.strptime(
                            act["actionDate"][:10], "%Y-%m-%d"
                        )
                        action_date = self._TZ.localize(action_date)

                        if action_name.split()[0] == "Other":
                            action_name = " ".join(action_name.split()[1:])

                        if "mayor" in action_name.lower():
                            actor = "executive"
                        else:
                            actor = "legislature"

                        # Documents and Versions
                        if act["attachment"]:
                            mimetype = (
                                "application/pdf"
                                if act["attachment"].endswith("pdf")
                                else None
                            )
                            is_version = False
                            # figure out if it's a version from type/name
                            possible_version_types = [
                                "SignedAct",
                                "Introduction",
                                "Enrollment",
                                "Engrossment",
                            ]
                            for vt in possible_version_types:
                                if vt.lower() in act["attachment"].lower():
                                    is_version = True
                                    doc_type = vt

                            if "amendment" in act["attachment"].lower():
                                doc_type = "Amendment"

                            if is_version:
                                bill.add_version_link(
                                    doc_type,
                                    act["attachment"],
                                    media_type=mimetype,
                                    on_duplicate="ignore",
                                )
                            else:
                                bill.add_document_link(
                                    doc_type,
                                    act["attachment"],
                                    media_type=mimetype,
                                    on_duplicate="ignore",
                                )

                        # Votes
                        if act["voteDetails"]:
                            result = act["voteDetails"]["voteResult"]
                            if result:
                                status = self._vote_statuses[result.lower()]
                                id_text = (
                                    str(leg["legislationNumber"])
                                    + "-"
                                    + action_name
                                    + "-"
                                    + result
                                )
                                if id_text not in vote_ids:
                                    vote_ids.append(id_text)
                                    action_class = self.classify_action(action_name)
                                    v = VoteEvent(
                                        identifier=id_text,
                                        chamber=actor,
                                        start_date=action_date,
                                        motion_text=action_name,
                                        result=status,
                                        classification=action_class,
                                        bill=bill,
                                    )
                                    v.add_source(leg_listing_url)

                                    yes_count = (
                                        no_count
                                    ) = absent_count = abstain_count = other_count = 0
                                    for leg_vote in act["voteDetails"]["votes"]:
                                        mem_name = leg_vote["councilMember"]
                                        if leg_vote["vote"] == "Yes":
                                            yes_count += 1
                                            v.yes(mem_name)
                                        elif leg_vote["vote"] == "No":
                                            no_count += 1
                                            v.no(mem_name)
                                        elif leg_vote["vote"] == "Absent":
                                            absent_count += 1
                                            v.vote("absent", mem_name)
                                        elif leg_vote["vote"] == "Recused":
                                            v.vote("abstain", mem_name)
                                            abstain_count += 1
                                        elif leg_vote["vote"] == "Present":
                                            v.vote("other", mem_name)
                                            other_count += 1
                                        else:
                                            # Incase anything new pops up
                                            other_count += 1
                                            v.vote("other", mem_name)

                                    v.set_count("yes", yes_count)
                                    v.set_count("no", no_count)
                                    v.set_count("absent", absent_count)
                                    v.set_count("abstain", abstain_count)
                                    v.set_count("other", other_count)
                                    yield v

                yield bill
Beispiel #29
0
    def scrape_bill(self, session, history_url):
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        versions = [x for x in self.versions if x[0] == bill_id]
        for version in versions:
            bill.add_version_link(
                note=self.NAME_SLUGS[version[1][-5]],
                url=version[1],
                media_type="text/html",
            )

        analyses = [x for x in self.analyses if x[0] == bill_id]
        for analysis in analyses:
            bill.add_document_link(
                note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
                url=analysis[1],
                media_type="text/html",
            )

        fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
        for fiscal_note in fiscal_notes:
            bill.add_document_link(
                note="Fiscal Note ({})".format(
                    self.NAME_SLUGS[fiscal_note[1][-5]]),
                url=fiscal_note[1],
                media_type="text/html",
            )

        witnesses = [x for x in self.witnesses if x[0] == bill_id]
        for witness in witnesses:
            bill.add_document_link(
                note="Witness List ({})".format(
                    self.NAME_SLUGS[witness[1][-5]]),
                url=witness[1],
                media_type="text/html",
            )

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            introduced = False

            if desc == "Amended":
                atype = "amendment-passage"
            elif desc == "Amendment(s) offered":
                atype = "amendment-introduction"
            elif desc == "Amendment amended":
                atype = "amendment-amendment"
            elif desc == "Amendment withdrawn":
                atype = "amendment-withdrawal"
            elif desc == "Passed" or desc == "Adopted":
                atype = "passage"
            elif re.match(r"^Received (by|from) the", desc):
                if "Secretary of the Senate" not in desc:
                    atype = "introduction"
                else:
                    atype = "filing"
            elif desc.startswith("Sent to the Governor"):
                # But what if it gets lost in the mail?
                atype = "executive-receipt"
            elif desc.startswith("Signed by the Governor"):
                atype = "executive-signature"
            elif desc.startswith("Effective on"):
                atype = "became-law"
            elif desc == "Vetoed by the Governor":
                atype = "executive-veto"
            elif desc == "Read first time":
                atype = ["introduction", "reading-1"]
                introduced = True
            elif desc == "Read & adopted":
                atype = ["passage"]
                if not introduced:
                    introduced = True
                    atype.append("introduction")
            elif desc == "Passed as amended":
                atype = "passage"
            elif desc.startswith("Referred to") or desc.startswith(
                    "Recommended to be sent to "):
                atype = "referral-committee"
            elif desc == "Reported favorably w/o amendment(s)":
                atype = "committee-passage"
            elif desc == "Filed":
                atype = "filing"
            elif desc == "Read 3rd time":
                atype = "reading-3"
            elif desc == "Read 2nd time":
                atype = "reading-2"
            elif desc.startswith("Reported favorably"):
                atype = "committee-passage-favorable"
            else:
                atype = None

            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                repls = ["Referred to", "Recommended to be sent to "]
                ctty = desc
                for r in repls:
                    ctty = ctty.replace(r, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        for author in root.findtext("authors").split(" | "):
            if author != "":
                bill.add_sponsorship(author,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)
        for coauthor in root.findtext("coauthors").split(" | "):
            if coauthor != "":
                bill.add_sponsorship(
                    coauthor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
        for sponsor in root.findtext("sponsors").split(" | "):
            if sponsor != "":
                bill.add_sponsorship(
                    sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )
        for cosponsor in root.findtext("cosponsors").split(" | "):
            if cosponsor != "":
                bill.add_sponsorship(
                    cosponsor,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
Beispiel #30
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r"<.*?>"

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = "http://legislature.vermont.gov/bill/loadBillsReleased/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)["data"] or []

        bills_url = "http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/".format(
            year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)["data"] or [])

        resolutions_url = "http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both".format(
            year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)["data"] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info["BillNumber"].startswith("J.R.H."):
                bill_type = "joint resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("J.R.S."):
                bill_type = "joint resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.C.R."):
                bill_type = "concurrent resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("H.R."):
                bill_type = "resolution"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S.R."):
                bill_type = "resolution"
                bill_chamber = "upper"

            elif info["BillNumber"].startswith("PR."):
                bill_type = "constitutional amendment"
                if info["Body"] == "H":
                    bill_chamber = "lower"
                elif info["Body"] == "S":
                    bill_chamber = "upper"
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info["BillNumber"].startswith("H."):
                bill_type = "bill"
                bill_chamber = "lower"
            elif info["BillNumber"].startswith("S."):
                bill_type = "bill"
                bill_chamber = "upper"

            else:
                raise AssertionError("Unknown bill type found: '{}'".format(
                    info["BillNumber"]))

            bill_id_original_format = (info["BillNumber"].replace(".",
                                                                  "").replace(
                                                                      " ", ""))

            bill_id = bill_id_original_format

            # put one space back in between type and number
            bill_id = re.sub(r"([a-zA-Z]+)(\d+)", r"\1 \2", bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info["Title"],
                classification=bill_type,
            )
            if "resolution" in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = "http://legislature.vermont.gov/bill/status/{0}/{1}".format(
                year_slug, info["BillNumber"])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                "following-sibling::dd[1]/ul/li")
            sponsor_type = "primary"
            for sponsor in sponsors:
                if sponsor.xpath("span/text()") == ["Additional Sponsors"]:
                    sponsor_type = "cosponsor"
                    continue

                sponsor_name = (sponsor.xpath("a/text()")[0].replace(
                    "Rep.", "").replace("Sen.", "").strip())
                if sponsor_name and not (sponsor_name[:5] == "Less"
                                         and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type="person",
                        primary=(sponsor_type == "primary"),
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                "following-sibling::dd[1]/ul/li/a |"
                '//ul[@class="bill-path"]//a')

            for version in versions:
                if version.xpath("text()"):
                    bill.add_version_link(
                        note=version.xpath("text()")[0],
                        url=version.xpath("@href")[0].replace(" ", "%20"),
                        media_type="application/pdf",
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode("utf-8"),
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(
                    info["BillNumber"]))
                yield bill
                continue

            # Capture actions
            actions_url = "http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}".format(
                year_slug, internal_bill_id)
            actions_json = self.get(actions_url)

            # Checks if page actually has json posted
            if "json" in actions_json.headers.get("Content-Type"):
                actions = json.loads(actions_json.text)["data"]
                # Checks to see if any data is actually there
                if actions == "":
                    continue
            else:
                continue
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action["FullStatus"]:
                    actor = "executive"
                elif action["ChamberCode"] == "H":
                    actor = "lower"
                elif action["ChamberCode"] == "S":
                    actor = "upper"
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action["FullStatus"]:
                    # assert chambers_passed == set("HS")
                    action_type = "executive-signature"
                elif "Vetoed by the Governor" in action["FullStatus"]:
                    action_type = "executive-veto"
                elif ("Read first time" in action["FullStatus"]
                      or "Read 1st time" in action["FullStatus"]):
                    action_type = "introduction"
                elif "Reported favorably" in action["FullStatus"]:
                    action_type = "committee-passage-favorable"
                elif actor == "lower" and any(
                        x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("H")
                elif actor == "upper" and any(
                        x.lower().startswith(" aspassed")
                        or x.lower().startswith("aspassed")
                        for x in action["keywords"].split(";")):
                    action_type = "passage"
                    chambers_passed.add("S")
                else:
                    action_type = None

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.511
                action["StatusDate"] = action["StatusDate"].replace(
                    "/0209", "/2019")

                # Manual fix for data error in
                # https://legislature.vermont.gov/bill/status/2020/H.754
                if bill_id == "H 754" and session == "2019-2020":
                    action["StatusDate"] = action["StatusDate"].replace(
                        "/0202", "/2020")

                # https://legislature.vermont.gov/bill/status/2020/H.942
                if bill_id == "H 942" and session == "2019-2020":
                    action["StatusDate"] = action["StatusDate"].replace(
                        "/0200", "/2020")

                action_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(action["StatusDate"],
                                               "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                # strftime doesn't always pad year value (%Y)  (https://bugs.python.org/issue32195)
                # and sometimes this state has typos in year part of the StatusDate value
                # which can cause validation errors, so fix leading zeroes if they are missing
                if action_date.find("-") < 4:
                    action_date = ("0" *
                                   (4 - action_date.find("-"))) + action_date

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action["FullStatus"]),
                    date=action_date,
                    chamber=actor,
                    classification=action_type,
                )

            # Capture votes
            votes_url = "http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}".format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)["data"]
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote["VoteHeaderID"]
                roll_call_url = ("http://legislature.vermont.gov/bill/"
                                 "loadBillRollCallDetails/{0}/{1}".format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)["data"]

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name,
                     _district) = member["MemberName"].split(" of ")
                    member_name = member_name.strip()

                    if member["MemberVote"] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member["MemberVote"] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote["FullStatus"]
                        # seems like we've seen both
                        or "Governor overridden" in vote["FullStatus"] or
                        "Governor overriden" in vote["FullStatus"]):
                    did_pass = True
                elif ("Failed -- " in vote["FullStatus"] or
                      "Veto of the Governor sustained" in vote["FullStatus"]):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear: " +
                                         vote["FullStatus"])

                # Check vote counts
                yea_count = int(
                    re.search(r"Yeas = (\d+)", vote["FullStatus"]).group(1))
                nay_count = int(
                    re.search(r"Nays = (\d+)", vote["FullStatus"]).group(1))

                vote_start_date = datetime.datetime.strftime(
                    datetime.datetime.strptime(vote["StatusDate"], "%m/%d/%Y"),
                    "%Y-%m-%d",
                )
                motion_text = re.sub(HTML_TAGS_RE, "",
                                     vote["FullStatus"]).strip()
                vote_identifer = (vote["StatusDate"] + "--" + motion_text +
                                  "--" + roll_call_url)
                vote_to_add = VoteEvent(
                    identifier=vote_identifer,
                    bill=bill,
                    chamber=("lower"
                             if vote["ChamberCode"] == "H" else "upper"),
                    start_date=vote_start_date,
                    motion_text=motion_text,
                    result="pass" if did_pass else "fail",
                    classification="passage",
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count("yes", yea_count)
                vote_to_add.set_count("no", nay_count)
                vote_to_add.set_count("not voting", len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote("not voting", member)

                yield vote_to_add

            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            witnesses_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/witnesses".format(
                bill_id_original_format)
            bill.add_document_link(note="Witness List",
                                   url=witnesses_doc_link_url,
                                   media_type="text/html")

            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            conferees_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/conference".format(
                bill_id_original_format)
            page = self.lxmlize(conferees_doc_link_url)
            no_data = page.xpath('//div[@class="no-data"]/text()')
            if not no_data:
                bill.add_document_link(
                    note="Conference Committee Members",
                    url=conferees_doc_link_url,
                    media_type="text/html",
                )

            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}
            meetings_doc_link_url = "https://legislature.vermont.gov/bill/print/2020/{0}/meetings".format(
                bill_id_original_format)
            bill.add_document_link(
                note="Committee Meetings",
                url=meetings_doc_link_url,
                media_type="text/html",
            )

            yield bill