Ejemplo n.º 1
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find("-")
        subjects = [s.strip() for s in title[:subject_pos - 1].split(",")]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = (page.xpath('//span[@id="lblCompNumber"]/a')
                            [0].text_content().strip())
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = (page.xpath("//span[@id='lblBillPrimeSponsor']")
                   [0].text_content().split("by")[-1])
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(sponsor,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link("Current Version",
                              btext.get("href"),
                              media_type="application/pdf")

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_version_link(
                "Amendment " + amendment.text,
                amendment.get("href"),
                media_type="application/pdf",
            )
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(afn.get("alt"),
                                   afn.getparent().get("href"),
                                   on_duplicate="ignore")

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")
                [0].text_content().split("by")[-1])
            secondary_sponsor = (secondary_sponsor.replace("*", "").replace(
                ")", "").strip())
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            if page.xpath("//table[@id='gvCoActionHistory']"):
                cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
                actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(name="Adam Smith")
    lower = Organization.objects.create(jurisdiction_id="jid",
                                        name="House",
                                        classification="lower")
    Membership.objects.create(person_id=person.id, organization_id=lower.id)
    Organization.objects.create(
        jurisdiction_id="jid",
        name="Arbitrary Committee",
        classification="committee",
        parent=lower,
    )

    oldbill = ScrapeBill(
        "HB 99",
        "1899",
        "Axe & Tack Tax Act",
        classification="tax bill",
        chamber="lower",
    )

    bill = ScrapeBill("HB 1",
                      "1900",
                      "Axe & Tack Tax Act",
                      classification="tax bill",
                      chamber="lower")
    bill.subject = ["taxes", "axes"]
    bill.add_identifier("SB 9")
    bill.add_title("Tack & Axe Tax Act")
    bill.add_action("introduced in house", "1900-04-01", chamber="lower")
    act = bill.add_action("sent to arbitrary committee",
                          "1900-04-04",
                          chamber="lower")
    act.add_related_entity(
        "arbitrary committee",
        "organization",
        _make_pseudo_id(name="Arbitrary Committee"),
    )
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship(
        "Adam Smith",
        classification="extra sponsor",
        entity_type="person",
        primary=False,
        entity_id=_make_pseudo_id(name="Adam Smith"),
    )
    bill.add_sponsorship("Jane Smith",
                         classification="lead sponsor",
                         entity_type="person",
                         primary=True)
    bill.add_abstract(
        "This is an act about axes and taxes and tacks.",
        note="official",
        date="1969-10-20",
    )
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.pdf",
                           media_type="application/pdf")
    bill.add_document_link("Fiscal Note",
                           "http://example.com/fn.html",
                           media_type="text/html")
    bill.add_version_link("Fiscal Note",
                          "http://example.com/v/1",
                          media_type="text/html")
    bill.add_source("http://example.com/source")

    # import bill
    BillImporter("jid").import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier="HB 1")
    assert b.from_organization.classification == "lower"
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ["taxes", "axes"]
    assert b.abstracts.get().note == "official"
    assert b.abstracts.get().date == "1969-10-20"

    # other_title, other_identifier added
    assert b.other_titles.get().title == "Tack & Axe Tax Act"
    assert b.other_identifiers.get().identifier == "SB 9"

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification="lower")
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert actions[1].related_entities.get(
    ).organization == Organization.objects.get(classification="committee")

    # action computed fields
    assert b.first_action_date == "1900-04-01"
    assert b.latest_action_date == "1900-04-04"
    assert b.latest_action_description == "sent to arbitrary committee"

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == "HB 99"

    # and bill got resolved
    assert rb.related_bill.identifier == "HB 99"

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name="Adam Smith")
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1