Esempio n. 1
0
    def scrape(self):
        """Yield a ``Bill`` per legislation summary, plus any ``VoteEvent``s.

        Iterates ``self.legislation`` for records created after 2015-05-17.
        Summaries with an empty title or no intro date are skipped.  If
        ``self.legDetails`` raises ``IndexError`` for a record, the bill is
        yielded with only its summary data and its URL is remembered; the
        remembered URLs are printed once iteration finishes.
        """
        unreachable_urls = []

        for leg_summary in self.legislation(
                created_after=datetime.datetime(2015, 5, 17)):
            title = leg_summary['Title'].strip()

            # Some records have no intro date, e.g.
            # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
            if not title or not leg_summary['Intro\xa0Date']:
                continue

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(
                self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try:
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError:
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            # For now, related bills are only linked when the title contains
            # words that suggest an omnibus relationship.  The comparison is
            # made against the lower-cased title so that capitalised titles
            # ("Sundry ...", "Miscellaneous ...") match as well.
            lower_title = title.lower()
            if "sundry" in lower_title or "miscellaneous" in lower_title:
                for related_bill in leg_details.get('Related files', []):
                    bill.add_related_bill(
                        identifier=related_bill['label'],
                        legislative_session=bill.legislative_session,
                        relation_type='replaces')

            for i, sponsor in enumerate(leg_details.get('Sponsors', [])):
                # The first listed sponsor is treated as the primary one.
                if i == 0:
                    primary = True
                    sponsorship_type = "Primary"
                else:
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legislation as
                # individual role holders or as the Office of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 'Mendoza, Susana')):
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm', )):
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                # Placeholder "sponsors" are not recorded as sponsorships.
                if not sponsor_name.startswith(
                    ('Misc. Transmittal', 'No Sponsor', 'Dept./Agency')):
                    bill.add_sponsorship(
                        sponsor_name,
                        sponsorship_type,
                        entity_type,
                        primary,
                        entity_id=make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details:
                for subject in leg_details['Topic'].split(','):
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []):
                if attachment['label']:
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']):
                action_description = action['Action']
                try:
                    action_date = self.toTime(
                        action['Date']).date().isoformat()
                except AttributeError:  # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description:
                    # 'Action By' is either a mapping with a 'label' or a
                    # value that cannot be indexed by string (TypeError).
                    try:
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError:
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council':
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(
                        action_description,
                        action_date,
                        organization={'name': responsible_org},
                        classification=ACTION_CLASSIFICATION[
                            action_description])

                    if action_description == 'Referred':
                        # Normalise the controlling body to a list: a single
                        # mapping is wrapped, anything that raises TypeError
                        # on ['label'] is used as-is.
                        try:
                            leg_details[
                                'Current Controlling Legislative Body'][
                                    'label']
                            controlling_bodies = [
                                leg_details[
                                    'Current Controlling Legislative Body']
                            ]
                        except TypeError:
                            controlling_bodies = leg_details[
                                'Current Controlling Legislative Body']
                        if controlling_bodies:
                            for controlling_body in controlling_bodies:
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee"):
                                    act.add_related_entity(
                                        body_name, 'organization')
                                else:
                                    act.add_related_entity(
                                        body_name,
                                        'organization',
                                        entity_id=make_pseudo_id(
                                            name=body_name))

                    if 'url' in action['Action\xa0Details']:
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result:  # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(
                                legislative_session=bill.legislative_session,
                                motion_text=action_description,
                                organization={'name': responsible_org},
                                classification=None,
                                start_date=action_date,
                                result=result,
                                bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes:
                                action_vote.vote(option, voter)

                            yield action_vote

            yield bill
        print(unreachable_urls)
Esempio n. 2
0
    def scrape_chamber(self, chamber, session):
        """Yield the bills of one chamber from the ``bill_status/`` API listing.

        ``chamber`` is ``'upper'`` (Senate) or anything else (House); only
        bills whose number starts with the chamber's initial letter are
        processed.  Whatever ``self.scrape_html`` yields for a bill is yielded
        before the bill itself; an HTTP error while fetching that HTML is
        logged and the bill is still yielded.
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            # Order matters: 'CR' must be tested before the plain 'R'.
            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # Without this branch the type would be unbound, or silently
                # inherited from the previous bill in the loop.
                self.warning('unknown bill type for %s; skipping' % bill_id)
                continue

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE']
                    and bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            # An "original sponsor" is the API's expression of "primary sponsor"
            for primary_sponsor in bill_data['ORIGINAL_SPONSOR']:
                bill.add_sponsorship(name=primary_sponsor,
                                     entity_type='organization' if "committee"
                                     in primary_sponsor.lower() else 'person',
                                     primary=True,
                                     classification="original sponsor")
            for sponsor in bill_data['SPONSOR_NAMES']:
                # already recorded above as an original sponsor
                if sponsor in bill_data['ORIGINAL_SPONSOR']:
                    continue
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='organization'
                    if "committee" in sponsor.lower() else 'person',
                    primary=False,
                    classification='cosponsor',
                )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate' else 'lower')

                date = event['session_date']
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning(
                        'unknown action code on %s: %s %s' %
                        (bill_id, event['action_code'], event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(action,
                                date,
                                chamber=actor,
                                classification=atype)

            # Versions are exposed in `bill_data['versions']`,
            # but lack any descriptive text or identifiers;
            # continue to scrape these from the HTML
            try:
                yield from self.scrape_html(bill, session)
            except scrapelib.HTTPError:
                # Use the local identifier: the Bill object is not indexed
                # by 'bill_id' here.
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill_id))

            yield bill
Esempio n. 3
0
    def scrape_bill(self, chamber, session, bill_id):
        """Yield the ``VoteEvent``s and finally the ``Bill`` for one bill page.

        The bill page is requested for the year in ``session[:4]`` first and,
        if the site answers with a "not found"/"not available" page, for the
        year in ``session[-4:]``.  If neither exists a warning is logged and
        nothing is yielded.
        """
        # try and get bill for the first year of the session biennium
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[:4], bill_id.replace(' ', '-'))
        html = self.get(url).text
        # Otherwise, try second year of the session biennium
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
                session[-4:], bill_id.replace(' ', '-'))
            html = self.get(url).text
            if ('Page Not Found' in html or
                    'The bill you are looking for is not available yet' in html):
                self.warning("Cannot open bill page for {}; skipping".format(bill_id))
                return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute('http://legislature.mi.gov')

        title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(' ')[0][1:]]

        bill = Bill(bill_id, session, title, chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            name = sponsor.text.replace(u'\xa0', ' ')
            # sometimes district gets added as a link
            if name.isnumeric():
                continue

            # With several links, only those whose trailing text mentions
            # 'primary' are primary; a lone sponsor is always primary.
            if len(sponsors) > 1:
                classification = (
                    'primary'
                    if sponsor.tail and 'primary' in sponsor.tail
                    else 'cosponsor'
                )
            else:
                classification = 'primary'
            bill.add_sponsorship(
                name=name,
                chamber=chamber,
                entity_type='person',
                primary=classification == 'primary',
                classification=classification,
            )

        bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath('td')  # date, journal link, action
            date = tds[0].text_content()
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = 'upper' if 'SJ' in journal else 'lower'
            classification = categorize_action(action)
            bill.add_action(action, date, chamber=actor, classification=classification)

            # check if action mentions a sub
            submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE)
            if submatch and tds[2].xpath('a'):
                version_url = tds[2].xpath('a/@href')[0]
                version_name = tds[2].xpath('a/text()')[0].strip()
                version_name = 'Substitute {}'.format(version_name)
                self.info("Found Substitute {}".format(version_url))
                self._add_version_link(bill, version_name, version_url)

            # check if action mentions a vote
            rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
            if rcmatch:
                rc_num = rcmatch.groups()[0]
                # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
                journal_link = tds[1].xpath('a/@href')
                if journal_link:
                    objectname = journal_link[0].rsplit('=', 1)[-1]
                    chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                    vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                        session, chamber_name, objectname)
                    results = self.parse_roll_call(vote_url, rc_num)

                    if results is not None:
                        vote_passed = len(results['yes']) > len(results['no'])
                        vote = VoteEvent(
                            start_date=date,
                            chamber=actor,
                            bill=bill,
                            motion_text=action,
                            result='pass' if vote_passed else 'fail',
                            classification='passage',
                        )

                        # check the expected counts vs actual
                        count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results['yes']):
                            self.warning('vote count mismatch for %s %s, %d != %d' %
                                         (bill_id, action, count, len(results['yes'])))
                        count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                        count = int(count.groups()[0]) if count else 0
                        if count != len(results['no']):
                            self.warning('vote count mismatch for %s %s, %d != %d' %
                                         (bill_id, action, count, len(results['no'])))

                        vote.set_count('yes', len(results['yes']))
                        vote.set_count('no', len(results['no']))
                        vote.set_count('other', len(results['other']))

                        for name in results['yes']:
                            vote.yes(name)
                        for name in results['no']:
                            vote.no(name)
                        for name in results['other']:
                            vote.vote('other', name)

                        vote.add_source(vote_url)
                        yield vote
                else:
                    self.warning("missing journal link for %s %s" %
                                 (bill_id, journal))

        # versions
        for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                name, url = parsed
                self._add_version_link(bill, name, url)

        # documents
        for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)
        for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
            document = self.parse_doc_row(row)
            if document:
                name, url = document
                bill.add_document_link(name, url)

        yield bill

    def _add_version_link(self, bill, name, url):
        """Attach a version link to *bill*, inferring the media type from *url*.

        The suffix check is case-insensitive.  A URL that ends in neither
        ``.pdf`` nor ``.htm`` is still attached, but without an explicit
        media type, and a warning is logged; previously such a URL either
        raised ``UnboundLocalError`` or reused the preceding link's type.
        """
        lowered = url.lower()
        if lowered.endswith('.pdf'):
            bill.add_version_link(name, url, media_type='application/pdf')
        elif lowered.endswith('.htm'):
            bill.add_version_link(name, url, media_type='text/html')
        else:
            self.warning("unknown media type for version {}".format(url))
            bill.add_version_link(name, url)
Esempio n. 4
0
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        """Build a ``Bill`` from a bill status page.

        Fetches ``status_url``, derives the bill identifier from the first
        "billpdf" PDF link on the page, and fills in title, classification,
        actions, sponsor, extras, subjects and fiscal notes.  ``bill_url`` is
        accepted but not used here.

        Returns a ``(bill, votes)`` tuple where ``votes`` is the result of
        ``self.add_votes`` materialised as a list.
        """
        status_page = lxml.html.fromstring(self.get(status_url).text)
        # see 2007 HB 2... weird.
        bill_re = r'.*?/([A-Z]+)0*(\d+)\.pdf'
        bill_xpath = '//a[contains(@href, ".pdf") and contains(@href, "billpdf")]/@href'
        bill_id = re.search(bill_re,
                            status_page.xpath(bill_xpath)[0],
                            re.IGNORECASE).groups()
        bill_id = "{0} {1}".format(bill_id[0], int(bill_id[1]))

        # Prefer the "Short Title:" cell; fall back to the second cell of
        # the first table row when it is missing.
        try:
            xp = '//b[text()="Short Title:"]/../following-sibling::td/text()'
            title = status_page.xpath(xp).pop()
        except IndexError:
            title = status_page.xpath('//tr[1]/td[2]')[0].text_content()

        # Add bill type.  The checks are ordered, so the first matching
        # letter wins.
        # NOTE(review): an identifier containing none of these letters leaves
        # ``classification`` unassigned - confirm that cannot occur.
        _bill_id = bill_id.lower()
        if 'b' in _bill_id:
            classification = 'bill'
        elif 'j' in _bill_id:
            classification = 'joint resolution'
        elif 'cr' in _bill_id:
            classification = 'concurrent resolution'
        elif 'r' in _bill_id:
            classification = 'resolution'

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=classification)

        self.add_actions(bill, status_page)
        votes = self.add_votes(bill, status_page, status_url)

        tabledata = self._get_tabledata(status_page)

        # Add sponsor info.
        bill.add_sponsorship(tabledata['primary sponsor:'][0],
                             classification='primary',
                             entity_type='person',
                             primary=True)

        # Various extra fields MT provides.  A tuple maps a table key to an
        # explicit extras key; a plain string is used for both, with spaces
        # replaced by underscores in the extras key.
        plus_fields = [
            'requester', ('chapter number:', 'chapter'), 'transmittal date:',
            'drafter', 'fiscal note probable:', 'bill draft number:',
            'preintroduction required:', 'by request of', 'category:'
        ]

        for x in plus_fields:
            if isinstance(x, tuple):
                _key, key = x
            else:
                _key = x
                key = x.replace(' ', '_')

            try:
                val = tabledata[_key]
            except KeyError:
                continue

            # unwrap single-item values
            if len(val) == 1:
                val = val[0]

            bill.extras[key] = val

        # Add bill subjects: the first cell of every row of the table whose
        # header mentions "Revenue/Approp.".  Rows with no <td> cells are
        # skipped.
        xp = '//th[contains(., "Revenue/Approp.")]/ancestor::table/tr'
        for tr in status_page.xpath(xp):
            cells = tr.xpath('td')
            if not cells:
                continue
            bill.add_subject(cells[0].text_content())

        self.add_fiscal_notes(status_page, bill)

        return bill, list(votes)
Esempio n. 5
0
    def parse_bill(self, chamber, session, bill_id, url):
        """Yield a ``Bill`` scraped from the bill page at *url*.

        Nothing is yielded when the page's "Last Action" text contains
        "WITHDRAWN" or when ``parse_bill_field`` returns ``None`` for the
        "Bill Documents" field.
        """
        page = self.lxmlize(url)

        last_action = self.parse_bill_field(page,
                                            'Last Action').xpath('text()')[0]
        if 'WITHDRAWN' in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

        version = self.parse_bill_field(page, 'Bill Documents')
        # This check has to come before the field is dereferenced;
        # otherwise a missing field raises AttributeError instead.
        if version is None:
            # Bill withdrawn
            self.logger.warning('Bill withdrawn.')
            return

        source_url = version.xpath('a[1]/@href')[0]
        version_title = version.xpath('a[1]/text()')[0].strip()

        title = self.parse_bill_field(page, 'Title').text_content()

        # Order matters: 'CR' and 'JR' must be tested before the plain 'R'.
        if 'CR' in bill_id:
            bill_type = 'concurrent resolution'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version_link(version_title, source_url,
                              **self._media_type_kwargs(source_url))

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            note_url = fiscal_note.attrib['href']
            bill.add_document_link("Fiscal Note",
                                   note_url,
                                   **self._media_type_kwargs(note_url))

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(link.text.strip(),
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        bdr_no = self.parse_bill_field(page, 'Bill Request Number')
        if bdr_no.xpath('text()'):
            bdr = bdr_no.xpath('text()')[0].strip()
            bill.extras["BDR"] = bdr

        yield bill

    def _media_type_kwargs(self, link_url):
        """Return the ``media_type`` keyword argument implied by *link_url*.

        ``.doc`` and ``.pdf`` suffixes map to their media types.  For any
        other suffix a warning is logged and an empty dict is returned, so
        the link is added without an explicit media type rather than with
        an unbound or stale one.
        """
        if link_url.endswith('.doc'):
            return {'media_type': 'application/msword'}
        if link_url.endswith('.pdf'):
            return {'media_type': 'application/pdf'}
        self.logger.warning(
            'Unknown media type for {}'.format(link_url))
        return {}
Esempio n. 6
0
    def old_scrape(self, session=None):
        """Yield bills (and their votes) built from the Excel status reports.

        Every "Excel" link found in the status-report table for *session*
        (``self.latest_session()`` when omitted) is downloaded and its first
        sheet is read row by row: column 0 is the bill id, 1 the primary
        sponsor, 2 the cosponsors, 3 the title, and the columns from 4 up to
        (but not including) the last one hold action dates under action-name
        headers.  For each row, whatever ``self.scrape_votes_old`` yields is
        yielded first, then the bill.
        """
        status_report_url = "https://www.legislature.ohio.gov/legislation/status-reports"

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        doc = self.get(status_report_url).text
        doc = lxml.html.fromstring(doc)
        doc.make_links_absolute(status_report_url)
        xpath = "//div[contains(text(),'{}')]/following-sibling::table"
        status_table = doc.xpath(xpath.format(session))[0]
        status_links = status_table.xpath(
            ".//a[contains(text(),'Excel')]/@href")

        for url in status_links:

            try:
                fname, resp = self.urlretrieve(url)
            except scrapelib.HTTPError as report:
                self.logger.warning("Missing report {}".format(report))
                continue

            # Once the workbook has been opened the temp file is no longer
            # needed; remove it even when opening fails so it cannot leak.
            try:
                sh = xlrd.open_workbook(fname).sheet_by_index(0)
            finally:
                os.remove(fname)

            for rownum in range(1, sh.nrows):
                bill_id = sh.cell(rownum, 0).value

                bill_type = "resolution" if "R" in bill_id else "bill"
                chamber = "lower" if "H" in bill_id else "upper"

                bill_title = str(sh.cell(rownum, 3).value)

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=bill_title,
                            classification=bill_type)
                bill.add_source(url)
                bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

                # add cosponsor
                if sh.cell(rownum, 2).value:
                    bill.add_sponsor('cosponsor',
                                     str(sh.cell(rownum, 2).value))

                # The actor carries over from column to column until a
                # header names a different one.
                actor = ""

                # Actions start column after bill title
                for colnum in range(4, sh.ncols - 1):
                    action = str(sh.cell(0, colnum).value)
                    cell = sh.cell(rownum, colnum)
                    date = cell.value

                    if action:
                        words = action.split()
                        if words[0] == 'House':
                            actor = "lower"
                        elif words[0] == 'Senate':
                            actor = "upper"
                        elif words[-1] == 'Governor':
                            actor = "executive"
                        elif words[0] == 'Gov.':
                            actor = "executive"
                        elif words[-1] == 'Gov.':
                            actor = "executive"

                    if action in ('House Intro. Date', 'Senate Intro. Date'):
                        atype = ['bill:introduced']
                        action = action.replace('Intro. Date', 'Introduced')
                    elif action == '3rd Consideration':
                        atype = ['bill:reading:3', 'bill:passed']
                    elif action == 'Sent to Gov.':
                        atype = ['governor:received']
                    elif action == 'Signed By Governor':
                        atype = ['governor:signed']
                    else:
                        atype = ['other']

                    # Only cells holding a spreadsheet date (a float) record
                    # an action.
                    if isinstance(date, float):
                        date = datetime.datetime(
                            *xlrd.xldate_as_tuple(date, 0))
                        date = self._tz.localize(date)
                        date = "{:%Y-%m-%d}".format(date)
                        bill.add_action(actor, action, date, type=atype)

                # Insert an underscore before the first digit of the id.
                first_digit = next(
                    (idx for idx, char in enumerate(bill_id)
                     if char.isdecimal()),
                    None)
                if first_digit is None:
                    # Previously this either raised UnboundLocalError or
                    # silently reused the previous row's identifier.
                    self.logger.warning(
                        "No number in bill id {!r}; skipping votes and "
                        "versions".format(bill_id))
                    yield bill
                    continue
                underscore_bill = (
                    bill_id[:first_digit] + "_" + bill_id[first_digit:])

                yield from self.scrape_votes_old(bill, underscore_bill,
                                                 session)
                self.scrape_versions_old(bill, underscore_bill, session)
                yield bill
    def scrape_assem_bills(self, chamber, insert, session, year):
        """Yield votes and bills for every document type in the history lists.

        For each document type id, the ``HistListBills.cfm`` listing under
        the ``insert`` session path is read via ``self.scrape_links`` and each
        linked page is turned into a ``Bill`` with subjects, version links,
        sponsors, minutes documents and actions.  Whatever
        ``self.scrape_votes`` yields for a page is yielded before its bill.
        """
        site_root = "http://www.leg.state.nv.us"
        doc_types = (
            (1, 'bill'),
            (3, 'resolution'),
            (5, 'concurrent resolution'),
            (6, 'joint resolution'),
            (9, 'petition'),
        )

        for docnum, bill_type in doc_types:
            listing_url = ('http://www.leg.state.nv.us/Session/%s/'
                           'Reports/HistListBills.cfm?DoctypeID=%s'
                           % (insert, docnum))

            for link in self.scrape_links(listing_url):
                page_path = ('http://www.leg.state.nv.us/Session/%s/Reports/%s'
                             % (insert, link))
                # non-breaking spaces are normalised before parsing
                page = self.get(page_path).text.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath(
                    'string(/html/body/div[@id="content"]'
                    '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=bill_type)
                bill.subject = list(set(self.subject_mapping[bill_id]))

                # version links live in the element following the parent of
                # the "Bill Text" label
                text_label = root.xpath("//b[text()='Bill Text']")[0]
                for anchor in text_label.getparent().getnext().xpath("./a"):
                    bill.add_version_link(note=anchor.text.strip(),
                                          url=anchor.attrib['href'],
                                          media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)
                sponsor_groups = (
                    (primary, 'primary', True),
                    (secondary, 'cosponsor', False),
                )
                for names, role, is_primary in sponsor_groups:
                    for leg in names:
                        bill.add_sponsorship(classification=role,
                                             name=leg, entity_type='person',
                                             primary=is_primary)

                # The n-th minutes link is paired with the date found in
                # table row n + 1 (rows are counted from 2).
                minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                for row_no, anchor in enumerate(minutes_links, start=2):
                    minutes_url = site_root + anchor.xpath("string(@href)")
                    date_path = "string(//table[4]/tr[%s]/td[2])" % row_no
                    date_parts = anchor.xpath(date_path).split()
                    note = (date_parts[0] + date_parts[1] + date_parts[2]
                            + " Minutes")
                    bill.add_document_link(note=note, url=minutes_url)

                self.scrape_actions(root, bill, "lower")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
    def scrape_matter(self, matter_link, sess):
        """Scrape a single matter page and yield the ``Bill`` built from it.

        Args:
            matter_link: URL of the matter page. It is loaded with
                ``self.lxmlize`` and attached to the bill as its source.
            sess: Mapping whose ``"identifier"`` value becomes the bill's
                legislative session.

        Yields:
            One ``Bill``. When the page has neither a "File Name" entry nor
            a non-blank "Title", a warning is logged and nothing is yielded.

        Raises:
            KeyError: If the page's "File Type" has no entry in
                ``matter_types`` or a required field is absent.
            ValueError: If "Introduced" is not in ``%m/%d/%Y`` form.
        """
        # Maps the site's "File Type" labels to bill classifications.
        # "other" means the Bill is created without an explicit classification.
        matter_types = {
            "Additions": "other",
            "Administrative Order": "order",
            "Annual Evaluation": "other",
            "Bid Advertisement": "other",
            "Bid Awards": "other",
            "Bid Contract": "contract",
            "Bid Protest": "other",
            "Bid Rejection": "other",
            "Birthday Scroll": "commemoration",
            "Certificate of Appreciation": "commemoration",
            "Change Order": "order",
            "Citizen's Presentation": "other",
            "Commendation": "commemoration",
            "Conflict Waiver": "other",
            "Congratulatory Certificate": "commemoration",
            "Deferrals": "other",
            "Discussion Item": "other",
            "Distinguished Visitor": "other",
            "Joint Meeting/Workshop": "other",
            "Mayoral Veto": "other",
            "Miscellaneous": "other",
            "Nomination": "nomination",
            "Oath of Office": "other",
            "Omnibus Reserve": "bill",
            "Ordinance": "ordinance",
            "Plaque": "commemoration",
            "Presentation": "other",
            "Proclamation": "proclamation",
            "Professional Service Agreement": "contract",
            "Public Hearing": "other",
            "Report": "other",
            "Request for Proposals": "other",
            "Request for Qualifications": "other",
            "Request to Advertise": "other",
            "Resolution": "resolution",
            "Resolution of Sympathy": "resolution",
            "Service Awards": "commemoration",
            "Special Item": "other",
            "Special Presentation": "other",
            "Supplement": "other",
            "Swearing-In": "other",
            "Time Sensitive Items": "other",
            "Withdrawals": "other",
            "Workshop Item": "other",
            "Zoning": "other",
            "Zoning Resolution": "resolution"
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)
        # NOTE(review): intro_date is parsed but not used below; the session
        # comes from ``sess``. Kept so malformed dates still fail loudly.
        intro_date = datetime.strptime(info_dict["Introduced"], "%m/%d/%Y")
        session = sess["identifier"]
        category = matter_types[info_dict["File Type"]]

        # Prefer "File Name"; fall back to a non-blank "Title".
        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return

        if category == 'other':
            bill = Bill(identifier=info_dict["File Number"],
                        legislative_session=session,
                        title=title)
        else:
            bill = Bill(identifier=info_dict["File Number"],
                        legislative_session=session,
                        title=title,
                        classification=category)

        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            # Split "<name>,<type>" on the last comma; an entry without a
            # comma is treated as a bare name with the generic type.
            try:
                name, spons_type = spons.rsplit(",", 1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = "Prime Sponsor" in spons_type
            # The entity type must be a string; the previous code referenced
            # an undefined name here and raised NameError.
            entity = "organization" if "committee" in name else "person"
            bill.add_sponsorship(name, spons_type, entity, primary)

        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                if subj.strip() and subj.strip() != "NONE":
                    bill.add_subject(subj.strip())

        # A non-blank "Title" is stored as an abstract; a non-blank "Note"
        # replaces the default note text.
        if "Title" in info_dict and info_dict["Title"].strip():
            note = "bill's long title'"
            if ("Note" in info_dict and info_dict["Note"].strip()):
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"], note=note)

        self.process_action_table(matter_doc, bill)
        bill.add_source(matter_link, note='web')

        yield bill
Esempio n. 9
0
    def scrape(self, window=30):
        """Yield recent Pittsburgh City Council bills and their vote events.

        ``window`` is the look-back period in days, counted from the current
        UTC time; the resulting cutoff is passed to ``self.matters``.
        Matters lacking an intro date, title, or file number are skipped.
        Vote events for a bill are yielded before the bill itself.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        self.retry_wait_seconds = 20

        for matter in self.matters(cutoff):
            matter_id = matter["MatterId"]

            intro_date = matter["MatterIntroDate"]
            title = matter["MatterTitle"]
            file_number = matter["MatterFile"]

            # Escape hatch: if a bill's duplicate action item breaks the whole
            # scrape, list its identifier here to skip it. Currently empty.
            problem_bills = []

            if file_number in problem_bills:
                continue

            if not (intro_date and title and file_number):
                continue

            bill_session = self.session(self.toTime(intro_date))

            # Unknown local types leave the classification unset.
            type_name = matter["MatterTypeName"]
            ocd_bill_type = BILL_TYPES[type_name] if type_name in BILL_TYPES else None

            # An "S"-prefixed file number is kept as an alternate identifier
            # and the bill itself is filed under the unprefixed form.
            alternate_identifiers = []
            if file_number.startswith("S"):
                alternate_identifiers.append(file_number)
                file_number = file_number[1:]

            bill = Bill(identifier=file_number,
                        legislative_session=bill_session,
                        title=title,
                        classification=ocd_bill_type,
                        from_organization={"name": "Pittsburgh City Council"})

            legistar_web = matter["legistar_url"]
            legistar_api = "http://webapi.legistar.com/v1/pittsburgh/matters/{0}".format(matter_id)
            bill.add_source(legistar_web, note="web")
            bill.add_source(legistar_api, note="api")

            for alternate in alternate_identifiers:
                bill.add_identifier(alternate)

            for action, vote_info in self.actions(matter_id):
                # "responsible person" is not an add_action argument, so it
                # is removed before the dict is splatted.
                responsible_person = action.pop("responsible person")
                act = bill.add_action(**action)

                if responsible_person:
                    act.add_related_entity(
                        responsible_person,
                        "person",
                        entity_id=_make_pseudo_id(name=responsible_person))

                if action["description"] == "Referred":
                    body_name = matter["MatterBodyName"]
                    if body_name != "City Council":
                        act.add_related_entity(
                            body_name,
                            "organization",
                            entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote_info
                if not result:
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action["description"],
                    organization=action["organization"],
                    classification=None,
                    start_date=action["date"],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + "/histories")

                for ballot in votes:
                    raw_option = ballot["VoteValueName"].lower()
                    # Fall back to the raw value when no mapping exists.
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    ballot["VotePersonName"].strip())

                yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic["MatterIndexName"].strip())

            for attachment in self.attachments(matter_id):
                attachment_name = attachment["MatterAttachmentName"]
                if attachment_name:
                    bill.add_version_link(attachment_name,
                                          attachment["MatterAttachmentHyperlink"],
                                          media_type="application/pdf")

            bill.extras = {"local_classification": matter["MatterTypeName"]}

            text = self.text(matter_id)
            if text:
                plain_text = text["MatterTextPlain"]
                if plain_text:
                    bill.extras["plain_text"] = plain_text

                rtf_text = text["MatterTextRtf"]
                if rtf_text:
                    # NUL characters are stripped from the RTF body.
                    bill.extras["rtf_text"] = rtf_text.replace(u"\u0000", "")

            yield bill
    def get_bill(self, matter):
        '''Build a Bill from a matter record, or return None to skip it.

        None is returned when the matter id is listed in DUPLICATED_ACTIONS,
        when the intro date, name or file number is empty, when fetching
        sponsorships or text raises KeyError (the web URL is then appended to
        self.version_errors), when attachment 103315 is encountered, or when
        a related matter cannot be fetched.
        '''
        matter_id = matter['MatterId']
        if matter_id in DUPLICATED_ACTIONS:
            return None

        intro_date = matter['MatterIntroDate']
        name = matter['MatterName']
        file_number = matter['MatterFile']

        if not (intro_date and name and file_number):
            return None

        leg_type = BILL_TYPES[matter['MatterTypeName']]

        # NOTE(review): this uses self.sessions while the relation loop below
        # uses self.session -- confirm both helpers exist and are intended.
        bill_session = self.sessions(self.toTime(intro_date))

        bill = Bill(identifier=file_number,
                    title=name,
                    classification=leg_type,
                    legislative_session=bill_session,
                    from_organization={"name": "New York City Council"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        # MatterName is the main title; MatterTitle, when present, is added
        # as an additional title.
        long_title = matter['MatterTitle']
        if long_title:
            bill.add_title(long_title)

        summary = matter['MatterEXText5']
        if summary:
            bill.add_abstract(summary, note='')

        try:
            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        for attachment in self.attachments(matter_id):

            if attachment['MatterAttachmentId'] == 103315:  # Duplicate
                return None

            attachment_name = attachment['MatterAttachmentName']
            if attachment_name:
                bill.add_document_link(attachment_name,
                                       attachment['MatterAttachmentHyperlink'],
                                       media_type='application/pdf')

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                related_matter = self.endpoint('/matters/{0}',
                                               relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                # An unreachable related matter drops the whole bill.
                return None

            related_session = self.session(
                self.toTime(related_matter['MatterIntroDate']))
            bill.add_related_bill(identifier=related_matter['MatterFile'],
                                  legislative_session=related_session,
                                  relation_type='companion')

        try:
            text = self.text(matter_id)
        except KeyError:
            self.version_errors.append(legistar_web)
            return None

        bill.extras['local_classification'] = matter['MatterTypeName']

        if text:
            # NUL characters are stripped from both text flavours.
            plain_text = text['MatterTextPlain']
            if plain_text:
                bill.extras['plain_text'] = plain_text.replace(u'\u0000', '')

            rtf_text = text['MatterTextRtf']
            if rtf_text:
                bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

        return bill
Esempio n. 11
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        """Scrape one Iowa bill and yield it as a ``Bill``.

        Args:
            chamber: Chamber passed through to the ``Bill`` constructor.
            session: Legislative session identifier for the ``Bill``.
            session_id: General-assembly number used in history, version and
                amendment URLs.
            bill_id: Bill identifier, e.g. ``"SF 139"``.
            url: Page whose ``billVersions`` selector lists the text versions.

        Yields:
            A single ``Bill``. Nothing is yielded when the history page
            answers with HTTP 500, when no title can be found, or when the
            action table reports "No history is recorded at this time.".
        """
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        hist_url = (f'https://www.legis.iowa.gov/legislation/billTracking/'
                    f'billHistory?billName={bill_id}&ga={session_id}')
        # req_session is only handed to scrape_subjects at the end; the
        # history page itself is fetched with a plain requests.get.
        req_session = requests.Session()
        req = requests.get(hist_url)
        if req.status_code == 500:
            self.warning("500 error on {}, skipping".format(hist_url))
            return

        page = lxml.html.fromstring(req.text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div/div[4]/div[2])').strip()

        if title == '':
            # Sometimes the title is moved, see
            # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
            title = page.xpath('string(//div[@id="content"]/div[@class='
                               '"divideVert"]/div[4]/div[2])').strip()
            if title == '':
                self.warning("URL: %s gives us an *EMPTY* bill. Aborting." %
                             url)
                return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if 'HR' in bill_id or 'SR' in bill_id:
            bill_type = ['resolution']
        elif 'HJR' in bill_id or 'SJR' in bill_id:
            bill_type = ['joint resolution']
        elif 'HCR' in bill_id or 'SCR' in bill_id:
            bill_type = ['concurrent resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = 'https://www.legis.iowa.gov/docs/'\
            'publications/LG{}/{}/attachments/{}.html'
        version_pdf_url_template = 'https://www.legis.iowa.gov/docs/'\
            'publications/LG{}/{}/{}.pdf'

        # Each <option> of the version selector yields an HTML and a PDF link.
        compact_bill_id = bill_id.replace(' ', '')
        for version in sidebar.xpath('//select[@id="billVersions"]/option'):
            version_name = version.text
            version_abbrev = version.xpath('string(@value)').upper()

            bill.add_version_link(
                note=version_name,
                url=version_html_url_template.format(
                    version_abbrev, session_id, compact_bill_id),
                media_type='text/html')

            bill.add_version_link(
                note=version_name,
                url=version_pdf_url_template.format(
                    version_abbrev, session_id, compact_bill_id),
                media_type='application/pdf')

        sponsors_str = page.xpath('string(//div[@id="content"]/div[@class='
                                  '"divideVert"]/div/div[4]/div[1])').strip()

        if re.search('^By ', sponsors_str):
            sponsors = re.split(',| and ', sponsors_str.split('By ')[1])
        # for some bills sponsors listed in different format
        else:
            sponsors = re.findall(r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)',
                                  sponsors_str)

        # a few sponsors get mangled by our regex
        sponsor_fixes = {
            'Means': 'Ways & Means',
            'Iowa': 'Economic Growth/Rebuild Iowa',
            'Safety': 'Public Safety',
            'Resources': 'Human Resources',
            'Affairs': 'Veterans Affairs',
            'Protection': 'Environmental Protection',
            'Government': 'State Government',
            'Boef': 'De Boef'
        }

        for sponsor in sponsors:
            sponsor = sponsor.replace(' and', '').strip(' .,')
            sponsor = sponsor_fixes.get(sponsor, sponsor)

            # Splitting on ',' and ' and ' leaves empty fragments for inputs
            # such as "A, and B"; indexing those would raise IndexError.
            if not sponsor:
                continue

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        # URLs already attached as versions, so an amendment referenced by
        # several history rows is linked only once. Only the loop below adds
        # further versions, and it records each one here.
        known_version_urls = {
            link['url']
            for version in bill.versions
            for link in version['links']
        }

        for tr in page.xpath(
                "//table[contains(@class, 'billActionTable')][1]/tbody/tr"):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[3])").strip()
            action = re.sub(r'\s+', ' ', action)

            # Capture any amendment links.
            if 'amendment' in action.lower():
                for anchor in tr.xpath('.//a[1]'):
                    # .text is None when the anchor has no leading text.
                    anchor_text = anchor.text or ''
                    if '-' in anchor_text:
                        # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                        amd_pattern = 'https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf'
                        amd_id = anchor_text.replace('-', '').strip()
                        amd_url = amd_pattern.format(session_id, amd_id)
                        amd_name = 'Amendment {}'.format(anchor_text.strip())

                        if amd_url not in known_version_urls:
                            bill.add_version_link(note=amd_name,
                                                  url=amd_url,
                                                  media_type='application/pdf')
                            known_version_urls.add(amd_url)
                        else:
                            self.info(
                                "Already Added {}, skipping".format(amd_url))

            # Journal / committee markers in the text identify the chamber.
            if 'S.J.' in action or 'SCS' in action:
                actor = 'upper'
            elif 'H.J.' in action or 'HCS' in action:
                actor = 'lower'
            else:
                actor = "legislature"

            # Drop the trailing journal page reference.
            action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()

            if action.startswith('Introduced'):
                atype = ['introduction']
                if ', referred to' in action:
                    atype.append('referral-committee')
            elif action.startswith('Read first time'):
                atype = 'reading-1'
            elif action.startswith('Referred to'):
                atype = 'referral-committee'
            elif action.startswith('Sent to Governor'):
                atype = 'executive-receipt'
            elif action.startswith('Reported Signed by Governor'):
                atype = 'executive-signature'
            elif action.startswith('Signed by Governor'):
                atype = 'executive-signature'
            elif action.startswith('Vetoed by Governor'):
                atype = 'executive-veto'
            elif action.startswith('Item veto'):
                atype = 'executive-veto-line-item'
            elif re.match(r'Passed (House|Senate)', action):
                atype = 'passage'
            elif re.match(r'Amendment (S|H)-\d+ filed', action):
                atype = ['amendment-introduction']
                if ', adopted' in action:
                    atype.append('amendment-passage')
            elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted',
                          action):
                atype = 'amendment-passage'
            # Amendment ids are prefixed S- or H-, as in the two patterns
            # above; the previous "(S|N)" never matched House amendments.
            elif re.match(r'Amendment (S|H)-\d+ lost', action):
                atype = 'amendment-failure'
            elif action.startswith('Resolution filed'):
                atype = 'introduction'
            elif action.startswith('Resolution adopted'):
                atype = 'passage'
            elif (action.startswith('Committee report')
                  and action.endswith('passage.')):
                atype = 'committee-passage'
            elif action.startswith('Withdrawn'):
                atype = 'withdrawal'
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search(r'END OF \d+ ACTIONS', action):
                continue

            if '$history' not in action:
                bill.add_action(description=action,
                                date=date,
                                chamber=actor,
                                classification=atype)

        self.scrape_subjects(bill, bill_id, session, req_session)

        yield bill
    def scrape(self, window=28):
        """Yield recent Board of Directors bills together with their vote events.

        ``window`` is the number of days to look back from the current UTC
        time; the resulting cutoff is passed to ``self.matters``. Matters
        without an intro date, title, or file number are skipped. Vote
        events for a bill are yielded before the bill itself.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(float(window))

        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            file_number = matter['MatterFile']

            if not (intro_date and title and file_number):
                continue

            bill_session = self.session(self.toTime(intro_date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            # An "S"-prefixed file number is kept as an alternate identifier
            # and the bill itself is filed under the unprefixed form.
            alternate_identifiers = []
            if file_number.startswith('S'):
                alternate_identifiers.append(file_number)
                file_number = file_number[1:]

            bill = Bill(identifier=file_number,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = matter['legistar_url']
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for alternate in alternate_identifiers:
                bill.add_identifier(alternate)

            for action, vote_info in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote_info
                if not result:
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for ballot in votes:
                    raw_option = ballot['VoteValueName'].lower()
                    # Fall back to the raw value when no mapping exists.
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option,
                                    ballot['VotePersonName'].strip())

                yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                # Fetch the related matter's JSON to learn its file number
                # and intro date (the latter determines its session). The
                # related matter sometimes does not exist yet; skip it then.
                try:
                    related_matter = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue

                related_session = self.session(
                    self.toTime(related_matter['MatterIntroDate']))
                # Relation types are limited to a few values:
                # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro only knows these as "related files", so they are
                # recorded as 'companion'.
                bill.add_related_bill(
                    identifier=related_matter['MatterFile'],
                    legislative_session=related_session,
                    relation_type='companion')

            board_report_url = (
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id))
            bill.add_version_link('Board Report',
                                  board_report_url,
                                  media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_document_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)
            if text:
                plain_text = text['MatterTextPlain']
                if plain_text:
                    bill.extras['plain_text'] = plain_text

                rtf_text = text['MatterTextRtf']
                if rtf_text:
                    # NUL characters are stripped from the RTF body.
                    bill.extras['rtf_text'] = rtf_text.replace(u'\u0000', '')

            yield bill
    def scrape(self, chamber, session):
        """Build bills for one chamber and session from the session's zip dump.

        The archive at ``zip_urls[session]`` is downloaded and four
        pipe-delimited members are read in turn: ``tbllsrs.txt`` (bill
        basics), ``tbllegislators.txt``, ``tbllsrsponsors.txt`` and
        ``tbldocket.txt`` (actions).  Bills are collected in ``self.bills``
        (keyed by LSR) and ``self.bills_by_id`` (keyed by bill id).

        Yields everything produced by ``self.scrape_votes`` first, then each
        collected bill with the zip URL added as its source.
        """
        zip_url = zip_urls[session]

        fname, resp = self.urlretrieve(zip_url)
        # NOTE(review): open(fname) uses the default (text) mode and the
        # handle is never closed explicitly; zipfile expects a binary file
        # object -- confirm this works on the Python version in use.
        self.zf = zipfile.ZipFile(open(fname))
        # The downloaded file is unlinked right away while self.zf still
        # refers to the open handle.
        os.remove(fname)

        # bill basics
        self.bills = {}  # LSR->Bill
        self.bills_by_id = {}  # need a second table to attach votes
        last_line = []
        # NOTE(review): members are read via ZipFile.open(); if that yields
        # bytes, the str-based split('|') calls below need a decode step --
        # confirm.
        for line in self.zf.open('tbllsrs.txt').readlines():
            line = line.split('|')
            if len(line) < 36:
                # Short row: either the continuation of the previous short
                # row, or the start of a row broken across lines.
                if len(last_line + line[1:]) == 36:
                    # combine two lines for processing
                    # (skip an empty entry at beginning of second line)
                    # NOTE(review): the length check drops line[0] but the
                    # join below keeps it -- only fields 0-10 are read
                    # afterwards; confirm the intended alignment.
                    line = last_line + line
                    self.warning('used bad line')
                else:
                    # skip this line, maybe we'll use it later
                    self.warning('bad line: %s' % '|'.join(line))
                    last_line = line
                    continue
            session_yr = line[0]
            lsr = line[1]
            title = line[2]
            body = line[3]
            expanded_bill_id = line[9]
            bill_id = line[10]

            # Keep only rows for the requested chamber and session.
            if body == body_code[chamber] and session_yr == session:
                if expanded_bill_id.startswith('CACR'):
                    bill_type = 'constitutional amendment'
                elif expanded_bill_id.startswith('PET'):
                    bill_type = 'petition'
                elif expanded_bill_id.startswith('AR') and bill_id.startswith(
                        'CACR'):
                    bill_type = 'constitutional amendment'
                else:
                    # Look up the first word of the expanded id, minus its
                    # first character, in bill_type_map.
                    bill_type = bill_type_map[expanded_bill_id.split(' ')[0]
                                              [1:]]

                # Drop a leading parenthesised prefix from the title.
                if title.startswith('('):
                    title = title.split(')', 1)[1].strip()

                self.bills[lsr] = Bill(legislative_session=session,
                                       chamber=chamber,
                                       identifier=bill_id,
                                       title=title,
                                       classification=bill_type)
                version_url = VERSION_URL % (session,
                                             expanded_bill_id.replace(' ', ''))
                self.bills[lsr].add_version_link(note='latest version',
                                                 url=version_url,
                                                 media_type='text/html')
                self.bills_by_id[bill_id] = self.bills[lsr]

        # load legislators
        self.legislators = {}
        for line in self.zf.open('tbllegislators.txt').readlines():
            line = line.split('|')
            employee_num = line[0]

            # first, last, middle
            if line[3]:
                name = '%s %s %s' % (line[2], line[3], line[1])
            else:
                name = '%s %s' % (line[2], line[1])

            self.legislators[employee_num] = {'name': name, 'seat': line[5]}
            # body = line[4]

        # sponsors
        for line in self.zf.open('tbllsrsponsors.txt').readlines():
            session_yr, lsr, seq, employee, primary = line.strip().split('|')

            if session_yr == session and lsr in self.bills:
                sp_type = 'primary' if primary == '1' else 'cosponsor'
                try:
                    self.bills[lsr].add_sponsorship(
                        classification=sp_type,
                        name=self.legislators[employee]['name'],
                        entity_type='person',
                        primary=True if sp_type == 'primary' else False)
                    # extras is replaced (not merged) for every sponsor, so
                    # only the last matched sponsor's seat is retained.
                    self.bills[lsr].extras = {
                        '_code': self.legislators[employee]['seat']
                    }
                except KeyError:
                    self.warning("Error, can't find person %s" % employee)

        # actions
        for line in self.zf.open('tbldocket.txt').readlines():
            # a few blank/irregular lines, irritating
            if '|' not in line:
                continue

            (session_yr, lsr, _, timestamp, bill_id, body, action,
             _) = line.split('|')

            if session_yr == session and lsr in self.bills:
                actor = 'lower' if body == 'H' else 'upper'
                # NOTE(review): %H is combined with %p here; only the date
                # part is used below, so the hour is not relied on.
                time = dt.datetime.strptime(timestamp, '%m/%d/%Y %H:%M:%S %p')
                action = action.strip()
                atype = classify_action(action)
                self.bills[lsr].add_action(chamber=actor,
                                           description=action,
                                           date=time.strftime("%Y-%m-%d"),
                                           classification=atype)
                # Actions that mention an amendment also get a document link.
                amendment_id = extract_amendment_id(action)
                if amendment_id:
                    self.bills[lsr].add_document_link(
                        note='amendment %s' % amendment_id,
                        url=AMENDMENT_URL % amendment_id)

        yield from self.scrape_votes(session, zip_url)

        # save all bills
        for bill in self.bills.values():
            bill.add_source(zip_url)
            yield bill
Esempio n. 14
0
    def scrape(self, window=3):
        """Yield vote events and bills for matters returned by ``self.matters``.

        Args:
            window: number of days to look back from the current UTC time;
                converted with ``float`` and handed to ``self.matters`` as a
                cutoff datetime.

        Yields:
            A ``VoteEvent`` for every action that carries a vote result,
            followed by the ``Bill`` the actions belong to.  Matters lacking
            an intro date, a title or a file number are skipped.
        """
        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
            days=float(window))

        for matter in self.matters(cutoff):
            matter_id = matter['MatterId']

            intro_date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            file_number = matter['MatterFile']

            # all three fields are required to build a bill
            if not (intro_date and title and file_number):
                continue

            session_name = self.session(self.toTime(intro_date))
            classification = BILL_TYPES[matter['MatterTypeName']]

            # File numbers with a leading 'S' are stored without it; the
            # original spelling is kept as an alternate identifier.
            alternates = []
            if file_number.startswith('S'):
                alternates.append(file_number)
                file_number = file_number[1:]

            bill = Bill(identifier=file_number,
                        legislative_session=session_name,
                        title=title,
                        classification=classification,
                        from_organization={"name": "Chicago City Council"})

            web_source = self.legislation_detail_url(matter_id)
            api_source = \
                'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(
                    matter_id)

            bill.add_source(web_source, note='web')
            bill.add_source(api_source, note='api')

            for alternate in alternates:
                bill.add_identifier(alternate)

            for action, outcome in self.actions(matter_id):
                # 'responsible person' is not an add_action() argument, so
                # it is removed before the dict is splatted.
                responsible = action.pop('responsible person')
                recorded = bill.add_action(**action)

                if responsible:
                    recorded.add_related_entity(
                        responsible,
                        'person',
                        entity_id=_make_pseudo_id(name=responsible))

                if action['description'] == 'Referred':
                    referred_to = matter['MatterBodyName']
                    if referred_to != 'City Council':
                        recorded.add_related_entity(
                            referred_to,
                            'organization',
                            entity_id=_make_pseudo_id(name=referred_to))

                result, ballots = outcome
                if not result:
                    # no vote attached to this action
                    continue

                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(web_source)
                vote_event.add_source(api_source + '/histories')

                for ballot in ballots:
                    raw_option = ballot['VoteValueName'].lower()
                    # unknown options are passed through unchanged
                    vote_event.vote(
                        self.VOTE_OPTIONS.get(raw_option, raw_option),
                        ballot['VotePersonName'].strip())

                yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id):
                attachment_name = attachment['MatterAttachmentName']
                if attachment_name:
                    bill.add_version_link(
                        attachment_name,
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            full_text = self.text(matter_id)
            if full_text:
                plain = full_text['MatterTextPlain']
                if plain:
                    bill.extras['plain_text'] = plain

                rtf = full_text['MatterTextRtf']
                if rtf:
                    # NUL characters are removed from the RTF payload
                    bill.extras['rtf_text'] = rtf.replace(u'\u0000', '')

            yield bill
Esempio n. 15
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one bill status page, yielding its roll-call votes and the bill.

        The page is requested under the first four characters of ``session``
        and, when that page reports the bill as missing, under the last four.

        Args:
            chamber: chamber value passed through to ``Bill`` and to every
                sponsorship.
            session: session name; also used to build journal URLs.
            bill_id: bill identifier whose first space-separated token,
                minus its leading character, is the key into ``bill_types``.

        Yields:
            A ``VoteEvent`` for each roll call that ``self.parse_roll_call``
            could parse, then the ``Bill``.  Nothing is yielded when neither
            page is available.
        """
        not_found_markers = (
            "Page Not Found",
            "The bill you are looking for is not available yet",
        )

        def media_type_for(link):
            # Map a link's extension to a media type.  Unknown extensions
            # give "" so that no value leaks over from a previous row (the
            # earlier code left the variable unbound or stale in that case).
            lowered = link.lower()
            if lowered.endswith(".pdf"):
                return "application/pdf"
            if lowered.endswith(".htm"):
                return "text/html"
            return ""

        # try the first year of the session biennium, then the second
        for year in (session[:4], session[-4:]):
            url = "http://legislature.mi.gov/doc.aspx?%s-%s" % (
                year,
                bill_id.replace(" ", "-"),
            )
            html = self.get(url).text
            if not any(marker in html for marker in not_found_markers):
                break
        else:
            self.warning(
                "Cannot open bill page for {}; skipping".format(bill_id))
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute("http://legislature.mi.gov")

        title = doc.xpath(
            '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

        # get B/R/JR/CR part and look up bill type
        bill_type = bill_types[bill_id.split(" ")[0][1:]]

        bill = Bill(bill_id,
                    session,
                    title,
                    chamber=chamber,
                    classification=bill_type)
        bill.add_source(url)

        # sponsors
        sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
        for sponsor in sponsors:
            sponsor_name = sponsor.text.replace(u"\xa0", " ")
            # sometimes district gets added as a link
            if sponsor_name.isnumeric():
                continue

            if len(sponsors) > 1:
                # with several links, the text after the link marks the
                # primary sponsor
                classification = ("primary"
                                  if sponsor.tail and "primary" in sponsor.tail
                                  else "cosponsor")
            else:
                classification = "primary"
            bill.add_sponsorship(
                name=sponsor_name.strip(),
                chamber=chamber,
                entity_type="person",
                primary=classification == "primary",
                classification=classification,
            )

        bill.subject = doc.xpath(
            '//span[@id="frg_billstatus_CategoryList"]/a/text()')

        # actions (skip header)
        for row in doc.xpath(
                '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
            tds = row.xpath("td")  # date, journal link, action
            journal = tds[1].text_content()
            action = tds[2].text_content()
            date = TIMEZONE.localize(
                datetime.datetime.strptime(tds[0].text_content(), "%m/%d/%Y"))
            # instead of trusting upper/lower case, use journal for actor
            actor = "upper" if "SJ" in journal else "lower"
            bill.add_action(action,
                            date,
                            chamber=actor,
                            classification=categorize_action(action))

            # check if action mentions a sub
            submatch = re.search(r"WITH SUBSTITUTE\s+([\w\-\d]+)", action,
                                 re.IGNORECASE)
            if submatch and tds[2].xpath("a"):
                version_url = tds[2].xpath("a/@href")[0]
                version_name = tds[2].xpath("a/text()")[0].strip()
                version_name = "Substitute {}".format(version_name)
                self.info("Found Substitute {}".format(version_url))
                bill.add_version_link(version_name,
                                      version_url,
                                      media_type=media_type_for(version_url))

            # check if action mentions a vote
            rcmatch = re.search(r"Roll Call # (\d+)", action, re.IGNORECASE)
            if not rcmatch:
                continue
            rc_num = rcmatch.group(1)

            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath("a/@href")
            if not journal_link:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))
                continue

            objectname = journal_link[0].rsplit("=", 1)[-1]
            chamber_name = {"upper": "Senate", "lower": "House"}[actor]
            vote_url = BASE_URL + "/documents/%s/Journal/%s/htm/%s.htm" % (
                session,
                chamber_name,
                objectname,
            )
            results = self.parse_roll_call(vote_url, rc_num, session)
            if results is None:
                continue

            vote_passed = len(results["yes"]) > len(results["no"])
            vote = VoteEvent(
                start_date=date,
                chamber=actor,
                bill=bill,
                motion_text=action,
                result="pass" if vote_passed else "fail",
                classification="passage",
            )

            # check the expected counts vs actual
            for label, option in (("YEAS", "yes"), ("NAYS", "no")):
                stated = re.search(r"%s (\d+)" % label, action,
                                   re.IGNORECASE)
                stated = int(stated.group(1)) if stated else 0
                if stated != len(results[option]):
                    self.warning(
                        "vote count mismatch for %s %s, %d != %d" %
                        (bill_id, action, stated, len(results[option])))

            vote.set_count("yes", len(results["yes"]))
            vote.set_count("no", len(results["no"]))
            vote.set_count("other", len(results["other"]))

            for option in ("yes", "no", "other"):
                for entry in results[option]:
                    if session == "2017-2018":
                        # An entry for this session may hold several
                        # tab-separated names; record each piece.  (The
                        # earlier code recorded the whole unsplit entry once
                        # per piece.)
                        for voter in entry.split("\t"):
                            voter = voter.strip()
                            if voter:
                                vote.vote(option, voter)
                    elif len(entry.split()) < 5:
                        # Prevents voter names like "House Bill No. 4451,
                        # entitled" and other sentences
                        vote.vote(option, entry.strip())
            vote.add_source(vote_url)
            yield vote

        # versions
        for row in doc.xpath(
                '//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
            parsed = self.parse_doc_row(row)
            if parsed:
                version_name, version_url = parsed
                bill.add_version_link(version_name,
                                      version_url,
                                      media_type=media_type_for(version_url))

        # documents: both tables are read with the same row parser
        for table_id in ("frg_billstatus_HlaTable", "frg_billstatus_SfaTable"):
            for row in doc.xpath('//table[@id="%s"]/tr' % table_id):
                document = self.parse_doc_row(row)
                if document:
                    document_name, document_url = document
                    bill.add_document_link(document_name, document_url)

        yield bill
Esempio n. 16
0
    def scrape(self, session=None):
        """Yield every bill and resolution of a session plus its roll calls.

        Bill listings, actions and roll calls are fetched as JSON from
        legislature.vermont.gov; sponsors and text versions are read from the
        bill's HTML status page.

        Args:
            session: session name; the characters from index 5 onward form
                the year slug used in every URL.  Defaults to
                ``self.latest_session()``.

        Yields:
            ``VoteEvent`` objects for each roll call of a bill, followed by
            the ``Bill``.  A bill whose status page exposes no internal id is
            yielded without actions or votes.

        Raises:
            AssertionError: on an unknown bill-number prefix, an amendment
                with no chamber, an action with an unknown chamber code, a
                governor signature seen before both chambers passed the
                bill, or a roll call whose result is neither passed nor
                failed.
        """
        html_tags = re.compile(r'<.*?>')

        # Ordered most-specific first: 'H.C.R.' and 'H.R.' must be tested
        # before the plain 'H.' prefix.  A chamber of None means it has to be
        # read from the record's 'Body' field.
        prefix_table = (
            ('J.R.H.', 'joint resolution', 'lower'),
            ('J.R.S.', 'joint resolution', 'upper'),
            ('H.C.R.', 'concurrent resolution', 'lower'),
            ('S.C.R.', 'concurrent resolution', 'upper'),
            ('H.R.', 'resolution', 'lower'),
            ('S.R.', 'resolution', 'upper'),
            ('PR.', 'constitutional amendment', None),
            ('H.', 'bill', 'lower'),
            ('S.', 'bill', 'upper'),
        )
        chamber_by_code = {'H': 'lower', 'S': 'upper'}

        def stripped(record):
            # Strip whitespace from string values; other values (None,
            # numbers) are passed through instead of raising.
            return {key: value.strip() if isinstance(value, str) else value
                    for key, value in record.items()}

        def iso_date(value):
            # 'MM/DD/YYYY' -> 'YYYY-MM-DD'
            return datetime.datetime.strptime(
                value, '%m/%d/%Y').strftime('%Y-%m-%d')

        def marks_passage(keywords):
            # True when any ';'-separated keyword starts with 'aspassed'.
            # Surrounding whitespace is ignored for both chambers (the
            # earlier code tolerated a leading space for the Senate only).
            return any(keyword.strip().lower().startswith('aspassed')
                       for keyword in keywords.split(';'))

        if session is None:
            session = self.latest_session()
        year_slug = session[5:]

        # Load all bills and resolutions via the private API
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
            format(year_slug)
        bills = json.loads(self.get(bills_url).text)['data'] or []

        # NOTE(review): bills_url is rebound here, so bills from the
        # 'released' listing are also sourced to the 'introduced' URL below.
        # Kept as-is to leave the emitted sources unchanged.
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
            format(year_slug)
        bills.extend(json.loads(self.get(bills_url).text)['data'] or [])

        resolutions_url = \
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
            format(year_slug)
        bills.extend(
            json.loads(self.get(resolutions_url).text)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            info = stripped(info)
            bill_number = info['BillNumber']

            # Identify the bill type and chamber
            for prefix, bill_type, bill_chamber in prefix_table:
                if bill_number.startswith(prefix):
                    break
            else:
                raise AssertionError(
                    "Unknown bill type found: '{}'".format(bill_number))

            if bill_chamber is None:
                bill_chamber = chamber_by_code.get(info['Body'])
                if bill_chamber is None:
                    raise AssertionError("Amendment not tied to chamber")

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_number,
                legislative_session=session,
                chamber=bill_chamber,
                title=info['Title'],
                classification=bill_type
            )
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, bill_number)
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors; entries after the 'Additional Sponsors'
            # marker are cosponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li'
            )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                # Skip five-character entries starting with "Less" --
                # presumably the "Less" + ellipsis collapse link.  The
                # earlier test compared a five-character slice with the
                # four-character "Less" and therefore never matched.
                is_less_link = (len(sponsor_name) == 5
                                and sponsor_name.startswith("Less"))
                if sponsor_name and not is_less_link:
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type='person',
                        primary=(sponsor_type == 'primary')
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a'
            )

            for version in versions:
                version_text = version.xpath('text()')
                if version_text:
                    bill.add_version_link(
                        note=version_text[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf'
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            id_match = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            )
            if id_match is None:
                self.warning(
                    "Bill {} appears to have no activity".format(bill_number))
                yield bill
                continue
            internal_bill_id = id_match.group(1)

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions = json.loads(self.get(actions_url).text)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = stripped(action)
                signed = "Signed by Governor" in action['FullStatus']

                if signed:
                    actor = 'executive'
                elif action['ChamberCode'] in chamber_by_code:
                    actor = chamber_by_code[action['ChamberCode']]
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if signed:
                    # explicit raise instead of `assert`, which is removed
                    # when Python runs with -O
                    if chambers_passed != {"H", "S"}:
                        raise AssertionError(
                            "Governor signature found before passage by "
                            "both chambers")
                    action_type = 'executive-signature'
                elif marks_passage(action['keywords']):
                    action_type = 'passage'
                    chambers_passed.add(action['ChamberCode'])
                else:
                    action_type = None

                bill.add_action(
                    description=html_tags.sub("", action['FullStatus']),
                    date=iso_date(action['StatusDate']),
                    chamber=actor,
                    classification=action_type
                )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes = json.loads(self.get(votes_url).text)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, vote['VoteHeaderID']))
                roll_call = json.loads(self.get(roll_call_url).text)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    # Keep the text before the first " of "; partition()
                    # does not raise when the separator is missing or
                    # repeated.
                    member_name = \
                        member['MemberName'].partition(" of ")[0].strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if "Passed -- " in vote['FullStatus']:
                    did_pass = True
                elif "Failed -- " in vote['FullStatus']:
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Yea/nay totals are taken from the status text rather than
                # from the member lists
                yea_count = int(
                    re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = int(
                    re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = VoteEvent(
                    chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=iso_date(vote['StatusDate']),
                    motion_text=html_tags.sub("", vote['FullStatus']).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                    bill=bill_number,
                    bill_chamber=bill_chamber
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member_name in roll_call_yea:
                    vote_to_add.yes(member_name)
                for member_name in roll_call_nay:
                    vote_to_add.no(member_name)
                for member_name in roll_call_not_voting:
                    vote_to_add.vote('not voting', member_name)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Esempio n. 17
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {
                "Senate": "upper",
                "House": "lower",
                "House of Representatives": "lower",
                "house": "lower",
                "senate": "upper"
            }

            # so presumably not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {
                "approved": True,
                "passed": True,
                "adopted": True,
                "true": True,
                "false": False,
                "failed": False,
                True: True,
                False: False
            }

            action_dict = {
                "ref_ctte_100": "referral-committee",
                "intro_100": "introduction",
                "intro_101": "introduction",
                "pass_300": "passage",
                "intro_110": "reading-1",
                "refer_210": "referral-committee",
                "crpt_301": None,
                "crpt_317": None,
                "concur_606": "passage",
                "pass_301": "passage",
                "refer_220": "referral-committee",
                "intro_102": ["introduction", "passage"],
                "intro_105": ["introduction", "passage"],
                "intro_ref_ctte_100": "referral-committee",
                "refer_209": None,
                "intro_108": ["introduction", "passage"],
                "intro_103": ["introduction", "passage"],
                "msg_reso_503": "passage",
                "intro_107": ["introduction", "passage"],
                "imm_consid_360": "passage",
                "refer_213": None,
                "adopt_reso_100": "passage",
                "adopt_reso_110": "passage",
                "msg_507": "amendment-passage",
                "confer_713": None,
                "concur_603": None,
                "confer_712": None,
                "msg_506": "amendment-failure",
                "receive_message_100": "passage",
                "motion_920": None,
                "concur_611": None,
                "confer_735": None,
                "third_429": None,
                "final_501": None,
                "concur_608": None,
            }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(
                session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(
                first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url,
                                                     "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url,
                                                      "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url,
                                                      "analysiss")

            for row in self.get_bill_rows(session):
                number_link, _ga, title, primary_sponsor, status = row.xpath(
                    'td')

                bill_id = number_link.text_content()
                title = title.text_content().strip()
                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = (
                    'http://search-prod.lis.state.oh.us/solarapi/v1/'
                    'general_assembly_{}/{}/{}/'.format(
                        session, 'bills' if 'B' in bill_id else 'resolutions',
                        bill_id.lower().replace(' ', '')))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url + version["pdfDownloadLink"]
                    bill.add_version_link(version_name,
                                          version_link,
                                          media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(sponsor_name,
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification='cosponsor',
                        entity_type='person',
                        primary=False,
                    )

                try:
                    action_doc = self.get(base_url +
                                          bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning(
                                "Unknown action {desc} with code {code}."
                                " Add it to the action_dict"
                                ".".format(desc=action_desc,
                                           code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(
                            datetime.datetime.strptime(action["datetime"],
                                                       "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date,
                                        chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill,
                                  base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill,
                                  base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill,
                                  base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill,
                                  base_url)

                # votes
                vote_url = base_url + bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url, base_url, bill,
                                             legislators, chamber_dict,
                                             vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url, base_url, bill,
                                             legislators, chamber_dict,
                                             vote_results)

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(
                        data["items"][0]["effective_date"], "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(effective_action,
                                    effective_date,
                                    chamber="executive",
                                    classification=["became-law"])

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url + bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url + bill_version["disapprove"][0][
                        "link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError(
                            "Whoa, a disapprove! We've never"
                            " gotten one before."
                            " Go write some code to deal "
                            "with it: {}".format(disapprove_url))

                yield bill
    def _scrape_bill(self, session, bill_data):
        """Build a ``Bill`` from one bill record and yield it with its votes.

        Generator. For upper-chamber bills it first yields the result of
        ``_parse_senate_votes`` for every vote item; it always finishes by
        yielding the populated ``Bill``.

        :param session: session value handed to ``Bill`` and used as
            ``session_year`` when formatting the API source URL.
        :param bill_data: mapping for one bill; this method reads its
            ``summary``, ``amendments``, ``sponsor``, ``actions`` and
            ``votes`` entries.
        """
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id, title,
         (_prefix, _number, active_version)) = details

        summary = bill_data['summary']
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title or summary,
            classification=bill_type,
        )

        if summary:
            bill.add_abstract(summary, note='')

        amendments = bill_data['amendments']['items']
        bill_active_version = amendments[active_version]

        # Parse sponsors.
        sponsor = bill_data['sponsor']
        if sponsor is not None:
            if sponsor['rules'] is True:
                bill.add_sponsorship(
                    'Rules Committee',
                    entity_type='organization',
                    classification='primary',
                    primary=True,
                )
            elif not sponsor['budget']:
                bill.add_sponsorship(
                    sponsor['member']['shortName'],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

                # There *shouldn't* be cosponsors if there is no sponsor.
                for cosponsor in bill_active_version['coSponsors']['items']:
                    bill.add_sponsorship(
                        cosponsor['shortName'],
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )

        # List companion bill. "sameAs" may be absent (or empty), so every
        # lookup is tolerant instead of indexing the fallback dict directly.
        same_as_items = (bill_active_version.get('sameAs') or {}).get('items')
        if same_as_items:
            companion = same_as_items[0]

            # The companion's session is rendered as "<start>-<start + 1>".
            start_year = companion['session']
            companion_bill_session = '-'.join(
                [str(start_year), str(start_year + 1)])

            bill.add_related_bill(
                companion['basePrintNo'],
                companion_bill_session,
                relation_type='companion',
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            # Round-trip through strptime so a malformed date fails here.
            action_date = datetime.datetime.strptime(
                action['date'], '%Y-%m-%d').date()
            types, _ = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                action['text'],
                action_date.strftime('%Y-%m-%d'),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources[
            'bill'].format(
                session_year=session, bill_id=bill_id, summary='', detail='')
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                yield self._parse_senate_votes(vote_data, bill, api_url)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        for amendment in amendments.values():
            version = amendment['printNo']

            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}'.format(bill_id, self.term_start_year)
            bill.add_version_link(
                version + ' HTML',
                html_url,
                on_duplicate='ignore',
                media_type='text/html',
            )

            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version_link(
                version + ' PDF',
                pdf_url,
                on_duplicate='ignore',
                media_type='application/pdf',
            )

        yield bill
    def scrape_bill(self, chamber, session, bill_id, title, url):
        """Scrape one bill page and yield its vote events and the bill.

        Generator. Whatever ``scrape_vote`` yields for each roll-call link
        in the action table is yielded first; the ``Bill`` comes last.

        :param chamber: chamber passed to ``Bill`` and used as the initial
            actor for actions.
        :param session: legislative session passed to ``Bill``.
        :param bill_id: bill identifier; its prefix selects the
            classification.
        :param title: bill title passed to ``Bill``.
        :param url: bill page URL; fetched with ``lxmlize`` and recorded as
            a source.
        """
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype
                    )
        bill.add_source(url)

        # Raw string: "\?" and "\." are regex escapes, not string escapes.
        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(
                link.xpath('string()').strip(),
                link.attrib['href'],
                media_type='text/html',
                on_duplicate='ignore'
            )

        sponsor_links = page.xpath(
            "//td[contains(@id, 'tdSponsors')]/a")
        for link in sponsor_links:
            bill.add_sponsorship(
                link.text,
                classification='primary',
                primary=True,
                entity_type='person'
            )

        outcome_types = {'passed': 'passage', 'failed': 'failure'}

        actor = chamber
        use_row = False
        self.debug(bill_id)
        for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):

            # Skip everything up to and including the header row.
            row_text = row.text_content()
            if 'Date' in row_text and 'Action' in row_text:
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')
            elif action.startswith('Signed by Governor'):
                atypes.append('executive-signature')
                actor = 'executive'

            do_pass = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                               action)
            if do_pass:
                # A chamber name means a floor vote; anything else is
                # treated as a committee.
                if do_pass.group(1) in ['Senate',
                                        'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                second = outcome_types[do_pass.group(3).lower()]
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            first_read = re.match(r"First read in (Senate|House)", action)
            if first_read:
                if first_read.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            if not re.match(r'\d{2}/\d{2}/\d{4}', date):
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Esempio n. 20
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape one bill page and yield its status items and the bill.

        Generator. If the page has a status table, whatever
        ``parse_status`` yields comes first; the ``Bill`` is yielded last.

        :param chamber: chamber passed to ``Bill`` and ``parse_status``.
        :param session: legislative session passed to ``Bill``.
        :param bill_id: identifier as shown on the page; ``SUB_BLACKLIST``
            flags are stripped from it before the ``Bill`` is created.
        :param url: bill page URL; fetched with ``lxmlize`` and recorded as
            a source.
        :raises ValueError: if ``bill_id`` matches none of the known
            bill-type patterns.
        """
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if ".B. " in bill_id:
            bill_type = "bill"
        elif bill_id.startswith("H.R. ") or bill_id.startswith("S.R. "):
            bill_type = "resolution"
        elif ".C.R. " in bill_id:
            bill_type = "concurrent resolution"
        elif ".J.R. " in bill_id:
            bill_type = "joint resolution"
        else:
            # Fail with a clear message instead of an unbound local later.
            raise ValueError(
                "Unrecognized bill type for {}".format(bill_id))

        for flag in SUB_BLACKLIST:
            bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)

        for info in page.xpath('//div[@id="billsponsordiv"]'):
            try:
                (label, name) = [
                    x.strip() for x in info.xpath(".//text()") if x.strip()
                ]
            except ValueError:
                self.warning(
                    "Could not find sponsor's name for {}".format(bill_id))
                continue
            assert label == "Bill Sponsor:"
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification="primary",
                                 entity_type="person",
                                 primary=True)

        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            assert floor_info[0] == "Floor Sponsor:"
            floor_sponsor = floor_info[1].replace("Sen. ",
                                                  "").replace("Rep. ", "")
            bill.add_sponsorship(
                floor_sponsor,
                classification="cosponsor",
                entity_type="person",
                primary=False,
            )
        else:
            raise AssertionError("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]')

        for version in versions:

            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            version_url = version.get("href")
            if not version_url:
                version_url = version.xpath(
                    "following-sibling::a[1]/@href")[0]

            bill.add_version_link(version.xpath("text()")[0].strip(),
                                  version_url,
                                  media_type="application/pdf")

        for related in page.xpath(
                '//b[text()="Related Documents "]/following-sibling::ul/li/'
                'a[contains(@class,"nlink")]'):
            href = related.xpath("@href")[0]
            if ".fn.pdf" in href:
                note = "Fiscal Note"
            else:
                note = related.xpath("text()")[0]
            bill.add_document_link(note,
                                   href,
                                   media_type="application/pdf")

        bill.subject = [
            link.text.strip()
            for link in page.xpath("//a[contains(@href, 'RelatedBill')]")
        ]

        status_tables = page.xpath('//div[@id="billStatus"]//table')
        if status_tables:
            yield from self.parse_status(bill, status_tables[0], chamber)

        yield bill
    def scrape_senate_bills(self, chamber, insert, session, year):
        """Scrape every bill listed for each document type and yield them.

        Generator. For each bill page it yields whatever ``scrape_votes``
        yields, then the ``Bill`` itself.

        :param chamber: chamber passed to ``Bill``.
        :param insert: path segment substituted into the session URLs; also
            passed to ``scrape_votes``.
        :param session: legislative session passed to ``Bill``.
        :param year: passed through to ``scrape_votes``.
        """
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                             'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            for link in self.scrape_links(parentpage_url):
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                # Non-breaking spaces are normalised before parsing.
                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=bill_type
                            )
                bill.subject = list(set(self.subject_mapping[bill_id]))

                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' in table.text_content():
                        bill_text = table.xpath("string(tr/td[2]/a/@href)")
                        text_url = "http://www.leg.state.nv.us" + bill_text
                        bill.add_version_link(note="Bill Text",
                                              url=text_url,
                                              media_type='application/pdf')

                # scrape_sponsors receives the page text, not the parsed tree.
                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(name=leg,
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(name=leg,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False)

                # The n-th minutes link is paired with table row n + 2.
                minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                for row_number, mr in enumerate(minutes_links, start=2):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % row_number
                    date_parts = mr.xpath(minutes_date_path).split()
                    minutes_date = (date_parts[0] + date_parts[1] +
                                    date_parts[2] + " Agenda")
                    bill.add_document_link(note=minutes_date,
                                           url=minutes_url)

                self.scrape_actions(root, bill, "upper")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Esempio n. 22
0
    def bill_info(self, bill_link, session, main_url):
        """Scrape one bill page; yield the bill, then its vote items.

        Generator. Returns without yielding anything when the page heading
        carries no title text after the bill number.

        :param bill_link: bill page URL; fetched with ``lxmlize`` and
            recorded as a source.
        :param session: session passed to ``Bill``; also substituted for
            ``Current`` in version URLs.
        :param main_url: listing URL recorded as a source.
        """
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page, '//div[@class="main-content"]//h2').text.split()

        # The first token is the bill number; the second token is skipped
        # and the remaining tokens form the title.
        bill_number = long_title[0]
        title = ' '.join(long_title[2:])

        if not title:
            self.error('no title, skipping %s', bill_number)
            return

        bill_type = 'resolution' if 'LR' in bill_number else 'bill'

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            '//body/div[3]/div[2]/div[2]/div/div[3]/div[1]/ul/li[1]/a[1]/text()'
        )

        if not introduced_by:
            # Fall back to the plain-text layout of the sponsor line.
            introduced_by = self.get_node(
                bill_page,
                '//body/div[3]/div[2]/div[2]/div/div[2]/div[1]/ul/li[1]/text()'
            )
            introduced_by = introduced_by.split('Introduced By:')[1].strip()

        introduced_by = introduced_by.strip()
        bill.add_sponsorship(
            name=introduced_by,
            entity_type='person',
            primary=True,
            classification='primary',
        )

        action_nodes = self.get_nodes(
            bill_page, '//div[@class="main-content"]/div[5]//table/tbody/tr')

        # Default so the scrape_votes call below still has an actor when the
        # page lists no actions at all.
        actor = 'legislature'
        for action_node in action_nodes:
            date = self.get_node(action_node, './td[1]').text
            date = datetime.strptime(date, '%b %d, %Y')

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(action_node, './td[2]').text_content()

            actor = 'executive' if 'Governor' in action else 'legislature'

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type,
            )

        # Were in reverse chronological order.
        bill.actions.reverse()

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page, '/html/body/div[3]/div[2]/div[2]/div/'
            'div[3]/div[2]/ul/li/a')

        for version_link in version_links:
            # replace Current w/ session number
            version_url = version_link.attrib['href'].replace(
                'Current', session)
            bill.add_version_link(version_link.text,
                                  version_url,
                                  media_type='application/pdf')

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

        for amendment_link in amendment_links:
            bill.add_document_link(amendment_link.text,
                                   amendment_link.attrib['href'])

        self.scrape_amendments(bill, bill_page)

        # Related transcripts.
        transcript_links = self.get_nodes(
            bill_page, '//div[@class="main-content"]/div[5]/div[2]/'
            'div[@class="hidden-xs"]/table/tr/td/a')

        for transcript_link in transcript_links:
            bill.add_document_link(transcript_link.text,
                                   transcript_link.attrib['href'])

        yield bill

        yield from self.scrape_votes(bill, bill_page, actor)
Esempio n. 23
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        Generator. Whatever ``parse_vote`` yields for action rows that
        mention AYES or NAYS comes first; the ``Bill`` is yielded last.

        :param chamber: chamber passed to ``Bill``; also the initial actor.
        :param session: session string; its first four characters are used
            as the year when parsing action dates.
        :param bill_id: bill identifier; spaces are removed to build the
            URL and to look up subjects.
        :param short_title: optional alternate title, added as a
            "short title" when it differs (case-insensitively) from the
            scraped title.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "http://legislature.idaho.gov/legislation/%s/" % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name,
                                      url=href,
                                      media_type="application/pdf")
            else:
                bill.add_document_link(note=name,
                                       url=href,
                                       media_type="application/pdf")

        def _split(string):
            # Split on the separators only (a comma, optionally followed by
            # "AND", or a standalone "AND") so that no name is consumed by
            # the pattern itself.
            return re.split(r"\s*,\s*(?:AND\s+)?|\s+AND\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split("by")
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            # Rows without a date reuse the most recent one seen.
            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + "/" + session[0:4],
                                              "%m/%d/%Y").strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(actor, date, row[2], session,
                                           bill_id, chamber, url)
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u"\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill
Esempio n. 24
0
    def _parse_senate_billpage(self, bill_url, year):
        """Scrape one senate bill page and yield the resulting ``Bill``.

        Cosponsors, actions and versions are delegated to the
        ``_parse_senate_*`` helpers when the page links to them.

        :param bill_url: bill page URL; fetched with ``lxmlize`` and
            recorded as a source.
        :param year: value passed to ``Bill`` as the legislative session.
        """
        bill_page = self.lxmlize(bill_url)

        def first_text(xpath):
            # Text content of the first node matching the expression.
            return bill_page.xpath(xpath)[0].text_content()

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = first_text('//*[@id="lblBillNum"]')
        bill_title = first_text('//*[@id="lblBillTitle"]')
        bill_desc = first_text('//*[@id="lblBriefDesc"]')

        # The first three characters select the type; default is a bill.
        prefix = bill_id[:3]
        bill_type = bill_types[prefix] if prefix in bill_types else "bill"

        bid = bill_id.replace(" ", "")
        subs = []
        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        bill = Bill(
            bill_id,
            title=bill_desc,
            chamber='upper',
            legislative_session=year,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_abstract(bill_desc, note='abstract')
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor
        sponsor_node = bill_page.xpath('//a[@id="hlSponsor"]')[0]
        bill.add_sponsorship(
            sponsor_node.text_content(),
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # cosponsors show up on their own page, if they exist
        cosponsor_links = bill_page.xpath('//a[@id="hlCoSponsors"]')
        if cosponsor_links and cosponsor_links[0].attrib.get('href'):
            self._parse_senate_cosponsors(
                bill, cosponsor_links[0].attrib['href'])

        # get the actions
        action_links = bill_page.xpath('//a[@id="hlAllActions"]')
        if action_links:
            self._parse_senate_actions(bill, action_links[0].attrib['href'])

        # stored on a separate page
        version_links = bill_page.xpath('//a[@id="hlFullBillText"]')
        if version_links and version_links[0].attrib.get('href'):
            self._parse_senate_bill_versions(
                bill, version_links[0].attrib['href'])

        yield bill
Esempio n. 25
0
    def parse_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page and yield its votes followed by the bill.

        Args:
            chamber: Chamber passed to ``Bill`` and forwarded to the action
                and vote parsers.
            session: Legislative session identifier passed to ``Bill``.
            bill_id: Bill identifier; also used to infer the classification
                and to look up subjects in ``self._subjects``.
            url: Address of the bill detail page.

        Yields:
            Whatever ``self.scrape_votes`` yields (only when the page has a
            "Votes" header), then the populated ``Bill``.

        Nothing is yielded when the page cannot be fetched, when the last
        action mentions a withdrawal, or when the "Bill Documents" field is
        missing.
        """
        try:
            page = self.lxmlize(url)
        except scrapelib.HTTPError as e:
            self.logger.warning(e)
            return

        last_action = self.parse_bill_field(page,
                                            "Last Action").xpath("text()")[0]
        if "WITHDRAWN" in last_action.upper():
            self.info("{} Withdrawn, skipping".format(bill_id))
            return

        version = self.parse_bill_field(page, "Bill Documents")
        # The missing-field check has to happen before the field is
        # dereferenced, otherwise a missing field raises AttributeError
        # instead of being skipped.
        if version is None:
            # Bill withdrawn
            self.logger.warning("Bill withdrawn.")
            return

        source_url = version.xpath("a[1]/@href")[0]
        version_title = version.xpath("a[1]/text()")[0].strip()

        if source_url.endswith(".doc"):
            mimetype = "application/msword"
        elif source_url.endswith(".pdf"):
            mimetype = "application/pdf"
        else:
            # Avoid an UnboundLocalError further down for unknown extensions.
            # NOTE(review): confirm an empty media_type is acceptable
            # downstream.
            self.logger.warning(
                "Unrecognized document type for {}".format(source_url))
            mimetype = ""

        title = self.parse_bill_field(page, "Title").text_content()

        # Order matters: "CR" and "JR" must be tested before the bare "R".
        if "CR" in bill_id:
            bill_type = "concurrent resolution"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"
        else:
            bill_type = "bill"

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version_link(version_title, source_url, media_type=mimetype)

        self.parse_actions(page, bill, chamber)
        self.parse_subjects(page, bill)

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            note_url = fiscal_note.attrib["href"]
            if note_url.endswith(".doc"):
                note_mimetype = "application/msword"
            else:
                # The XPath above only matches hrefs containing "/LM.pdf", so
                # PDF is the default rather than whatever type the version
                # link happened to have.
                note_mimetype = "application/pdf"

            bill.add_document_link("Fiscal Note",
                                   note_url,
                                   media_type=note_mimetype)

        for link in page.xpath(
                "//td/span/a[contains(@href, 'Legislator-Profile')]"):
            bill.add_sponsorship(
                link.text.strip(),
                classification="primary",
                entity_type="person",
                primary=True,
            )

        if page.xpath("//th[contains(text(),'Votes')]"):
            vote_url = page.xpath(
                "//a[contains(text(),'Vote History')]/@href")[0]
            yield from self.scrape_votes(vote_url, bill, chamber)

        # parse_bill_field can return None (see the documents check above),
        # so guard before querying the field.
        bdr_no = self.parse_bill_field(page, "Bill Request Number")
        bdr_text = bdr_no.xpath("text()") if bdr_no is not None else []
        if bdr_text:
            bill.extras["BDR"] = bdr_text[0].strip()

        yield bill
Esempio n. 26
0
    def _parse_house_bill(self, url, session):
        """Scrape one house bill summary page and yield the resulting objects.

        The incoming ``url`` is rewritten to the print/``BillContent.aspx``
        variant of the page, fetched with ``self.get`` and parsed with lxml.
        A ``Bill`` (chamber ``'lower'``) is built from the page and populated
        with its official title, primary sponsor, cosponsors, documents,
        versions, summaries and amendments.

        Args:
            url: Bill summary URL, relative to ``self._house_base_url``.
            session: Legislative session identifier passed to ``Bill``.

        Yields:
            Whatever ``self._parse_house_actions`` yields, followed by the
            populated ``Bill``.

        Nothing is yielded when the page has no ``entry-title`` element (the
        URL is then recorded in ``self._bad_urls``) or when the description
        is blank and the bill number is greater than 20.

        Raises:
            AssertionError: if the table rows are not labelled
                'LR Number:' / 'Bill String:' at the expected positions.
            ValueError: if the page does not contain exactly one cosponsors
                link or exactly one actions link (single-element unpacking).
        """
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace('Bill.aspx', 'BillContent.aspx')
        url = url.replace('&code=R', '&code=R&style=new')

        # Example of the rewrite performed above:
        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        # A page without the title element is treated as blank: log it,
        # remember the URL, and stop without yielding anything.
        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        # The metadata table is read positionally; optional rows shift the
        # rows that follow them, which the two offsets below account for.
        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        # Sanity check that the positional lookup landed on the expected row.
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        # bill_lr = lr_label_tag[1].text_content()

        # An optional 'Governor Action:' row pushes the title row down by one.
        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip() == 'Governor Action:':
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        # Classification is looked up by the first three characters of the
        # id; the numeric part starts at a different offset in each branch.
        bill_type = "bill"
        triplet = bill_id[:3]
        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[4:])
        else:
            bill_number = int(bill_id[3:])

        # Subjects are keyed by the bill id with spaces removed.
        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = 'Appropriations Bill'
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title
                ))
                return

        bill = Bill(
            bill_id,
            chamber='lower',
            title=bill_desc,
            legislative_session=session,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note='official')

        bill.add_source(url)

        # The primary sponsor is the second cell of the first table row.
        bill_sponsor = clean_text(table_rows[0][1].text_content())
        # try:
        #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
        # except IndexError:
        #     return
        bill.add_sponsorship(
            bill_sponsor,
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # check for cosponsors
        # (the trailing-comma unpacking requires exactly one matching link)
        sponsors_url, = bill_page.xpath(
            "//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
        # actions_link = '%s/%s' % (self._house_base_url,actions_link_tag.attrib['href'])
        # actions_link = re.sub("content", "print", actions_link)

        # Same exactly-one-link requirement as the cosponsors lookup above.
        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill documents (first "BillDocuments" div), processed in
        # reverse page order
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = '%s%s' % (
                self._house_base_url,
                doc_tag[0].attrib['href']
            )
            bill.add_document_link(doc, text_url, media_type='text/html')

        # get bill versions (second "BillDocuments" div); each span may hold
        # several links, and a link whose text is 'PDF' is recorded as a PDF
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version, vurl.attrib['href'], media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill versions
        # everything between the row containing "Bill Text" and the next div.DocHeaderRow
        # (selected as DocRow siblings with exactly one preceding DocHeaderRow)
        version_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Text")]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version, path, media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill summaries
        # everything between the row containing "Bill Summary" and the next div.DocHeaderRow
        # (selected as DocRow siblings with exactly one following DocHeaderRow)
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]')

        # if there are no amendments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[contains(text(),"Bill Summary")]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            # NOTE(review): unlike the version rows above, there is no guard
            # here for rows without a link; the [0] would raise IndexError.
            version = row.xpath('.//div[contains(@class,"textType")]/a/text()')[0].strip()
            if version:
                path = row.xpath('.//div[contains(@class,"textType")]/a/@href')[0].strip()
                summary_name = 'Bill Summary ({})'.format(version)
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_document_link(summary_name, path, media_type=mimetype,
                                       on_duplicate='ignore')

        # house bill amendments
        amendment_rows = bill_page.xpath('//div[contains(text(),"Amendment")]/'
                                         'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            version = row.xpath('.//div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
            path = row.xpath('.//div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
            summary_name = 'Amendment {}'.format(version)

            # Status icons in the row are appended to the name as suffixes;
            # the three checks are independent, so more than one may apply.
            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = '{} (Defeated)'.format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = '{} (Adopted)'.format(summary_name)

            distributed_icon = row.xpath('.//img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = '{} (Distributed)'.format(summary_name)

            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            # Amendments are recorded as version links, not document links.
            bill.add_version_link(summary_name, path, media_type=mimetype,
                                  on_duplicate='ignore')

        yield bill
Esempio n. 27
0
    def scrape_bill(self, session, bill_id, chamber):
        """Fetch one bill's detail page and yield the resulting ``Bill``.

        The page address is built from the digits of ``session`` and the
        bill id, e.g. https://malegislature.gov/Bills/189/SD2739. The
        generator finishes without yielding when the request fails, when the
        page lacks its heading or its title, or when the non-numeric part of
        ``bill_id`` is not one of the supported prefixes.
        """
        bill_url = "https://malegislature.gov/Bills/{}/{}".format(
            self.replace_non_digits(session), bill_id
        )

        try:
            response = self.get(bill_url)
            self.info("GET (with `requests`) - {}".format(bill_url))
        except requests.exceptions.RequestException:
            self.warning(u"Server Error on {}".format(bill_url))
            return False

        page = lxml.html.fromstring(response.text)

        # A page without the bill heading is handled like a failed request.
        heading = page.xpath('//div[contains(@class, "followable")]/h1/text()')
        if not heading:
            self.warning(u"Server Error on {}".format(bill_url))
            return False

        # The state website will periodically miss a few bills' titles for a
        # few days. Such titles exist on the bill list page but are absent
        # from the detail page until they are eventually populated.
        title_nodes = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')
        if not title_nodes:
            self.warning("Couldn't find title for {}; skipping".format(bill_id))
            return False
        bill_title = title_nodes[0]

        # Strip the digits and compare what is left against known prefixes.
        supported_prefixes = ("H", "HD", "S", "SD", "SRes")
        prefix = re.sub("[0-9]", "", bill_id)
        if prefix not in supported_prefixes:
            self.warning("Unsupported bill type for {}; skipping".format(bill_id))
            return False

        # Senate resolutions are stored under the shorter "SR" prefix.
        bill_id = bill_id.replace("SRes", "SR")

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification="bill",
        )

        # The "pinslip" paragraph, when present and non-empty, is the summary.
        pinslip = page.xpath('//p[@id="pinslip"]/text()')
        if pinslip and pinslip[0]:
            bill.add_abstract(pinslip[0], "summary")

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Take the first non-blank text under the dd that follows a
        # "Sponsor:" or "Presenter:" dt, including text of child links.
        sponsor_texts = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            "following-sibling::dd/descendant-or-self::*/text()[normalize-space()]"
        )
        if sponsor_texts:
            bill.add_sponsorship(
                sponsor_texts[0].strip(),
                classification="primary",
                primary=True,
                entity_type="person",
            )

        self.scrape_cosponsors(bill, bill_url)

        # Only an enabled "Download PDF" button counts as a version link.
        pdf_hrefs = page.xpath(
            "//div[contains(@class, 'modalBtnGroup')]/"
            "a[contains(text(), 'Download PDF') and not(@disabled)]/@href"
        )
        if pdf_hrefs:
            bill.add_version_link(
                "Bill Text",
                "https://malegislature.gov{}".format(pdf_hrefs[0]),
                media_type="application/pdf",
            )

        # scrape_actions is invoked for its effect on ``bill``; its return
        # value is not consumed here.
        # XXX  yield from
        self.scrape_actions(bill, bill_url, session)
        yield bill
Esempio n. 28
0
    def scrape(self, session=None):
        """Scrape every bill of a session from the API.

        For each entry returned by the paginated "bills" endpoint this
        fetches the bill detail, builds a ``Bill`` with its sources,
        sponsorships, classified actions, subjects and digest, and then
        processes each of its versions through ``self.deal_with_version``.

        Args:
            session: Session identifier; when falsy, the value of
                ``self.latest_session()`` is used.

        Yields:
            Whatever ``self.deal_with_version`` yields for each version,
            followed by the ``Bill`` itself. Bills whose detail request
            raises ``scrapelib.HTTPError`` are skipped.
        """
        if not session:
            session = self.latest_session()
            self.info("no session specified, using %s", session)

        # Maps a bill id prefix to its classification and to the URL segment
        # for that kind of measure.
        self._bill_prefix_map = {
            "HB": {
                "type": "bill",
                "url_segment": "bills/house"
            },
            "HR": {
                "type": "resolution",
                "url_segment": "resolutions/house/simple"
            },
            "HCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "HC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/house/concurrent",
            },
            "HJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/house/joint",
            },
            "SB": {
                "type": "bill",
                "url_segment": "bills/senate"
            },
            "SR": {
                "type": "resolution",
                "url_segment": "resolutions/senate/simple"
            },
            "SCR": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJR": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
            "SC": {
                "type": "concurrent resolution",
                "url_segment": "resolutions/senate/concurrent",
            },
            "SJ": {
                "type": "joint resolution",
                "url_segment": "resolutions/senate/joint",
            },
        }

        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in-proxy.openstates.org"}

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # sunlight's put up a proxy service at this link
        # using our api key for pdf document access.

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]

            # Display id is "<prefix> <number without leading zeros>", split
            # at the first digit. Default to the raw id so that an id with
            # no digits can neither raise NameError nor silently reuse the
            # previous bill's display id.
            disp_bill_id = bill_id
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx] + " " + str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill",
                                       session=session,
                                       bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

            title = bill_json["description"]
            if title == "NoneNone":
                title = None
            # sometimes description is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning(
                    "Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = ("lower" if bill_json["originChamber"].lower()
                                == "house" else "upper")
            bill_type = self._bill_prefix_map[bill_prefix]["type"]
            bill = Bill(
                disp_bill_id,
                legislative_session=session,
                chamber=original_chamber,
                title=title,
                classification=bill_type,
            )

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors: authors and sponsors are recorded as primary,
            # coauthors and cosponsors as non-primary
            for s in bill_json["authors"]:
                bill.add_sponsorship(
                    classification="author",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=True,
                )

            for s in bill_json["coauthors"]:
                bill.add_sponsorship(
                    classification="coauthor",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=False,
                )

            for s in bill_json["sponsors"]:
                bill.add_sponsorship(
                    classification="sponsor",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=True,
                )

            for s in bill_json["cosponsors"]:
                bill.add_sponsorship(
                    classification="cosponsor",
                    name=self._get_name(s),
                    entity_type="person",
                    primary=False,
                )

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = api_base_url + action_link

            try:
                actions = client.get("bill_actions",
                                     session=session,
                                     bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for a in actions["items"]:
                action_desc = a["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif a["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = a["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace("T", " ")
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                # Classification is keyword matching on the lower-cased
                # description; one action can collect several types.
                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if "second reading" in d or "reread second time" in d:
                    action_type.append("reading-2")
                    reading = True

                if "third reading" in d or "reread third time" in d:
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if "committee on" in d and ("referred" in d
                                            or "reassigned" in d):
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    # Both substrings must be tested against ``d``; a bare
                    # `"fail" or ...` is always truthy and would tag every
                    # amendment action as a failure.
                    if "fail" in d or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if len(action_type) == 0:
                    # calling it other and moving on with a warning
                    self.logger.warning(
                        "Could not recognize an action in '{}'".format(
                            action_desc))
                    action_type = None

                action = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )
                if committee:
                    action.add_related_entity(committee,
                                              entity_type="organization")

            # subjects
            subjects = [
                s["entry"] for s in bill_json["latestVersion"]["subjects"]
            ]
            for subject in subjects:
                bill.add_subject(subject)

            # Abstract
            if bill_json["latestVersion"]["digest"]:
                bill.add_abstract(bill_json["latestVersion"]["digest"],
                                  note="Digest")

            # versions and votes (the version list is walked in reverse)
            for version in bill_json["versions"][::-1]:
                try:
                    version_json = client.get(
                        "bill_version",
                        session=session,
                        bill_id=version["billName"],
                        version_id=version["printVersionName"],
                    )
                except scrapelib.HTTPError:
                    self.logger.warning("Bill version does not seem to exist.")
                    continue

                yield from self.deal_with_version(version_json, bill, bill_id,
                                                  original_chamber, session,
                                                  proxy)

            yield bill
Esempio n. 29
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill and yields it.

        Args:
            chamber: 'House'/'Senate' (any case) are mapped to
                'lower'/'upper'; any other value is passed through as-is.
            session: Legislative session identifier passed to ``Bill``.
            bill_detail_url: Address of the bill status page; also recorded
                as the bill's source.
            version_list_url: Forwarded to ``self.extract_versions``.

        Yields:
            The populated ``Bill``. Nothing is yielded when the page reports
            that the bill has not been transmitted to the other chamber.
        """
        chamber = 'lower' if chamber.lower() == 'house' else chamber
        chamber = 'upper' if chamber.lower() == 'senate' else chamber

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Check if bill hasn't been transmitted to the other chamber yet
        transmit_check = self.get_node(
            doc,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None
                and 'has not been transmitted' in transmit_check.strip()):
            self.logger.debug('Bill has not been transmitted to other chamber '
                              '... skipping {0}'.format(bill_detail_url))
            return

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1/text()')
        self.logger.debug(bill_id)
        bill_title_text = self.get_node(
            doc,
            '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
        )
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            # No inline description: fall back to the "Long Description"
            # page, but only request it when the link actually exists.
            long_desc_url = self.get_node(
                doc, '//a[text()[contains(.,"Long Description")]]/@href')
            long_desc_text = None
            if long_desc_url is not None:
                long_desc_page = self.lxmlize(long_desc_url)
                long_desc_text = self.get_node(
                    long_desc_page, '//h1/following-sibling::p/text()')
            if long_desc_text is not None:
                bill_title = long_desc_text.strip()
            else:
                bill_title = 'No title found.'
                self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)

        # The second character of the bill id selects the classification.
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution'
        }[bill_id[1]]
        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=bill_title,
                    classification=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        for subject in self._subject_mapping[bill_id]:
            bill.add_subject(subject)

        # Get companion bill.
        companion_links = doc.xpath(
            '//table[@class="status_info"]//tr[1]/td[2]'
            '/a[starts-with(@href, "?")]/text()')
        companion = (self.make_bill_id(companion_links[0])
                     if len(companion_links) > 0 else None)
        if companion is not None:
            # Only derive the chamber once a companion is known to exist, so
            # chamber_from_bill is never handed None.
            companion_chamber = self.chamber_from_bill(companion)
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        yield bill
Esempio n. 30
0
    def scrape_bill_type(self,
                         chamber,
                         session,
                         bill_type,
                         type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime('%m/%d/%y')
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(version_name,
                                        version_url_pdf,
                                        media_type='application/pdf',
                                        date=version_date)

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsorship(
                        author.name,
                        classification=SPONSOR_TYPES[author.contribution],
                        primary=author.primary_author_flg == 'Y',
                        entity_type='person',
                    )
                    # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:

                    def replacer(matchobj):
                        """Translate a matched chamber name to its chamber code.

                        Intended as the ``repl`` callback for ``re.sub``, which
                        always passes a match object (never ``None``), so no
                        truthiness check on ``matchobj`` is needed.

                        Returns 'lower' for 'Assembly' and 'upper' for
                        'Senate'; any other matched text is returned unchanged.
                        """
                        matched = matchobj.group()
                        # Fall back to the matched text itself so an unexpected
                        # match leaves the actor string untouched.
                        return {
                            'Assembly': 'lower',
                            'Senate': 'upper'
                        }.get(matched, matched)

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on',
                             action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = '(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime('%Y-%m-%d'),
                    chamber=actor,
                    classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(committee,
                                              entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = ('http://leginfo.legislature.ca.gov/faces'
                              '/billVotesClient.xhtml?bill_id={}').format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()