Esempio n. 1
0
    def scrape_bill(self, row, chamber, session):
        """Build a Bill from one legislation row plus its HTML detail page.

        Args:
            row: mapping read for 'LegislationNumber', 'LegislationId',
                'LongTitle', 'Synopsis', 'ShortTitle' and 'SponsorPersonId'.
            chamber: chamber value handed to the Bill constructor.
            session: legislative session handed to the Bill and to
                ``scrape_votes``.

        Yields:
            Everything produced by ``scrape_votes``, followed by the bill.
            Nothing is yielded for amendments, which are skipped.
        """
        bill_id = row['LegislationNumber']

        # TODO: re-evaluate if these should be separate bills
        if 'SA' in bill_id or 'HA' in bill_id:
            self.warning('skipping amendment %s', bill_id)
            return

        bill_type = self.classify_bill(bill_id)
        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=row['LongTitle'],
                    classification=bill_type)
        if row['Synopsis']:
            bill.add_abstract(row['Synopsis'], 'synopsis')
        if row['ShortTitle']:
            bill.add_title(row['ShortTitle'], 'short title')
        if row['SponsorPersonId']:
            self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
            row['LegislationId']
        )
        bill.add_source(html_url, note='text/html')

        html = self.lxmlize(html_url)

        # Sponsor links sit in the <div> following each label; the legislator
        # id is whatever remains once the fixed URL prefix is stripped.
        # The co-sponsor section must be looked up under its own label
        # ("Co-Sponsor(s):"); querying "Additional Sponsor(s):" twice would
        # record the additional sponsors a second time as cosponsors and
        # never read the real co-sponsors.
        sponsor_url_prefix = 'https://legis.delaware.gov/LegislatorDetail?personId='
        sponsor_sections = (
            ('Additional Sponsor(s):', 'primary'),
            ('Co-Sponsor(s):', 'cosponsor'),
        )
        for label, classification in sponsor_sections:
            sponsor_urls = html.xpath(
                '//label[text()="{}"]/following-sibling::div/a/@href'.format(label))
            for sponsor_url in sponsor_urls:
                sponsor_id = sponsor_url.replace(sponsor_url_prefix, '')
                self.add_sponsor_by_legislator_id(bill, sponsor_id, classification)

        versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            # on_duplicate='error'
            bill.add_version_link('Bill Text', version_url, media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row['LegislationId'])
        yield from self.scrape_votes(bill, row['LegislationId'], session)

        yield bill
Esempio n. 2
0
    def handle_list_item(self, item):
        """Turn one bill link from a listing page into a Bill.

        Args:
            item: link element; its text is the bill id, its ``href`` is the
                bill page, and the two table cells following its parent hold
                the title and the sponsor string.

        Yields:
            Everything produced by ``scrape_page_items`` for the bill's
            detail page, followed by the bill.

        Raises:
            ValueError: if the bill id does not start with a known prefix.
        """
        bill_id = item.text.strip()
        title = item.xpath("string(../following-sibling::td[1])").strip()
        sponsor = item.xpath("string(../following-sibling::td[2])").strip()
        bill_url = item.attrib['href'] + '/ByCategory'

        # Prefixes include the trailing space, so e.g. 'HR ' never matches 'HRx'.
        prefix_table = (
            (('SB ', 'HB ', 'SPB ', 'HPB '), 'bill'),
            (('HR ', 'SR '), 'resolution'),
            (('HJR ', 'SJR '), 'joint resolution'),
            (('SCR ', 'HCR '), 'concurrent resolution'),
            (('SM ', 'HM '), 'memorial'),
        )
        for prefixes, classification in prefix_table:
            if bill_id.startswith(prefixes):
                bill_type = classification
                break
        else:
            raise ValueError('Failed to identify bill type.')

        bill = Bill(bill_id, self.kwargs['session'], title,
                    chamber='lower' if bill_id[0] == 'H' else 'upper',
                    classification=bill_type)
        bill.add_source(bill_url)

        # normalize id from HB 0004 to H4
        # Raw string: '\w' and '\d' are invalid escapes in a plain literal.
        subj_bill_id = re.sub(r'(H|S)\w+ 0*(\d+)', r'\1\2', bill_id)
        bill.subject = list(self.kwargs['subjects'][subj_bill_id])

        # Drop a leading "Rep. " / "Sen. " before splitting the sponsor list.
        sponsor = re.sub(r'^(?:Rep|Sen)\.\s', "", sponsor)
        for sp in sponsor.split(', '):
            bill.add_sponsorship(sp, 'primary', 'person', True)

        yield from self.scrape_page_items(BillDetail, url=bill_url, obj=bill)

        yield bill
Esempio n. 3
0
    def scrape_bill(self, chamber, session, bill_id, session_id):
        """Fetch one bill from the azleg JSON API and yield its votes and the bill.

        Logs a warning and yields nothing when the API returns an empty payload.
        The bill id reported by the API ('Number') replaces the one passed in.
        """
        api_url = (
            'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&'
            'legislativeBody={}'
        ).format(bill_id, session_id, self.chamber_map[chamber])
        payload = json.loads(self.get(api_url).content.decode('utf-8'))

        if not payload:
            self.warning('null page for %s', bill_id)
            return

        title = payload['ShortTitle']
        bill_id = payload['Number']
        internal_id = payload['BillId']
        classification = self.get_bill_type(bill_id)
        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=classification)

        # Each helper attaches its data to the bill in place.
        self.scrape_actions(bill, payload, chamber)
        self.scrape_versions_and_documents(bill, internal_id)
        self.scrape_sponsors(bill, internal_id)
        self.scrape_subjects(bill, internal_id)
        yield from self.scrape_votes(bill, payload)

        overview_template = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'
        bill.add_source(overview_template.format(internal_id, session_id))

        self.sort_bill_actions(bill)

        yield bill
Esempio n. 4
0
 def get_bill(self, bill_id, **kwargs):
     """Create a Bill for ``bill_id`` with its detail URL recorded as a source.

     ``kwargs`` must contain 'url' and 'agenda_item' (KeyError otherwise);
     both are removed and the remaining keywords are forwarded to ``Bill``.
     """
     detail_url = kwargs.pop('url')
     # Not used here, but it must not be forwarded to the Bill constructor.
     kwargs.pop('agenda_item')
     bill_type = self.get_type(bill_id)
     new_bill = Bill(bill_id, self.session, type=bill_type, **kwargs)
     new_bill.add_source(detail_url, note='detail')
     return new_bill
Esempio n. 5
0
    def scrape(self):
        """Yield a Bill, then its votes, for every usable search result.

        Results with a blank title, or whose lower-cased 'Type' is one of the
        skipped kinds, are ignored.
        """
        skipped_types = ('order',
                         'claim',
                         'communication',
                         'report',
                         'oath of office')

        for page in self.searchLegislation():
            for summary in self.parseSearchResults(page):
                title = summary['Title'].strip()
                if not title:
                    continue

                bill_type = summary['Type'].lower()
                if bill_type in skipped_types:
                    continue

                # The session is derived from the introduction date.
                bill_session = self.session(summary['Intro\xa0Date'])

                bill = Bill(identifier=summary['Record #'],
                            legislative_session=bill_session,
                            title=title,
                            classification=bill_type,
                            from_organization=self.jurisdiction.name)
                bill.add_source(summary['url'])

                bill, votes = self.addDetails(bill, summary['url'])

                yield bill
                yield from votes
0
    def scrape_bill(self, chamber, session, bill_id, session_id):
        """
        Scrapes documents, actions, sponsors and subjects for a given bill.

        The bill is looked up through the azleg JSON API.  When the API
        returns an empty payload a warning is logged and nothing is yielded,
        instead of failing on the first key lookup.

        Yields:
            The populated bill.
        """
        bill_json_url = 'https://apps.azleg.gov/api/Bill/?billNumber={}&sessionId={}&' \
                        'legislativeBody={}'.format(bill_id, session_id, self.chamber_map[chamber])
        response = self.get(bill_json_url)
        page = json.loads(response.content.decode('utf-8'))

        # An empty/null payload cannot be indexed below.
        if not page:
            self.warning('null page for %s', bill_id)
            return

        bill_title = page['ShortTitle']
        # Prefer the identifier reported by the API over the one passed in.
        bill_id = page['Number']
        internal_id = page['BillId']
        bill_type = self.get_bill_type(bill_id)
        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        # Each helper returns the bill it was given data for.
        bill = self.scrape_actions(bill, page, chamber)
        bill = self.scrape_versions(bill, internal_id)
        bill = self.scrape_sponsors(bill, internal_id)
        bill = self.scrape_subjects(bill, internal_id)

        bill_url = 'https://apps.azleg.gov/BillStatus/BillOverview/{}?SessionId={}'.format(
                    internal_id, session_id)
        bill.add_source(bill_url)

        bill = self.sort_bill_actions(bill)

        yield bill
Esempio n. 7
0
    def scrape_bill(self, session, bill_id, chamber):
        """Scrape one bill page from malegislature.gov.

        Args:
            session: legislative session; its digits form the URL segment.
            bill_id: bill identifier as used in the URL.
            chamber: chamber value handed to the Bill constructor.

        Yields:
            Everything produced by ``scrape_actions``, followed by the bill.
            Nothing is yielded (a warning is logged) when the request fails
            or the page lacks the expected heading.
        """
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        page = lxml.html.fromstring(response.text)

        # Pages without the "followable" heading are treated as errors.
        if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        # Keep only S, H, D and digits.  Inside a character class '|' is a
        # literal pipe, not alternation, so it must not be listed.
        bill_id = re.sub(r'[^SHD\d]', '', bill_id)

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=bill_title, classification='bill')

        # Evaluate the summary query once and use its first non-empty hit.
        pinslip = page.xpath('//p[@id="pinslip"]/text()')
        if pinslip and pinslip[0]:
            bill.add_abstract(pinslip[0], 'summary')

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dd following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsorship(sponsor, classification='primary', primary=True,
                                 entity_type='person')

        self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                             "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

        # yield back votes and bill
        yield from self.scrape_actions(bill, bill_url, session)
        yield bill
Esempio n. 8
0
    def scrape_bills(self, session):
        """Yield a Bill for every measure the API returns for ``session``.

        Each bill gets its summary, sponsors, a source URL, version links and
        actions from the measure record.  Sponsors whose code is missing from
        the legislator index are recorded under the raw code, with a warning.

        Raises:
            KeyError: for a session, chamber letter, measure prefix or sponsor
                level that is not in the corresponding lookup table.
        """
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get('measures', page=500, session=session_key)

        legislators = index_legislators(self, session_key)

        # Built once instead of per sponsor.
        sponsor_classifications = {'Chief': 'primary', 'Regular': 'cosponsor'}

        for measure in measures_response:
            bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

            # The first letter of the prefix selects the chamber.
            chamber = self.chamber_code[bid[0]]
            bill = Bill(
                bid.replace(' ', ''),
                legislative_session=session,
                chamber=chamber,
                title=measure['RelatingTo'],
                classification=self.bill_types[measure['MeasurePrefix'][1:]]
            )
            bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

            for sponsor in measure['MeasureSponsors']:
                legislator_code = sponsor['LegislatoreCode']  # typo in API
                if not legislator_code:
                    continue
                try:
                    legislator = legislators[legislator_code]
                except KeyError:
                    # Logger.warn is a deprecated alias of Logger.warning.
                    logger.warning('Legislator {} not found in session {}'.format(
                        legislator_code, session))
                    legislator = legislator_code
                sponsor_level = sponsor['SponsorLevel']
                bill.add_sponsorship(
                    name=legislator,
                    classification=sponsor_classifications[sponsor_level],
                    entity_type='person',
                    primary=sponsor_level == 'Chief'
                )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                    session=session_key, bid=bid.replace(' ', ''))
            )
            for document in measure['MeasureDocuments']:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                          media_type='application/pdf')
                except ValueError:
                    logger.warning('Duplicate link found for {}'.format(document['DocumentUrl']))
            for action in measure['MeasureHistoryActions']:
                classifiers = self.determine_action_classifiers(action['ActionText'])
                # Action dates arrive naive; localize them with the scraper's tz.
                when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                bill.add_action(action['ActionText'], when,
                                chamber=self.chamber_code[action['Chamber']],
                                classification=classifiers)

            yield bill
Esempio n. 9
0
 def get_bill(self, bill_id, **kwargs):
     """Fixture: bill '1' raises ContinueScraping, bill '2' returns a Bill.

     The assertions pin the exact keyword arguments expected for each id;
     any other id fails the ``bill_id == '2'`` assertion.
     """
     if bill_id == '1':
         assert kwargs == {'extra': 'param'}
         raise self.ContinueScraping

     assert bill_id == '2'
     assert kwargs == {}
     result = Bill('1', self.session, 'title')
     # NOTE(review): the URL literal (with ';') is kept exactly as written.
     result.add_source('http;//example.com')
     return result
Esempio n. 10
0
def toy_bill():
    """Return a sample Bill with fixed field values and one source attached."""
    fields = {
        'identifier': "HB 2017",
        'legislative_session': "2012A",
        'title': "A bill for an act to raise the cookie budget by 200%",
        'from_organization': "Foo Senate",
        'classification': "bill",
    }
    bill = Bill(**fields)
    bill.add_source("http://uri.example.com/", note="foo")
    return bill
Esempio n. 11
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a bill page: title, sponsors, subjects, documents and actions.

        Yields whatever ``scrape_documents`` produces, then the bill.

        Raises:
            ValueError: if ``bill_id`` contains neither 'B' nor 'J'.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//h3[@class="h3billright"]')[0].text_content()
        # TODO: grab summary (none present at time of writing)

        # 'B' is checked before 'J', so an id holding both counts as a bill.
        if 'B' not in bill_id and 'J' not in bill_id:
            raise ValueError('unknown bill type ' + bill_id)
        classification = ['bill'] if 'B' in bill_id else ['joint resolution']

        bill = Bill(
            bill_id, legislative_session=session, chamber=chamber, title=title,
            classification=classification)
        bill.add_source(url)

        # process sponsors: strip the honorifics, then split the name list
        sponsor_text = _get_td(doc, 'All Sponsors:').text_content()
        for honorific in ('Delegates ', 'Delegate ', 'Senator ', 'Senators '):
            sponsor_text = sponsor_text.replace(honorific, '')

        # The first non-blank name is the primary sponsor; the rest are cosponsors.
        primary_assigned = False
        for raw_name in re.split(', (?:and )?', sponsor_text):
            name = raw_name.strip()
            if not name:
                continue
            is_primary = not primary_assigned
            bill.add_sponsorship(
                name,
                'primary' if is_primary else 'cosponsor',
                primary=is_primary,
                entity_type='person',
            )
            primary_assigned = True

        # subjects: keep only the part before any ' -see also-' suffix
        collected_subjects = []
        for heading in ('Broad Subject(s):', 'Narrow Subject(s):'):
            for text in _get_td(doc, heading).xpath('a/text()'):
                if text:
                    collected_subjects.append(text.split(' -see also-')[0])
        bill.subject = collected_subjects

        # documents
        yield from self.scrape_documents(bill, url.replace('stab=01', 'stab=02'))
        # actions
        self.scrape_actions(bill, url.replace('stab=01', 'stab=03'))

        yield bill
Esempio n. 12
0
    def get_bill(self, bill_id, **kwargs):
        """Create a Bill for a Denver agenda item and record its detail URL.

        Args:
            bill_id: item id used to build the detail-page URL.
            **kwargs: must contain 'number' (used as the bill identifier and
                removed from kwargs) and 'title'.

        Returns:
            The new Bill with the detail page attached as a source.
        """
        url = 'http://www.denvergov.org/sirepub/item.aspx?itemid=%s' % bill_id
        self.urls.add(detail=url)

        # From here on the bill is identified by its number, not the item id.
        bill_id = kwargs.pop('number')
        # NOTE(review): 'butt' looks like a placeholder argument - confirm.
        bill = Bill(bill_id, self.session, kwargs['title'], 'butt',
                    type=['bills'])
        bill.add_source(url, note='detail')

        xpath = '//table[contains(@class, "history")]/tr'
        for _tr in self.urls.detail.xpath(xpath):
            # TODO: parse the history rows.  The interactive pdb breakpoint
            # that used to sit here was removed: it stopped every
            # non-interactive run on the first row.
            pass

        return bill
Esempio n. 13
0
    def scrape_bill(self, chamber, session, bill_id):
        """Fetch one bill from the GetLegislation endpoint and yield votes, then the bill.

        Gubernatorial appointments are skipped (nothing is yielded).  Versions
        and documents come from lookups prepared on the scraper; a missing
        versions entry is logged, a missing documents entry is ignored.
        """
        number = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, number))

        response = self.get(url)
        root = lxml.etree.fromstring(response.content)
        legislation = xpath(root, "//wa:Legislation")[0]

        title = xpath(legislation, "string(wa:LongDescription)")

        type_query = "string(wa:ShortLegislationType/wa:LongLegislationType)"
        bill_type = xpath(legislation, type_query).lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=[bill_type])

        # The summary page URL is built from the number and the session's first four chars.
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           number, session[0:4]))
        bill.add_source(fake_source)

        # Versions are keyed by the full bill id ...
        try:
            for entry in self.versions[bill_id]:
                bill.add_version_link(note=entry['note'],
                                      url=entry['url'],
                                      media_type=entry['media_type'])
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        # ... while documents are keyed by the bare bill number.
        try:
            for entry in self.documents[number]:
                bill.add_document_link(note=entry['note'],
                                       url=entry['url'],
                                       media_type=entry['media_type'])
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, number)
        self.scrape_hearings(bill, number)
        yield from self.scrape_votes(bill)
        # De-duplicate subjects via a set before storing them as a list.
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Esempio n. 14
0
    def scrape_bill(self, session, session_slug, chamber, url):
        """Scrape a NELIS bill overview and yield the resulting Bill.

        The bill number comes from the page at ``url``; the remaining fields
        come from a second request to the "FillSelectedBillTab" endpoint.
        """
        overview = lxml.html.fromstring(self.get(url).text)
        bill_no = overview.xpath('//*[@id="item-header"]/text()')[0].strip()
        # state bill id
        internal_id = re.search(r'\/Bill\/(\d+)\/Overview', url).group(1)

        # bill data gets filled in from another call
        tab_url_template = 'https://www.leg.state.nv.us/App/NELIS/REL/{}/Bill/' \
            'FillSelectedBillTab?selectedTab=Overview&billKey={}&_={}'
        tab_url = tab_url_template.format(
            session_slug, internal_id, time.time() * 1000)

        bill_page = lxml.html.fromstring(self.get(tab_url).text)

        # Non-breaking spaces in the summary are replaced with plain spaces.
        summary_field = self.get_header_field(bill_page, 'Summary:')
        short_title = summary_field.text.replace(u'\u00a0', ' ')

        bill = Bill(
            identifier=bill_no,
            legislative_session=session,
            title=short_title,
            chamber=chamber
        )

        long_title = self.get_header_field(bill_page, 'Title:').text
        if long_title is not None:
            bill.add_abstract(long_title, 'Summary')

        primary_div = self.get_header_field(bill_page, 'Primary Sponsor')
        if primary_div is not None:
            self.add_sponsors(primary_div, bill, 'primary')

        co_div = self.get_header_field(bill_page, 'Co-Sponsor')
        if co_div is not None:
            self.add_sponsors(co_div, bill, 'cosponsor')

        self.add_actions(bill_page, bill, chamber)
        self.add_versions(session_slug, internal_id, bill)

        bill.subject = list(set(self.subject_mapping[bill_no]))

        # 'BDR' is stored only when extract_bdr returns a truthy value.
        bdr = self.extract_bdr(short_title)
        if bdr:
            bill.extras['BDR'] = bdr

        bill.extras['NV_ID'] = internal_id

        bill.add_source(url)
        yield bill
Esempio n. 15
0
    def scrape_bill(self, bill_page_url):
        """Scrape one bill page and yield the resulting Bill.

        Nothing is yielded when the title (warning) or the bill number
        (error) cannot be found on the page.
        """
        bill_page = lxml.html.fromstring(self.get(bill_page_url).text)

        title_hits = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SubjectLabel"]/text()')
        if not title_hits:
            self.warning('Missing bill title {}'.format(bill_page_url))
            return False
        title = title_hits[0]

        # The number is either inside a link or directly in the span;
        # the linked form is tried first.
        number_queries = (
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/a/text()',
            '//span[@id="ctl00_ContentPlaceHolder_BillNumberLabel"]/text()',
        )
        for query in number_queries:
            number_hits = bill_page.xpath(query)
            if number_hits:
                bill_no = number_hits[0]
                break
        else:
            self.error('Missing bill number {}'.format(bill_page_url))
            return False

        bill = Bill(
            bill_no,
            legislative_session=self.session,
            chamber='legislature',
            title=title,
            classification='bill'
        )
        bill.add_source(bill_page_url)

        self.parse_versions(bill, bill_page, bill_no)
        self.parse_acts(bill, bill_page)

        primary_text = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_SponsorsLabel"]/text()')
        if primary_text:
            self.assign_sponsors(bill, primary_text[0], 'primary')

        cosponsor_text = bill_page.xpath(
            '//span[@id="ctl00_ContentPlaceHolder_CoSponsorsLabel"]/text()')
        if cosponsor_text:
            self.assign_sponsors(bill, cosponsor_text[0], 'cosponsor')

        self.parse_date_actions(bill, bill_page)
        self.parse_actions(bill, bill_page)

        yield bill
Esempio n. 16
0
    def scrape_bill_info(self, session, chambers):
        """Create bills from the bill_info CSV and yield their page results.

        Rows whose chamber is not in ``chambers`` are skipped.  Each bill is
        stored in ``self.bills`` together with its chamber before its page is
        scraped; a ``SkipBill`` raised along the way is logged as a warning.
        """
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        rows = open_csv(self.get(info_url))

        chamber_by_prefix = {'H': 'lower', 'S': 'upper'}

        for row in rows:
            bill_id = row['bill_num']
            chamber = chamber_by_prefix[bill_id[0]]
            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            # Joint resolutions are tested before plain resolutions.
            if re.match(r'^(S|H)J', bill_id):
                classification = 'joint resolution'
            elif re.match(r'^(S|H)R', bill_id):
                classification = 'resolution'
            else:
                classification = 'bill'

            bill = Bill(identifier=bill_id,
                        legislative_session=session,
                        title=row['bill_title'],
                        classification=classification,
                        chamber=chamber)
            bill.add_source(info_url)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsorship(name=str(introducer),
                                     classification='primary',
                                     primary=True,
                                     entity_type='person')

            try:
                for subject in self._subjects[bill_id]:
                    bill.subject.append(subject)

                self.bills[bill_id] = [bill, chamber]

                yield from self.scrape_bill_page(bill)
            except SkipBill:
                self.warning('no such bill: ' + bill_id)
Esempio n. 17
0
    def scrape_bill(self, chamber, session):
        """Create bills for ``chamber`` from the pipe-delimited measures file.

        Rows for the other chamber, or whose last column differs from
        ``self.slug``, are skipped.  Each bill's page results are yielded and
        the bill is then stored in ``self.bills``.
        """
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        text = self.get(url).text
        rows = unicode_csv_reader(StringIO(text), delimiter='|')

        chamber_by_letter = {'H': 'lower', 'S': 'upper'}
        type_by_spec = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'}

        for row in rows:
            if chamber_by_letter[row[0]] != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            # The letters after the chamber letter select the bill type.
            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = type_by_spec[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            # Column 11 is preferred; column 12 is the fallback when it is empty.
            primary = row[11] or row[12]
            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)

            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep on eye on this post 2017 to see if they apply R going forward.
            if session == '2017':
                session_code = '2017R'
            else:
                session_code = session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Esempio n. 18
0
    def parse_bill(self, chamber, session, special, link):
        """Scrape one bill from its listing link and yield its votes, then the bill.

        Args:
            chamber: chamber of the bill, also used to build the bill id.
            session: legislative session.
            special: passed through to the URL helpers.
            link: listing link element; its text is the bill number and its
                ``href`` carries the ``type=`` abbreviation.

        Yields:
            Everything produced by ``parse_votes``, followed by the bill.
            Nothing is yielded when the page has an empty title.
        """
        bill_num = link.text.strip()
        type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

        # NOTE(review): the pattern also accepts an empty abbreviation, which
        # leaves btype unbound and fails at the Bill constructor below.
        if type_abbr == 'B':
            btype = ['bill']
        elif type_abbr == 'R':
            btype = ['resolution']

        bill_id = "%s%s %s" % (utils.bill_abbr(chamber), type_abbr, bill_num)

        url = utils.info_url(chamber, session, special, type_abbr, bill_num)
        page = self.get(url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        xpath = '/'.join([
            '//div[contains(@class, "BillInfo-ShortTitle")]',
            'div[@class="BillInfo-Section-Data"]',
        ])
        title = page.xpath(xpath).pop().text_content().strip()
        if not title:
            return
        bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                    classification=btype)
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(bill, chamber, utils.history_url(chamber, session, special,
                           type_abbr, bill_num))

        # only fetch votes if votes were seen in history
        # if vote_count:
        yield from self.parse_votes(
            bill,
            utils.vote_url(chamber, session, special, type_abbr, bill_num),
        )

        # Dedupe sources.  Build the unique list separately: removing items
        # from a list while iterating over it skips elements and can leave
        # duplicates behind.  Membership tests use equality because the
        # entries may not be hashable.
        unique_sources = []
        for source in bill.sources:
            if source not in unique_sources:
                unique_sources.append(source)
        # Slice-assign so the bill keeps the same list object.
        bill.sources[:] = unique_sources

        yield bill
Esempio n. 19
0
    def scrape(self):
        """Yield a Bill for every search result that has a non-blank title.

        Sets ``self.session`` to '2011' and uses it for every bill.  Related
        files, sponsors, topics and attachments come from the expanded
        legislation details.
        """
        self.session = '2011'

        for page in self.searchLegislation():
            for summary in self.parseSearchResults(page):
                title = summary['Title'].strip()
                if not title:
                    continue

                bill = Bill(name=summary['Record #'],
                            session=self.session,
                            title=title,
                            type=[summary['Type'].lower()],
                            organization=self.jurisdiction.name)
                bill.add_source(summary['URL'])

                details = self.expandLegislationSummary(summary)

                for related_name in details.get('Related files', []):
                    bill.add_related_bill(name=related_name,
                                          session=self.session,
                                          relation='other-session',
                                          chamber=None)

                # Only the first listed sponsor is treated as primary.
                for position, sponsor in enumerate(details.get('Sponsors', [])):
                    primary = position == 0
                    sponsorship_type = "Primary" if primary else "Regular"
                    bill.add_sponsor(sponsor, sponsorship_type,
                                     'person', primary)

                for topic in details.get(u'Topics', []):
                    bill.add_subject(topic)

                for attachment in details.get(u'Attachments', []):
                    bill.add_version_link('PDF',
                                          attachment['url'],
                                          mimetype="application/pdf")

                yield bill
Esempio n. 20
0
    def scrape_bill(self, chamber, session):
        """Create bills for ``chamber`` from the session's measures file.

        The pipe-delimited file is read through ``get_utf_16_ftp_content``.
        Rows for the other chamber, or whose last column differs from
        ``self.slug``, are skipped.  Each bill's page results are yielded and
        the bill is then stored in ``self.bills``.
        """
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        lines = get_utf_16_ftp_content(url).splitlines()
        reader = csv.reader(lines, delimiter='|')

        chambers = {'H': 'lower', 'S': 'upper'}
        classifications = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial',
            'CMR': 'concurrent memorial'}

        for row in reader:
            bill_chamber = chambers[row[0]]
            if bill_chamber != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            # Group 2 holds the type letters that follow the chamber letter.
            type_match = re.match(r'(H|S)([A-Z]+)\s', bill_id)
            bill_type = classifications[type_match.group(2)]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            # Fall back to column 12 when column 11 is empty.
            primary = row[11] if row[11] else row[12]
            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)

            compact_id = bill_id.replace(' ', '')
            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" % (
                               self.slug, compact_id))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Esempio n. 21
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Scrape one bill page and yield its action results, then the bill.

        Args:
            session: legislative session; its first four characters are read
                as a year to name the prior session.
            chamber: chamber value handed to the Bill constructor.
            bill_type: classification handed to the Bill constructor.
            url: bill page URL; its query string supplies 'billtype' and
                'billnumber', which together form the bill id.

        Yields:
            Everything produced by ``parse_bill_actions_table``, followed by
            the bill.
        """
        bill_html = self.get(url).text
        bill_page = lxml.html.fromstring(bill_html)

        qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
        bill_id = '{}{}'.format(qs['billtype'], qs['billnumber'])
        versions_table = bill_page.xpath("//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # Subjects are ';'-separated.  Drop every blank entry: list.remove("")
        # would only drop the first one, so "a;;b;" would still add an empty
        # subject.
        stripped = (s.strip() for s in meta['Report Title'].split(";"))
        subs = [s for s in stripped if s]

        b = Bill(bill_id, session, meta['Measure Title'],
                 chamber=chamber,
                 classification=bill_type)
        if meta['Description']:
            b.add_abstract(meta['Description'], 'description')
        for subject in subs:
            b.add_subject(subject)
        if url:
            b.add_source(url)

        prior_session = '{} Regular Session'.format(int(session[:4]) - 1)
        companion = meta['Companion'].strip()
        if companion:
            b.add_related_bill(identifier=companion.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")
        # The last text cell of the status table is checked for a carry-over note.
        prior = bill_page.xpath(
            "//table[@id='ContentPlaceHolderCol1_GridViewStatus']/tr/td/font/text()")[-1]
        if 'carried over' in prior.lower():
            b.add_related_bill(identifier=bill_id.replace(u'\xa0', ' '),
                               legislative_session=prior_session,
                               relation_type="companion")
        for sponsor in meta['Introducer(s)']:
            b.add_sponsorship(sponsor, 'primary', 'person', True)
        # The return value was never used, so it is no longer bound.
        self.parse_bill_versions_table(b, versions_table)
        yield from self.parse_bill_actions_table(b, action_table, bill_id, session, url, chamber)
        yield b
Esempio n. 22
0
    def createBill(self, agenda_item):
        """Build a ``Bill`` from one agenda item mapping.

        The item's 'Title' (newlines flattened to spaces) is split with
        ``agenda_item_title_re`` into the bill title plus mover/seconder
        roles and names.  Sponsorships are recorded only when both a
        primary and a secondary sponsor were captured.

        Raises ``AttributeError`` when the title does not match the pattern.
        """
        flat_title = agenda_item['Title'].replace('\n', ' ')
        parsed = re.match(agenda_item_title_re, flat_title)
        (title,
         primary_role,
         primary_sponsor,
         secondary_role,
         secondary_sponsor) = parsed.groups()

        # TODO: Add agenda_item type to OCD
        b = Bill(
            identifier=agenda_item['Item No.'],
            title=title,
            legislative_session=agenda_item['session'],
            classification='bill',
            from_organization={'name': self.jurisdiction.name},
        )
        b.add_source(agenda_item['url'], note='web')

        # Without both names there is nothing to record.
        if not (primary_sponsor and secondary_sponsor):
            return b

        b.add_sponsorship(primary_sponsor, 'mover', 'person', True)
        b.add_sponsorship(secondary_sponsor, 'seconder', 'person', False)
        return b
Esempio n. 23
0
    def scrape_chamber(self, chamber, session):
        """Yield a ``Bill`` for every row of a chamber's bill reference table.

        ``chamber`` must be 'upper' or 'lower' (anything else raises
        ``KeyError``).  The first table row is treated as a header and
        skipped.  Whatever ``scrape_digest`` yields for a bill is passed
        through before that bill's versions and documents are attached.
        """
        type_codes = {'upper': 'SF', 'lower': 'HB'}
        chamber_abbrev = type_codes[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        rows = page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")
        for tr in rows[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            # Ids starting with SJ/HJ are joint resolutions; the rest bills.
            is_joint = bill_id[0:2] in ['SJ', 'HJ']
            bill_type = 'joint resolution' if is_joint else 'bill'

            bill = Bill(bill_id, legislative_session=session, title=title,
                        chamber=chamber, classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions: links are collected from three separate columns
            version_links = []
            for cell_query in ('td[8]//a', 'td[11]//a', 'td[12]//a'):
                version_links.extend(tr.xpath(cell_query))
            for anchor in version_links:
                # skip references to other bills
                if not anchor.text.startswith('See'):
                    bill.add_version_link(anchor.text, anchor.get('href'),
                                          media_type='application/pdf')

            # documents: only the first link of each cell is used
            for label, cell_query in (('Fiscal Note', 'td[9]//a'),
                                      ('Summary', 'td[14]//a')):
                found = tr.xpath(cell_query)
                if found:
                    bill.add_document_link(label, found[0].get('href'))

            bill.add_source(url)
            yield bill
Esempio n. 24
0
    def scrape_bill(self, session, chamber, bill_url):
        """Scrape one bill page; yield the bill, then its vote events.

        ``bill_url`` is appended to ``CO_URL_BASE``.  A 503 response is
        logged and the bill skipped; any other HTTP error propagates.
        """
        full_url = '{}{}'.format(CO_URL_BASE, bill_url)

        try:
            page = self.lxmlize(full_url)
        except scrapelib.HTTPError as e:
            if e.response.status_code != 503:
                raise
            self.error('Skipping %s w/ 503', bill_url)
            return

        number_nodes = page.xpath(
            '//div[contains(@class,"field-name-field-bill-number")]'
            '//div[contains(@class,"field-item even")][1]/text()')
        bill_number = number_nodes[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])'
        ).strip()

        bill = Bill(bill_number, legislative_session=session,
                    chamber=chamber, title=bill_title)
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')
        bill.add_source(full_url)

        # Each helper receives the bill and the parsed page, in this order.
        for scrape_part in (self.scrape_sponsors,
                            self.scrape_actions,
                            self.scrape_versions,
                            self.scrape_research_notes,
                            self.scrape_fiscal_notes,
                            self.scrape_committee_report,
                            self.scrape_amendments):
            scrape_part(bill, page)

        yield bill
        yield from self.scrape_votes(bill, page)
Esempio n. 25
0
    def handle_page(self):
        """Yield bills (and their detail-page items) listed on this page.

        Each ``<li>`` under ``ul.linkSect`` whose link text starts with 'S'
        or 'H' becomes a ``Bill``.  If a "More..." link is present, the
        next list page is scraped with the same keyword arguments.
        """
        chamber_by_letter = {'H': 'lower', 'S': 'upper'}
        type_by_letter = {
            'B': 'bill',
            'J': 'joint resolution',
            'R': 'resolution',
        }

        for item in self.doc.xpath('//ul[@class="linkSect"]/li'):
            link = item.getchildren()[0]
            bill_id = str(link.text_content())

            # Anything not prefixed with a chamber letter is not a bill row.
            if not bill_id.startswith(('S', 'H')):
                continue

            # create a bill: first letter is the chamber, second the type
            desc = item.xpath('text()')[0].strip()
            chamber = chamber_by_letter[bill_id[0]]
            bill_type = type_by_letter[bill_id[1]]
            bill = Bill(bill_id, self.kwargs['session'], desc,
                        chamber=chamber, classification=bill_type)

            bill_url = link.get('href')
            sponsor_path = URL_PATTERNS['sponsors'].format(
                self.kwargs['session_id'],
                bill_id.replace(' ', ''),
            )
            sponsor_url = BASE_URL + sponsor_path

            # The sponsor page is consumed for its effect on ``bill`` only;
            # whatever it yields is discarded.
            for _ in self.scrape_page_items(BillSponsorPage, url=sponsor_url,
                                            obj=bill):
                pass
            yield from self.scrape_page_items(BillDetailPage, url=bill_url,
                                              obj=bill)
            bill.subject = self.kwargs['subjects'][bill_id]
            bill.add_source(bill_url)
            yield bill

        more_links = self.doc.xpath('//a/b[text()="More..."]/../@href')
        if more_links:
            yield from self.scrape_page_items(BillListPage, url=more_links[0],
                                              **self.kwargs)
Esempio n. 26
0
    def scrape_bill_2012(self, chamber, session, bill_id, url):
        """Scrape a bill page: yield its votes, then save the bill.

        :param chamber: chamber handed straight to ``Bill``.
        :param session: legislative session handed straight to ``Bill``.
        :param bill_id: identifier; it must contain 'B' (bill) or 'J'
            (joint resolution).
        :param url: bill detail page URL.
        :raises ValueError: if ``bill_id`` contains neither 'B' nor 'J'.
            This is checked before any request is made; previously the
            function failed later with an unbound-local error.
        """
        # Classify first so a bad id fails fast with a readable message.
        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            raise ValueError(
                "Unknown bill type for bill '{}'".format(bill_id))

        html = self.get(url).text
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

        bill = Bill(
            bill_id,
            legislative_session=session,
            classification=_type,
            chamber=chamber,
            title=title,
        )
        bill.add_abstract(summary, note='summary')
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)     # sponsors
        self.parse_bill_actions(doc, bill)      # actions
        self.parse_bill_documents(doc, bill)    # documents and versions
        yield from self.parse_bill_votes(doc, bill)        # votes

        # subjects: keep only the text before any "-see also-" cross reference
        bill.subject = [
            subj.text.split('-see also-')[0]
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]')
        ]

        # add bill to collection
        # NOTE(review): the bill is saved rather than yielded, unlike the
        # votes above -- confirm that ``save_bill`` is the intended hand-off.
        self.save_bill(bill)
Esempio n. 27
0
    def _recursively_process_bills(
            self, request_session, chamber, session, first_item=1):
        '''
        Walk the results of an already-initiated search one page at a time.

        For each result link on the page starting at `first_item`, yield
        whatever `scrape_bill` yields and then the Bill itself. When a page
        has results, recurse for the page that starts 25 items later; an
        empty page ends the walk.
        '''
        PAGE_SIZE = 25

        url = 'http://legislature.maine.gov/LawMakerWeb/searchresults.asp'
        r = request_session.get(url, params={'StartWith': first_item})
        r.raise_for_status()

        result_links = lxml.html.fromstring(r.text).xpath('//tr/td/b/a')
        if not result_links:
            return

        for link in result_links:
            bill_id_slug = link.xpath('./@href')[0]
            bill_url = 'http://legislature.maine.gov/LawMakerWeb/{}'.format(
                bill_id_slug)
            # Put a space after the two-character prefix of the link text.
            label = link.text
            bill_id = label[:2] + " " + label[2:]

            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                title="",
                chamber=chamber,
            )
            bill.add_source(bill_url)

            yield from self.scrape_bill(bill, chamber)
            yield bill

        # Make a recursive call to this function, for the next page
        yield from self._recursively_process_bills(
            request_session=request_session,
            chamber=chamber,
            session=session,
            first_item=first_item + PAGE_SIZE,
        )
Esempio n. 28
0
    def scrape(self):
        """Yield vote events and bills for legislation created after 2014-01-01.

        For each legislation summary a ``Bill`` is built, then enriched with
        details, sponsors, attachments and actions.  The bill's session is
        derived from its earliest action, or from the first entry of
        ``SESSION_STARTS`` when it has no history.  A ``VoteEvent`` is
        yielded for every action whose detail page produced both a result
        and votes; the bill is yielded last.
        """
        cutoff = datetime.datetime(2014, 1, 1)
        for leg_summary in self.legislation(created_after=cutoff):
            leg_type = BILL_TYPES[leg_summary['Type']]

            bill = Bill(identifier=leg_summary['File\xa0#'],
                        title=leg_summary['Title'],
                        legislative_session=None,
                        classification=leg_type,
                        from_organization={"name": "New York City Council"})
            bill.add_source(leg_summary['url'], note='web')

            leg_details = self.legDetails(leg_summary['url'])
            history_source = self.history(leg_summary['url'])

            bill.add_title(leg_details['Name'],
                           note='created by administrative staff')

            if 'Summary' in leg_details:
                bill.add_abstract(leg_details['Summary'], note='')

            law_number = leg_details['Law number']
            if law_number:
                bill.add_identifier(law_number, note='law number')

            for sponsor, sponsorship_type, primary in self._sponsors(
                    leg_details.get('Sponsors', [])):
                bill.add_sponsorship(sponsor, sponsorship_type, 'person',
                                     primary)

            for attachment in leg_details.get('Attachments', []):
                if not attachment['label']:
                    continue
                bill.add_document_link(attachment['label'],
                                       attachment['url'],
                                       media_type="application/pdf")

            # Materialise the history: it is walked twice below.
            history = list(history_source)

            if not history:
                bill.legislative_session = str(self.SESSION_STARTS[0])
            else:
                earliest_action = min(
                    self.toTime(entry['Date']) for entry in history)
                bill.legislative_session = self.sessions(earliest_action)

            org_renames = {'City Council': 'New York City Council',
                           'Administration': 'Mayor'}

            for action in history:
                action_description = action['Action']
                if not action_description:
                    continue

                action_class = ACTION_CLASSIFICATION[action_description]

                action_date = self.toDate(action['Date'])
                acting_body = action['Action\xa0By']
                responsible_org = org_renames.get(acting_body, acting_body)

                # Town hall meetings are not recorded as actions at all.
                if responsible_org == 'Town Hall Meeting':
                    continue

                act = bill.add_action(
                    action_description,
                    action_date,
                    organization={'name': responsible_org},
                    classification=action_class)

                if 'url' not in action['Action\xa0Details']:
                    continue

                action_detail_url = action['Action\xa0Details']['url']
                if action_class == 'committee-referral':
                    action_details = self.actionDetails(action_detail_url)
                    # The committee name is what follows the last " to the ".
                    referred_committee = action_details[
                        'Action text'].rsplit(' to the ', 1)[-1]
                    act.add_related_entity(
                        referred_committee,
                        'organization',
                        entity_id=_make_pseudo_id(name=referred_committee))

                result, votes = self.extractVotes(action_detail_url)
                if not (result and votes):
                    continue

                action_vote = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action_description,
                    organization={'name': responsible_org},
                    classification=action_class,
                    start_date=action_date,
                    result=result,
                    bill=bill)
                action_vote.add_source(action_detail_url, note='web')

                for option, voter in votes:
                    action_vote.vote(option, voter)

                yield action_vote

            text = self.text(leg_summary['url'])

            # The full text is stored only when there is some.
            extras = {'local_classification': leg_summary['Type']}
            if text:
                extras['full_text'] = text
            bill.extras = extras

            yield bill
Esempio n. 29
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        """Scrape one bill page; yield its votes and then the bill.

        :param chamber: chamber handed straight to ``Bill``.
        :param session: legislative session; also used to build the text
            and document list URLs.
        :param bill_id: bill identifier; also used as the ``Root`` query
            value of those URLs.
        :param bill_type: classification handed straight to ``Bill``.
        :param url: bill detail page URL.

        Yields whatever ``parse_vote`` yields for actions that carry a vote
        tally and a journal link, then the ``Bill``.  Nothing is yielded
        when the title block has no children, or when adding a document
        link raises ``KeyError``.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        # The title text lives in the second child of the "Title" span's
        # parent; a parent without children is treated as an empty page.
        title = doc.xpath('//span[text()="Title"]')[0].getparent()
        if title:
            title = title[1].text.strip().strip('"')
        else:
            # Fixed: the message was missing its f-prefix and logged the
            # literal text "{url}".
            self.warning(f"skipping bill {url}, no information")
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = (doc.xpath('//span[contains(text(), "Sponsor(S)")]')
                     [0].getparent()[1].text)
        sponsors_match = re.match(r"(SENATOR|REPRESENTATIVE)", spons_str)
        if sponsors_match:
            # Comma-separated list: the first entry is the primary sponsor
            # (second word, i.e. after the title), the rest are cosponsors.
            sponsors = spons_str.split(",")
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0].split()[1],
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type="person",
                        classification="cosponsor",
                        primary=False,
                    )

        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            # Fixed: this used re.match, which anchors at the start of the
            # string; the pattern begins with a space and the string has
            # just been stripped, so the branch could never run.
            if re.search(r" BY REQUEST OF THE GOVERNOR$", spons_str):
                spons_str = re.sub(r" BY REQUEST OF THE GOVERNOR$", "",
                                   spons_str).title()
                spons_str = spons_str + " Committee (by request of the governor)"

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

        # Get actions
        self._current_comm = None
        act_rows = doc.xpath("//div[@id='tab6_4']//tr")[1:]
        for row in act_rows:
            date, journal, action = row.xpath("td")
            action = action.text_content().strip()
            raw_chamber = action[0:3]
            journal_entry_number = journal.text_content()
            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  "%m/%d/%Y")
            # NOTE(review): an action without an "(H)"/"(S)" prefix reuses
            # the previous row's chamber, and fails if it is the first row
            # -- confirm whether a default is wanted.
            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            # Votes
            if re.search(r"Y(\d+)", action):
                vote_href = journal.xpath(".//a/@href")
                if vote_href:
                    vote_href = vote_href[0].replace(" ", "")
                    yield from self.parse_vote(
                        bill,
                        journal_entry_number,
                        action,
                        act_chamber,
                        act_date,
                        vote_href,
                    )

            action, atype = self.clean_action(action)

            # A prefile action carries its own (two-digit-year) date, which
            # replaces the row date.
            match = re.search(r"^Prefile released (\d+/\d+/\d+)$", action)
            if match:
                action = "Prefile released"
                act_date = datetime.datetime.strptime(match.group(1),
                                                      "%m/%d/%y")

            bill.add_action(
                action,
                chamber=act_chamber,
                date=act_date.strftime("%Y-%m-%d"),
                classification=atype,
            )

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions - to do
        text_list_url = (
            f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab1_4"
        )
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "/Text/")]'):
            name = link.text_content()
            text_url = link.get("href")
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents - to do
        doc_list_url = (
            f"https://www.akleg.gov/basis/Bill/Detail/{session}?Root={bill_id}#tab5_4"
        )
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib["href"]
            if h_name.strip():
                try:
                    bill.add_document_link(h_name, h_href)
                except KeyError:
                    # NOTE(review): returning here drops the whole bill even
                    # though its votes may already have been yielded --
                    # confirm this is intended rather than skipping the link.
                    self.warning("Duplicate found")
                    return

        yield bill
Esempio n. 30
0
    def scrape_bill(self, chamber, session, bill_id):
        """Scrape one bill lookup page and yield the resulting ``Bill``.

        ``bill_id`` may be spaced ("HB 102") or compact ("H102"); ``chamber``
        may be 'House'/'Senate' or already 'lower'/'upper'.  The final
        identifier and classification are derived from the page heading.
        """
        # there will be a space in bill_id if we're doing a one-off bill
        # scrape: convert HB 102 into H102
        if ' ' in bill_id:
            bill_id = bill_id[0] + bill_id.split(' ')[-1]

        # House/Senate become lower/upper; anything else passes through.
        chamber = {'Senate': 'upper', 'House': 'lower'}.get(chamber, chamber)

        bill_detail_url = (
            'http://www.ncleg.net/gascripts/'
            'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all'
        ) % (session, bill_id)

        # parse the bill data page, finding the latest html text
        doc = lxml.html.fromstring(self.get(bill_detail_url).text)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
        # Checked in order, so 'Joint Resolution' wins over 'Resolution'.
        heading_rules = (
            ('Joint Resolution', 'joint resolution', 'JR '),
            ('Resolution', 'resolution', 'R '),
            ('Bill', 'bill', 'B '),
        )
        for marker, classification, infix in heading_rules:
            if marker in title_div_txt:
                bill_type = classification
                bill_id = bill_id[0] + infix + bill_id[1:]
                break

        title_link = doc.xpath(
            '/html/body/div/div/main/div[2]/div[contains(@class,"col-12")]/a'
        )[0]
        bill_title = title_link.text_content().strip()

        bill = Bill(bill_id,
                    legislative_session=session,
                    title=bill_title,
                    chamber=chamber,
                    classification=bill_type)
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to cur version)
        link_xpath = ('//a[contains(@href, "/Bills/House/PDF/")]'
                      if chamber == 'lower'
                      else '//a[contains(@href, "/Bills/Senate/PDF/")]')
        for vlink in doc.xpath(link_xpath)[1:]:
            # the link text, with non-breaking spaces normalised, is the name
            version_name = vlink.text.replace(u'\xa0', ' ')
            version_url = vlink.attrib['href']
            is_pdf = version_url.lower().endswith(".pdf")

            bill.add_version_link(
                version_name,
                version_url,
                media_type='application/pdf' if is_pdf else 'text/html',
                on_duplicate='ignore')

        # sponsors: names are primary until one arrives prefixed with
        # "(Primary)"; that name and all later ones are cosponsors
        spon_row = doc.xpath(
            '//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
        marker_seen = False
        for chunk in spon_row.text_content().split(';'):
            name = chunk.replace(u'\xa0', ' ').strip()
            if name.startswith('(Primary)'):
                name = name.replace('(Primary)', '').strip()
                marker_seen = True
            if name:
                bill.add_sponsorship(
                    name,
                    classification='cosponsor' if marker_seen else 'primary',
                    entity_type='person',
                    primary=not marker_seen)

        # keywords
        kw_row = doc.xpath(
            '//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
        for subject in kw_row.text_content().split(', '):
            bill.add_subject(subject)

        # actions: every matching row is processed, none are skipped
        action_tr_xpath = ('//h6[contains(text(), "History")]'
                           '/ancestor::div[contains(@class, "gray-card")]'
                           '//div[contains(@class, "card-body")]'
                           '/div[@class="row"]')
        actor_codes = {'Senate': 'upper', 'House': 'lower'}
        for row in doc.xpath(action_tr_xpath):
            cols = row.xpath('div')
            raw_date = cols[1].text
            raw_actor = cols[3].text or ''
            # if text is blank, try diving in
            action = ((cols[5].text or '').strip()
                      or cols[5].text_content().strip())

            act_date = dt.datetime.strptime(
                raw_date, '%m/%d/%Y').strftime('%Y-%m-%d')

            # Anything that is neither chamber is attributed to 'executive'.
            actor = actor_codes.get(raw_actor, 'executive')

            # First classifier whose prefix starts the action text, if any.
            atype = next(
                (kind
                 for prefix, kind in self._action_classifiers.items()
                 if action.startswith(prefix)),
                None)

            bill.add_action(action,
                            act_date,
                            chamber=actor,
                            classification=atype)

        # TODO: Fix vote scraper
        # yield from self.scrape_votes(bill, doc)

        yield bill
Esempio n. 31
0
    def scrape_bill_list(self, url):
        """Yield vote events and bills for every row of a bill list page.

        :param url: URL of the bill list; passed to ``_get_bill_list`` and
            recorded as a source on every bill.

        For each row a ``Bill`` is created, its status page is fetched, and
        title, versions, Budget Isolation Resolution (BIR) actions, history
        actions and amendments are attached.  Whatever ``scrape_vote``
        yields for "Roll" votes is passed through before the bill itself.
        Bills whose status page is unavailable are skipped with a warning.

        :raises AssertionError: if a bill id contains neither 'B' nor 'R'.
        :raises NotImplementedError: on an unrecognised version name.
        """
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            # Single-element unpacking: each cell must hold exactly one value.
            (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
            (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
            (subject, ) = bill_info.xpath('td[3]//text()')
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if 'B' in bill_id:
                bill_type = 'bill'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            # The title is filled in below, once the status page is parsed.
            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title='',
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                        'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning(
                    "Bill {} has no webpage, and will be skipped".format(
                        bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            # Fixed: the title is now recomputed for every bill.  Previously
            # it was assigned only when the span existed, so a bill without
            # one inherited the previous bill's title (or raised an
            # unbound-local error if it was the first bill on the list).
            title_spans = bill_doc.xpath(
                '//span[@id="ContentPlaceHolder1_lblShotTitle"]')
            title = title_spans[0].text_content().strip() if title_spans else ''
            if not title:
                title = "[No title given by state]"
            bill.title = title

            version_url_base = (
                'http://alisondb.legislature.state.al.us/ALISON/'
                'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                    self.session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + 'int.pdf'
                elif version == "Engrossed":
                    version_url = version_url_base + 'eng.pdf'
                elif version == "Enrolled":
                    version_url = version_url_base + 'enr.pdf'
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

            # Fiscal notes exist, but I can't figure out how to build their
            # URL, so they are not scraped (the former no-op loop is gone).

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath('td[1]')[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
                bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type,
                    bir.xpath('td[3]/font/text()')[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification='other',
                )

                # Anything other than exactly one value counts as "no vote".
                try:
                    (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
                except ValueError:
                    bir_vote_id = ''

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text)

            actions = bill_doc.xpath(
                '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath('td[1]/font/text()')[0].encode(
                        'ascii', 'ignore').strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath('td[2]/font/text()')

                possible_amendment = action.xpath('td[3]/font/u/text()')
                if len(possible_amendment
                       ) > 0 and not possible_amendment[0].strip() == '':
                    (amendment, ) = possible_amendment
                else:
                    amendment = None

                (action_text, ) = action.xpath('td[4]/font/text()')

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = re.search(
                        r'.*? referred to the .*? committee on (.*?)$',
                        action_text).group(1).strip()
                except AttributeError:
                    action_committee = ''

                # Rows seen before any dated row are not recorded as actions.
                if action_date is not None:
                    act = bill.add_action(
                        action_text,
                        TIMEZONE.localize(action_date),
                        chamber=actor,
                        classification=action_type,
                    )
                    if action_committee:
                        act.add_related_entity(action_committee,
                                               entity_type='organization')

                    try:
                        vote_button = action.xpath('td[9]//text()')[0].strip()
                    except IndexError:
                        vote_button = ''

                    if vote_button.startswith("Roll "):
                        vote_id = vote_button.split(" ")[-1]

                        yield from self.scrape_vote(
                            bill=bill,
                            vote_chamber=action_chamber,
                            bill_id=bill_id,
                            vote_id=vote_id,
                            vote_date=TIMEZONE.localize(action_date),
                            action_text=action_text)

                # Amendments and substitutes are recorded as extra versions.
                if amendment:
                    amend_url = (
                        'http://alisondb.legislature.state.al.us/ALISON/'
                        'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                            self.session, amendment))

                    amend_name = 'Amd/Sub {}'.format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type='application/pdf',
                        on_duplicate='ignore',
                    )

            yield bill
Esempio n. 32
0
    def scrape(self, session=None):
        """Scrape Vermont bills and resolutions together with their votes.

        The released-bill, introduced-bill and resolution listings are
        fetched as JSON.  For every listing entry the bill status page is
        loaded for sponsors and text versions; when that page exposes an
        internal bill ID, the detailed-status and roll-call endpoints are
        queried for actions and votes.

        :param session: session identifier; when ``None`` the result of
            ``self.latest_session()`` is used.
        :yields: one ``VoteEvent`` per roll call and one ``Bill`` per
            listing entry.
        :raises AssertionError: when a bill number, an action's chamber
            code or a roll-call outcome matches no known pattern.
        """
        HTML_TAGS_RE = r'<.*?>'

        # (prefix, classification, chamber) triples.  Order matters: the
        # longer prefixes must be tried before the shorter ones that would
        # also match (e.g. 'H.C.R.' and 'H.R.' before 'H.').  A chamber of
        # None means it has to be read from the entry's 'Body' field.
        BILL_PREFIXES = (
            ('J.R.H.', 'joint resolution', 'lower'),
            ('J.R.S.', 'joint resolution', 'upper'),
            ('H.C.R.', 'concurrent resolution', 'lower'),
            ('S.C.R.', 'concurrent resolution', 'upper'),
            ('H.R.', 'resolution', 'lower'),
            ('S.R.', 'resolution', 'upper'),
            ('PR.', 'constitutional amendment', None),
            ('H.', 'bill', 'lower'),
            ('S.', 'bill', 'upper'),
        )

        def iso_date(us_date):
            # Convert 'MM/DD/YYYY' (as served by the API) to 'YYYY-MM-DD'
            parsed = datetime.datetime.strptime(us_date, '%m/%d/%Y')
            return parsed.strftime('%Y-%m-%d')

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API, remembering
        # which listing each entry came from so that the right URL is cited
        # as the bill's source.
        listing_urls = (
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.
            format(year_slug),
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.
            format(year_slug),
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.
            format(year_slug),
        )
        listings = []
        for listing_url in listing_urls:
            listing_json = self.get(listing_url).text
            entries = json.loads(listing_json)['data'] or []
            listings.extend((listing_url, entry) for entry in entries)

        # Parse the information from each bill
        for source_url, info in listings:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}
            bill_number = info['BillNumber']

            # Identify the bill type and chamber
            for prefix, bill_type, bill_chamber in BILL_PREFIXES:
                if bill_number.startswith(prefix):
                    break
            else:
                raise AssertionError(
                    "Unknown bill type found: '{}'".
                    format(bill_number)
                )

            if bill_chamber is None:
                # The 'PR.' prefix does not name a chamber; use 'Body'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            bill_id = bill_number.replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info['Title'],
                classification=bill_type
            )
            bill.add_source(source_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, bill_number)
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors.  Entries listed after the
            # 'Additional Sponsors' marker are recorded as cosponsors.
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li'
            )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                # Skip blank names and any 5-character entry that begins
                # with 'Less' (presumably the page's "Less" + ellipsis
                # collapse link -- verify against the live page).  The
                # previous test compared a 5-character slice with the
                # 4-character literal and therefore could never match.
                is_less_link = (sponsor_name.startswith("Less") and
                                len(sponsor_name) == 5)
                if sponsor_name and not is_less_link:
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type='person',
                        primary=(sponsor_type == 'primary')
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a'
            )

            for version in versions:
                version_text = version.xpath('text()')
                if version_text:
                    bill.add_version_link(
                        note=version_text[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf'
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            id_match = re.search(
                r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                lxml.etree.tostring(doc).decode('utf-8')
            )
            if id_match is None:
                self.warning("Bill {} appears to have no activity".format(bill_number))
                yield bill
                continue
            internal_bill_id = id_match.group(1)

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            for action in actions:
                action = {k: v.strip() for k, v in action.items()}
                status = action['FullStatus']
                signed = "Signed by Governor" in status

                if signed:
                    actor = 'executive'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                keywords = [x.lower() for x in action['keywords'].split(';')]
                if signed:
                    action_type = 'executive-signature'
                elif "Vetoed by the Governor" in status:
                    action_type = 'executive-veto'
                elif "Read first time" in status or "Read 1st time" in status:
                    action_type = 'introduction'
                elif "Reported favorably" in status:
                    action_type = 'committee-passage-favorable'
                elif actor == 'lower' and any(
                        x.startswith('aspassed') for x in keywords):
                    action_type = 'passage'
                elif actor == 'upper' and any(
                        # only the upper-chamber check tolerates a leading space
                        x.startswith((' aspassed', 'aspassed'))
                        for x in keywords):
                    action_type = 'passage'
                else:
                    action_type = None

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", status),
                    date=iso_date(action['StatusDate']),
                    chamber=actor,
                    classification=action_type
                )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    # 'MemberName' is expected to be '<name> of <district>';
                    # the unpacking fails loudly on any other shape
                    (member_name, _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                vote_status = vote['FullStatus']
                if ("Passed -- " in vote_status or
                        "Veto of Governor overridden" in vote_status):
                    did_pass = True
                elif ("Failed -- " in vote_status or
                      'Veto of the Governor sustained' in vote_status):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # The yes/no totals come from the status text, not from
                # counting the individual member votes
                yea_count = int(re.search(r'Yeas = (\d+)', vote_status).group(1))
                nay_count = int(re.search(r'Nays = (\d+)', vote_status).group(1))

                vote_to_add = VoteEvent(
                    chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=iso_date(vote['StatusDate']),
                    motion_text=re.sub(HTML_TAGS_RE, "", vote_status).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                    # Reference the bill by the same normalised identifier
                    # the Bill above was created with
                    bill=bill_id,
                    bill_chamber=bill_chamber
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote('not voting', member)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Esempio n. 33
0
    def _scrape_bill(self, session, bill_data):
        """Build a ``Bill`` (and its votes) from one Open Legislation record.

        :param session: legislative session identifier for the bill.
        :param bill_data: decoded API record for the bill; this method reads
            its ``summary``, ``amendments``, ``sponsor``, ``actions`` and
            ``votes`` entries.
        :yields: the parsed Senate vote for each item in ``bill_data["votes"]``,
            whatever ``self.scrape_assembly_votes`` yields, and finally the
            ``Bill`` itself.
        """
        details = self._parse_bill_details(bill_data)

        (
            senate_url,
            assembly_url,
            bill_chamber,
            bill_type,
            bill_id,
            title,
            (prefix, number, active_version),
        ) = details

        summary = bill_data["summary"]
        amendments = bill_data["amendments"]["items"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title or summary,
            classification=bill_type,
        )

        if summary:
            bill.add_abstract(summary, note="")

        bill_active_version = amendments[active_version]

        # Parse sponsors.
        sponsor = bill_data["sponsor"]
        if sponsor is not None:
            if sponsor["rules"] is True:
                bill.add_sponsorship(
                    "Rules Committee",
                    entity_type="organization",
                    classification="primary",
                    primary=True,
                )
            elif not sponsor["budget"]:
                primary_sponsor = sponsor["member"]
                bill.add_sponsorship(
                    primary_sponsor["shortName"],
                    entity_type="person",
                    classification="primary",
                    primary=True,
                )

                # There *shouldn't* be cosponsors if there is no sponsor.
                for cosponsor in bill_active_version["coSponsors"]["items"]:
                    bill.add_sponsorship(
                        cosponsor["shortName"],
                        entity_type="person",
                        classification="cosponsor",
                        primary=False,
                    )

        # List companion bill.  The "sameAs" property may be absent or hold
        # an empty list; use .get() on both levels so that neither case
        # raises a KeyError.
        companions = bill_active_version.get("sameAs", {}).get("items")
        if companions:
            companion = companions[0]

            # Build companion bill session ("<start year>-<start year + 1>").
            start_year = companion["session"]
            companion_bill_session = "-".join(
                [str(start_year), str(start_year + 1)])

            # Attach companion bill data.
            bill.add_related_bill(companion["basePrintNo"],
                                  companion_bill_session,
                                  relation_type="companion")

        # Parse actions.
        chamber_map = {"senate": "upper", "assembly": "lower"}

        for action in bill_data["actions"]["items"]:
            chamber = chamber_map[action["chamber"].lower()]
            # Round-trip through strptime so a malformed date fails here
            action_date = datetime.datetime.strptime(
                action["date"], "%Y-%m-%d").date()
            types, _ = NYBillScraper.categorizer.categorize(action["text"])

            bill.add_action(
                action["text"],
                action_date.strftime("%Y-%m-%d"),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources[
            "bill"].format(
                session_year=session, bill_id=bill_id, summary="", detail="")
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        for vote_data in bill_data["votes"]["items"]:
            yield self._parse_senate_votes(vote_data, bill, api_url)
        yield from self.scrape_assembly_votes(session, bill, assembly_url,
                                              bill_id)

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        # The Assembly URL depends only on the bill, not on the amendment,
        # so it is built once; duplicates are dropped via on_duplicate.
        html_url = ("http://assembly.state.ny.us/leg/?sh=printbill&bn="
                    "{}&term={}&Text=Y".format(bill_id,
                                               self.term_start_year))
        for amendment in amendments.values():
            version = amendment["printNo"]

            bill.add_version_link(version,
                                  html_url,
                                  on_duplicate="ignore",
                                  media_type="text/html")

            pdf_url = "http://legislation.nysenate.gov/pdf/bills/{}/{}".format(
                self.term_start_year, version)
            bill.add_version_link(
                version,
                pdf_url,
                on_duplicate="ignore",
                media_type="application/pdf",
            )

        yield bill
Esempio n. 34
0
    def scrape_bills(self, chamber, session, subjects):
        """Page through the bill search form and yield a ``Bill`` per result.

        The search is posted repeatedly for consecutive windows of
        ``MAXQUERY`` bill numbers, starting at the chamber's first number
        for the session, until a window produces no results.

        :param chamber: chamber the bills are attributed to; also selects
            the starting bill number.
        :param session: session identifier, sent as the search year.
        :param subjects: mapping from the raw bill ID to its subjects;
            bills missing from it get an empty subject list.
        :yields: one ``Bill`` per search result.
        """
        idex = bill_start_numbers(session)[chamber]
        FROM = "ctl00$rilinContent$txtBillFrom"
        TO = "ctl00$rilinContent$txtBillTo"
        YEAR = "ctl00$rilinContent$cbYear"

        while True:
            default_headers = get_default_headers(SEARCH_URL)
            default_headers[FROM] = idex
            default_headers[TO] = idex + MAXQUERY
            default_headers[YEAR] = session
            idex += MAXQUERY

            blocks = self.parse_results_page(self.post(SEARCH_URL,
                                             data=default_headers).text)
            # The first and last parsed blocks are dropped before digesting
            blocks = self.digest_results_page(blocks[1:-1])

            # An empty window marks the end of the bill number range
            if not blocks:
                break

            for bill in blocks.values():
                raw_bill_id = bill['bill_id']

                try:
                    subs = subjects[raw_bill_id]
                except KeyError:
                    subs = []

                # Drop the leading "ENTITLED, " (sliced by length, so the
                # title is assumed to start with it)
                title = bill['title'][len("ENTITLED, "):]

                # Replace a long-form bill name prefix with its abbreviation
                billid = raw_bill_id
                for prefix in BILL_NAME_TRANSLATIONS:
                    if billid[:len(prefix)] == prefix:
                        billid = (BILL_NAME_TRANSLATIONS[prefix] +
                                  billid[len(prefix) + 1:].split()[0])

                b = Bill(
                    billid,
                    title=title,
                    chamber=chamber,
                    legislative_session=session,
                    classification=self.get_type_by_name(raw_bill_id),
                )
                b.subject = subs

                self.process_actions(bill['actions'], b)

                for href in bill['bill_id_hrefs']:
                    b.add_version_link(
                        href.text, href.attrib['href'],
                        media_type="application/pdf")

                # Sponsors arrive as "BY name, name, ..."; blank pieces
                # (empty list, trailing comma) are not real sponsors.
                sponsor_names = bill['sponsors'][len("BY"):].strip().split(",")
                for sponsor in (s.strip() for s in sponsor_names):
                    if not sponsor:
                        continue
                    b.add_sponsorship(
                        sponsor, entity_type='person', classification='primary', primary=True)

                b.add_source(SEARCH_URL)
                yield b
Esempio n. 35
0
    def scrape_bill(self, session, history_url):
        """Scrape one bill from its history XML document.

        :param session: legislative session identifier for the bill.
        :param history_url: URL of the bill-history XML.
        :yields: the populated ``Bill``; nothing is yielded when the
            document has no caption or reports that the bill does not exist.
        :raises ScrapeError: when the bill ID has an unrecognised type code.
        """
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        # The first space-separated token of the 'bill' attribute is dropped
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == "B":
            bill_type = ["bill"]
        elif bill_id[1] == "R":
            bill_type = ["resolution"]
        elif bill_id[1:3] == "CR":
            bill_type = ["concurrent resolution"]
        elif bill_id[1:3] == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=bill_title,
            classification=bill_type,
        )

        bill.add_source(history_url)

        for subject in root.iterfind("subjects/subject"):
            bill.add_subject(subject.text.strip())

        # self.versions / analyses / fiscal_notes / witnesses hold
        # (bill_id, url) pairs; the fifth-from-last character of each URL
        # is the key into NAME_SLUGS.
        for _, version_url in (x for x in self.versions if x[0] == bill_id):
            bill.add_version_link(
                note=self.NAME_SLUGS[version_url[-5]],
                url=version_url,
                media_type="text/html",
            )

        document_groups = (
            (self.analyses, "Analysis ({})"),
            (self.fiscal_notes, "Fiscal Note ({})"),
            (self.witnesses, "Witness List ({})"),
        )
        for documents, note_format in document_groups:
            for _, doc_url in (x for x in documents if x[0] == bill_id):
                bill.add_document_link(
                    note=note_format.format(self.NAME_SLUGS[doc_url[-5]]),
                    url=doc_url,
                    media_type="text/html",
                )

        # Tracks whether an introduction has already been recorded for this
        # bill.  It must live outside the loop: resetting it per action made
        # the "Read & adopted" check below always add a second introduction.
        introduced = False

        for action in root.findall("actions/action"):
            act_date = datetime.datetime.strptime(action.findtext("date"),
                                                  "%m/%d/%Y").date()

            # The first character of the action number names the actor
            action_number = action.find("actionNumber").text
            actor = {
                "H": "lower",
                "S": "upper",
                "E": "executive"
            }[action_number[0]]

            desc = action.findtext("description").strip()

            if desc == "Scheduled for public hearing on . . .":
                self.warning("Skipping public hearing action with no date")
                continue

            # Order matters: exact descriptions are tested before the
            # looser prefix checks that would also match them.
            if desc == "Amended":
                atype = "amendment-passage"
            elif desc == "Amendment(s) offered":
                atype = "amendment-introduction"
            elif desc == "Amendment amended":
                atype = "amendment-amendment"
            elif desc == "Amendment withdrawn":
                atype = "amendment-withdrawal"
            elif desc in ("Passed", "Adopted"):
                atype = "passage"
            elif re.match(r"^Received (by|from) the", desc):
                if "Secretary of the Senate" not in desc:
                    atype = "introduction"
                else:
                    atype = "filing"
            elif desc.startswith("Sent to the Governor"):
                # But what if it gets lost in the mail?
                atype = "executive-receipt"
            elif desc.startswith("Signed by the Governor"):
                atype = "executive-signature"
            elif desc.startswith("Effective on"):
                atype = "became-law"
            elif desc == "Vetoed by the Governor":
                atype = "executive-veto"
            elif desc == "Read first time":
                atype = ["introduction", "reading-1"]
                introduced = True
            elif desc == "Read & adopted":
                atype = ["passage"]
                if not introduced:
                    introduced = True
                    atype.append("introduction")
            elif desc == "Passed as amended":
                atype = "passage"
            elif desc.startswith(("Referred to",
                                  "Recommended to be sent to ")):
                atype = "referral-committee"
            elif desc == "Reported favorably w/o amendment(s)":
                atype = "committee-passage"
            elif desc == "Filed":
                atype = "filing"
            elif desc == "Read 3rd time":
                atype = "reading-3"
            elif desc == "Read 2nd time":
                atype = "reading-2"
            elif desc.startswith("Reported favorably"):
                atype = "committee-passage-favorable"
            else:
                atype = None

            # The action is stored with the unstripped description text
            act = bill.add_action(
                action.findtext("description"),
                act_date,
                chamber=actor,
                classification=atype,
            )

            if atype and "referral-committee" in atype:
                # What remains after removing the lead-in is the committee
                ctty = desc
                for lead_in in ("Referred to", "Recommended to be sent to "):
                    ctty = ctty.replace(lead_in, "").strip()
                act.add_related_entity(name=ctty, entity_type="organization")

        # Each element holds names separated by " | ".  findtext() returns
        # None when the element is missing, hence the `or ""`.
        sponsor_groups = (
            ("authors", "primary", True),
            ("coauthors", "cosponsor", False),
            ("sponsors", "primary", True),
            ("cosponsors", "cosponsor", False),
        )
        for tag, classification, is_primary in sponsor_groups:
            for name in (root.findtext(tag) or "").split(" | "):
                if name != "":
                    bill.add_sponsorship(
                        name,
                        classification=classification,
                        entity_type="person",
                        primary=is_primary,
                    )

        if root.findtext("companions"):
            self._get_companion(bill)

        yield bill
Esempio n. 36
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape one bill page, yielding its votes and then the bill.

        :param chamber: chamber the bill originated in.
        :param session: legislative session identifier.
        :param bill_id: bill identifier, also used as the ``r`` query value.
        :param bill_type: classification passed through to ``Bill``.
        :yields: a ``Vote`` for every action row that carries vote counts,
            followed by the ``Bill``.  Nothing is yielded when the page
            contains the ASP error marker.
        :raises NoSuchBill: when no title cell is found on the page.
        :raises AssertionError: when a vote row's text matches no known
            vote wording.
        """
        url = "%s?r=%s" % (self.base_url, bill_id)
        html = self.get(url).text
        if "error '80020009'" in html:
            self.warning("asp error on page, skipping %s", bill_id)
            return
        doc = lxml.html.fromstring(html)
        # search for Titulo, accent over i messes up lxml, so use 'tulo'
        title = doc.xpath(
            u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()')
        if not title:
            raise NoSuchBill()

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title[0],
            classification=bill_type,
        )

        author = doc.xpath(u'//td/b[contains(text(),"Autor")]/../text()')[0]
        for aname in author.split(","):
            aname = self.clean_name(aname).strip()
            if aname:
                bill.add_sponsorship(aname,
                                     classification="primary",
                                     entity_type="person",
                                     primary=True)

        co_authors = doc.xpath(
            u'//td/b[contains(text(),"Co-autor")]/../text()')
        # The names are read from the second text node, so make sure it
        # exists; blank names are skipped just as for the authors above.
        if len(co_authors) > 1:
            for co_author in co_authors[1].split(","):
                co_name = self.clean_name(co_author).strip()
                if co_name:
                    bill.add_sponsorship(
                        co_name,
                        classification="cosponsor",
                        entity_type="person",
                        primary=False,
                    )

        # Some lower-house roll calls could be parsed, but finnicky
        # Most roll lists are just images embedded within a document,
        # and offer no alt text to scrape
        # Instead, just scrape the vote counts
        vote_counts_re = re.compile(
            r"(?u)^(.*),\s([\s\d]{2})-([\s\d]{2})-([\s\d]{2})-([\s\d]{0,2})$")

        # The actions are taken from the last table on the page
        action_table = doc.xpath("//table")[-1]
        bill_vote_chamber = None
        for row in action_table[1:]:
            tds = row.xpath("td")
            # ignore row missing date
            if len(tds) != 2:
                continue
            # A row with an empty date cell reuses the previous row's date.
            # NOTE(review): `date` is unbound if the very first row has an
            # empty date cell -- confirm that cannot happen.
            if tds[0].text_content():
                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")
            action = tds[1].text_content().strip()
            # parse the text to see if it's a new version or a unrelated document
            # if has a hyphen let's assume it's a vote document

            # get url of action
            action_url = tds[1].xpath("a/@href")
            atype, action = self.parse_action(chamber, bill, action,
                                              action_url, date)

            vote_info = vote_counts_re.search(action)
            if not (vote_info and re.search(r"\d{1,2}", action)):
                continue

            vote_name = vote_info.group(1)

            if u"Votación Final" in vote_name:
                (vote_chamber,
                 vote_name) = re.search(r"(?u)^\w+ por (.*?) en (.*)$",
                                        vote_name).groups()
                if "Senado" in vote_chamber:
                    vote_chamber = "upper"
                else:
                    vote_chamber = "lower"

            elif "Cuerpo de Origen" in vote_name:
                vote_name = re.search(r"(?u)^Cuerpo de Origen (.*)$",
                                      vote_name).group(1)
                vote_chamber = chamber

            elif u"informe de Comisión de Conferencia" in vote_name:
                (vote_chamber, vote_name) = re.search(
                    r"(?u)^(\w+) (\w+ informe de Comisi\wn de Conferencia)$",
                    vote_name,
                ).groups()
                if vote_chamber == "Senado":
                    vote_chamber = "upper"
                else:
                    vote_chamber = "lower"

            # TODO replace bill['votes']
            elif u"Se reconsideró" in vote_name:
                if bill_vote_chamber:
                    vote_chamber = bill_vote_chamber
                else:
                    vote_chamber = chamber

            else:
                raise AssertionError(
                    u"Unknown vote text found: {}".format(vote_name))

            vote_name = vote_name.title()

            yes = int(vote_info.group(2))
            no = int(vote_info.group(3))
            # The last two count fields are folded into a single "other"
            # total; either may be blank
            other = 0
            for extra in (vote_info.group(4), vote_info.group(5)):
                if extra.strip():
                    other += int(extra)

            vote = Vote(
                chamber=vote_chamber,
                start_date=date.strftime("%Y-%m-%d"),
                motion_text=vote_name,
                result="pass" if (yes > no) else "fail",
                bill=bill,
                classification="passage",
            )
            vote.set_count("yes", yes)
            vote.set_count("no", no)
            vote.set_count("other", other)
            vote.add_source(url)
            yield vote
            # NOTE(review): this stores the bill's chamber rather than
            # vote_chamber, so a later reconsideration always resolves to
            # `chamber` -- confirm whether vote_chamber was intended.
            bill_vote_chamber = chamber

        bill.add_source(url)
        yield bill
Esempio n. 37
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        """Scrape one sdlegislature.gov bill page; yield its votes, then the bill.

        Builds a ``Bill`` from the page at ``url``: versions from the
        ``BillVersions`` table, sponsors from the ``BillDetail`` section,
        actions from the ``tblBillActions`` table and subjects from links
        whose href contains ``Keyword``.  Roll-call links found in action
        rows are delegated to ``self.scrape_vote``.

        Args:
            chamber: chamber passed to ``Bill``; also the initial actor used
                for actions until a row changes it.
            session: value passed as the bill's ``legislative_session``.
            bill_id: bill identifier; its prefix (e.g. ``"SB "``, ``"HJR "``)
                selects the classification.
            title: bill title.
            url: address of the bill page; also recorded as the bill source.

        Yields:
            Everything ``self.scrape_vote`` yields for each roll-call link,
            followed by the populated ``Bill``.

        Raises:
            AssertionError: if the page has no version rows, or the HTML and
                PDF notes of a version row differ.
            ValueError: if a version row does not hold exactly one date,
                HTML link and PDF link (tuple unpacking fails).
            ScrapeError: if a sponsor link points to an unexpected URL.
        """
        page = self.lxmlize(url)

        # The identifier prefix decides the classification; anything that is
        # not recognised is treated as a plain bill.
        if re.match(r"^(S|H)B ", bill_id):
            btype = ["bill"]
        elif re.match(r"(S|H)C ", bill_id):
            btype = ["commemoration"]
        elif re.match(r"(S|H)JR ", bill_id):
            btype = ["joint resolution"]
        elif re.match(r"(S|H)CR ", bill_id):
            btype = ["concurrent resolution"]
        else:
            btype = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=btype,
        )
        bill.add_source(url)

        version_rows = page.xpath(
            "//div[@id=\"ctl00_ContentPlaceHolder1_ctl00_BillVersions\"]"
            + "/section/table/tbody/tr"
        )
        assert len(version_rows) > 0
        for row in version_rows:
            # Single-element unpacking doubles as a sanity check that each
            # cell holds exactly one value.
            (date,) = row.xpath("./td[@data-title=\"Date\"]/text()")
            date = date.strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            (html_note,) = row.xpath("./td[@data-title=\"HTML\"]/a/text()")
            (html_link,) = row.xpath("./td[@data-title=\"HTML\"]/a/@href")
            (pdf_note,) = row.xpath("./td[@data-title=\"PDF\"]/a/text()")
            (pdf_link,) = row.xpath("./td[@data-title=\"PDF\"]/a/@href")

            assert html_note == pdf_note
            note = html_note

            bill.add_version_link(
                note,
                html_link,
                date=date,
                media_type="text/html",
                on_duplicate="ignore",
            )
            bill.add_version_link(
                note,
                pdf_link,
                date=date,
                media_type="application/pdf",
                on_duplicate="ignore",
            )

        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]'
            + '/label[contains(text(), "Sponsors:")]'
            + "/following-sibling::div[1]/p/a"
        )
        for link in sponsor_links:
            # The link target tells legislators apart from committees.
            if link.attrib["href"].startswith("https://sdlegislature.gov/Legislators/"):
                sponsor_type = "person"
            elif link.attrib["href"].startswith(
                "https://sdlegislature.gov/Legislative_Session/Committees"
            ):
                sponsor_type = "organization"
            else:
                raise ScrapeError(
                    "Found unexpected sponsor, URL: " + link.attrib["href"]
                )
            bill.add_sponsorship(
                link.text,
                classification="primary",
                primary=True,
                entity_type=sponsor_type,
            )

        # ``actor`` persists across rows: it only changes when a row names a
        # chamber ("First read in ...") or the governor signs.
        actor = chamber
        # Rows are ignored until the "Date / Action" header row has been seen.
        use_row = False

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows, that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == "":
                self.debug("Skipping action table row that is completely empty")
                continue

            if "Date" in row.text_content() and "Action" in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith("First read"):
                atypes.append("introduction")
                atypes.append("reading-1")

            if re.match(r"Signed by (?:the\s)*Governor", action, re.IGNORECASE):
                atypes.append("executive-signature")
                actor = "executive"

            match = re.match(r"(.*) Do Pass( Amended)?, (Passed|Failed)", action)
            if match:
                # A floor vote is prefixed by the chamber name; anything else
                # is taken to be a committee.
                if match.group(1) in ["Senate", "House of Representatives"]:
                    first = ""
                else:
                    first = "committee-"
                if match.group(3).lower() == "passed":
                    second = "passage"
                elif match.group(3).lower() == "failed":
                    second = "failure"
                atypes.append("%s%s" % (first, second))

            if "referred to" in action.lower():
                atypes.append("referral-committee")

            if "Motion to amend, Passed Amendment" in action:
                atypes.append("amendment-introduction")
                atypes.append("amendment-passage")
                amendment_links = row.xpath(
                    'td[2]/a[contains(@href,"Amendment.aspx")]'
                )
                if amendment_links:
                    amd = amendment_links[0]
                    version_name = amd.xpath("string(.)")
                    version_url = amd.xpath("@href")[0]
                    # Resolve the media type explicitly on every row so an
                    # unrecognised URL can neither crash on an unbound name
                    # nor inherit the type of a previous amendment.
                    if "htm" in version_url:
                        mimetype = "text/html"
                    elif "pdf" in version_url:
                        mimetype = "application/pdf"
                    else:
                        mimetype = None

                    if mimetype is None:
                        self.warning(
                            "Unknown media type for amendment: %s" % version_url
                        )
                    else:
                        bill.add_version_link(
                            version_name,
                            version_url,
                            media_type=mimetype,
                            on_duplicate="ignore",
                        )

            if "Veto override, Passed" in action:
                atypes.append("veto-override-passage")
            elif "Veto override, Failed" in action:
                atypes.append("veto-override-failure")

            if "Delivered to the Governor" in action:
                atypes.append("executive-receipt")

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == "Senate":
                    actor = "upper"
                else:
                    actor = "lower"

            date = row.xpath("string(td[1])").strip()
            match = re.match(r"\d{2}/\d{2}/\d{4}", date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib["href"])

            if action:
                bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Esempio n. 38
0
    def _parse_house_bill(self, url, session):
        """Scrape one House bill page; yield its action results, then the bill.

        The summary URL is rewritten to the print/``BillContent.aspx`` form,
        fetched, and turned into a ``Bill`` with its sponsor, cosponsors,
        actions, documents, versions, summaries and amendments.

        Args:
            url: bill summary path, relative to ``self._house_base_url``.
            session: not read by this method; the bill is created with
                ``self._session_id`` as its legislative session.

        Yields:
            Everything ``self._parse_house_actions`` yields, followed by the
            populated ``Bill``.  Nothing is yielded when the page has no
            title element (the URL is then appended to ``self._bad_urls``)
            or when the description is blank for a bill numbered above 20.

        Raises:
            AssertionError: if the "LR Number:" or "Bill String:" rows are
                not where the layout offsets expect them.
            ValueError: if the page does not contain exactly one
                ``CoSponsors.aspx`` link and one ``BillActions.aspx`` link.
        """
        # using the print page makes the page simpler, and also *drastically* smaller
        # (8k rather than 100k)
        url = re.sub("billsummary", "billsummaryprn", url)
        url = '%s/%s' % (self._house_base_url, url)

        # the URL is an iframed version now, so swap in for the actual bill page

        url = url.replace('Bill.aspx', 'BillContent.aspx')
        url = url.replace('&code=R', '&code=R&style=new')

        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

        bill_page = self.get(url).text
        bill_page = lxml.html.fromstring(bill_page)
        bill_page.make_links_absolute(url)

        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
        if len(bill_id) == 0:
            self.info("WARNING: bill summary page is blank! (%s)" % url)
            self._bad_urls.append(url)
            return
        bill_id = bill_id[0].text_content()
        bill_id = clean_text(bill_id)

        bill_desc = bill_page.xpath(
            '//*[@class="BillDescription"]')[0].text_content()
        bill_desc = clean_text(bill_desc)

        table_rows = bill_page.xpath('//table/tr')
        # if there is a cosponsor all the rows are pushed down one for the extra row
        # for the cosponsor:
        cosponsorOffset = 0
        if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
            cosponsorOffset = 1

        lr_label_tag = table_rows[3 + cosponsorOffset]
        assert lr_label_tag[0].text_content().strip() == 'LR Number:'
        # bill_lr = lr_label_tag[1].text_content()

        # A "Governor Action:" row, when present, pushes the title row down
        # by one more.
        lastActionOffset = 0
        if table_rows[4 + cosponsorOffset][0].text_content().strip(
        ) == 'Governor Action:':
            lastActionOffset = 1
        official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
        assert official_title_tag[0].text_content().strip() == 'Bill String:'
        official_title = official_title_tag[1].text_content()

        # could substitute the description for the name,
        # but keeping it separate for now.

        bill_type = "bill"
        # The first three characters of the identifier select the type.
        triplet = bill_id[:3]

        if triplet in bill_types:
            bill_type = bill_types[triplet]
            bill_number = int(bill_id[3:].strip())
        else:
            bill_number = int(bill_id[3:])

        subs = []
        bid = bill_id.replace(" ", "")

        if bid in self._subjects:
            subs = self._subjects[bid]
            self.info("With subjects for this bill")

        self.info(bid)

        if bill_desc == "":
            if bill_number <= 20:
                # blank bill titles early in session are approp. bills
                bill_desc = 'Appropriations Bill'
            else:
                self.error("Blank title. Skipping. {} / {} / {}".format(
                    bill_id, bill_desc, official_title))
                return

        bill = Bill(
            bill_id,
            chamber='lower',
            title=bill_desc,
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subs
        bill.add_title(official_title, note='official')

        bill.add_source(url)

        bill_sponsor = clean_text(table_rows[0][1].text_content())
        bill.add_sponsorship(
            bill_sponsor,
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # check for cosponsors
        sponsors_url, = bill_page.xpath(
            "//a[contains(@href, 'CoSponsors.aspx')]/@href")
        self._parse_cosponsors_from_bill(bill, sponsors_url)

        actions_link, = bill_page.xpath(
            "//a[contains(@href, 'BillActions.aspx')]/@href")
        yield from self._parse_house_actions(bill, actions_link)

        # get bill documents
        doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
        for doc_tag in reversed(doc_tags):
            doc = clean_text(doc_tag.text_content())
            text_url = doc_tag[0].attrib['href']
            # make_links_absolute() above has already resolved hrefs, so only
            # prefix the base URL when the href is still relative; prefixing
            # an absolute href would produce a doubled, broken URL.
            if not text_url.startswith(('http://', 'https://')):
                text_url = '%s%s' % (self._house_base_url, text_url)
            bill.add_document_link(doc, text_url, media_type='text/html')

        # get bill versions
        version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
        for version_tag in reversed(version_tags):
            version = clean_text(version_tag.text_content())
            for vurl in version_tag.xpath(".//a"):
                if vurl.text == 'PDF':
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version,
                                      vurl.attrib['href'],
                                      media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill versions
        # everything between the row containing "Bill Text" in an h2 and the next div.DocHeaderRow
        version_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Text")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )
        for row in version_rows:
            # some rows are just broken links, not real versions
            if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
                version = row.xpath(
                    './/div[contains(@class,"textType")]/a/text()')[0].strip()
                path = row.xpath(
                    './/div[contains(@class,"textType")]/a/@href')[0].strip()
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_version_link(version,
                                      path,
                                      media_type=mimetype,
                                      on_duplicate='ignore')

        # house bill summaries
        # everything between the row containing "Bill Summary" in an h2
        # and the next div.DocHeaderRow
        summary_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Bill Summary")]]/'
            'following-sibling::div[contains(@class,"DocRow") '
            'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
        )

        # if there are no amendments, we need a different xpath for summaries
        if not summary_rows:
            summary_rows = bill_page.xpath(
                '//div[h2[contains(text(),"Bill Summary")]]/'
                'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(summary_rows):
            link_texts = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')
            link_hrefs = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')
            # As with the version rows, a row may carry no usable link;
            # skip it instead of failing on an empty XPath result.
            if not link_texts or not link_hrefs:
                continue
            version = link_texts[0].strip()
            if version:
                path = link_hrefs[0].strip()
                summary_name = 'Bill Summary ({})'.format(version)
                if '.pdf' in path:
                    mimetype = 'application/pdf'
                else:
                    mimetype = 'text/html'
                bill.add_document_link(summary_name,
                                       path,
                                       media_type=mimetype,
                                       on_duplicate='ignore')

        # house bill amendments
        amendment_rows = bill_page.xpath(
            '//div[h2[contains(text(),"Amendment")]]/'
            'following-sibling::div[contains(@class,"DocRow")]')

        for row in reversed(amendment_rows):
            link_texts = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/text()')
            link_hrefs = row.xpath(
                './/div[contains(@class,"DocInfoCell")]/a[1]/@href')
            # Rows without a link cannot be recorded as a version; skip them.
            if not link_texts or not link_hrefs:
                continue
            version = link_texts[0].strip()
            path = link_hrefs[0].strip()
            summary_name = 'Amendment {}'.format(version)

            # Status icons, when present, are appended to the version name.
            defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
            if defeated_icon:
                summary_name = '{} (Defeated)'.format(summary_name)

            adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
            if adopted_icon:
                summary_name = '{} (Adopted)'.format(summary_name)

            distributed_icon = row.xpath(
                './/img[contains(@title,"Distributed")]')
            if distributed_icon:
                summary_name = '{} (Distributed)'.format(summary_name)

            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(summary_name,
                                  path,
                                  media_type=mimetype,
                                  on_duplicate='ignore')

        yield bill
Esempio n. 39
0
    def scrape_chamber(self, chamber, session):
        """Yield every bill of one chamber from the ``bill_status/`` endpoint.

        The full bill list is fetched from ``ksapi.url`` and filtered by the
        first letter of each bill number.  For each remaining bill a ``Bill``
        is built with its titles, sponsors and history, then
        ``self.scrape_html`` is run for it.

        Args:
            chamber: ``'upper'`` selects bill numbers starting with ``S``;
                any other value selects those starting with ``H``.
            session: legislative session passed to ``Bill`` and
                ``self.scrape_html``.

        Yields:
            Everything ``self.scrape_html`` yields for a bill, followed by
            the ``Bill`` itself.  An HTTP error while fetching the HTML is
            logged and the bill is still yielded.
        """
        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        bill_request = self.get(ksapi.url + 'bill_status/').text
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:

            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            # Order matters: 'CR' must be tested before the bare 'R'.
            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # Without this branch ``btype`` would be unbound on the first
                # bill, or silently reuse the previous bill's type.
                self.warning('unknown bill type for %s, assuming bill' %
                             bill_id)
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(
                bill_id,
                session,
                title,
                chamber=chamber,
                classification=btype,
            )
            bill.extras = {'status': bill_data['STATUS']}

            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill.title):
                bill.add_title(bill_data['LONGTITLE'])

            # A lone sponsor is the primary one; with several names, all of
            # them are recorded as cosponsors.
            sponsor_names = bill_data['SPONSOR_NAMES']
            stype = 'primary' if len(sponsor_names) == 1 else 'cosponsor'
            for sponsor in sponsor_names:
                if sponsor:
                    bill.add_sponsorship(
                        name=sponsor,
                        entity_type='person',
                        primary=stype == 'primary',
                        classification=stype,
                    )

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')

                date = datetime.datetime.strptime(event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']

                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = None
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(
                    action, date.strftime('%Y-%m-%d'), chamber=actor, classification=atype)

            try:
                yield from self.scrape_html(bill, session)
            except scrapelib.HTTPError:
                # Use the local identifier: subscripting the Bill object here
                # would raise inside the handler and hide the real problem.
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill_id))

            yield bill
Esempio n. 40
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        """Scrape one legis.iowa.gov bill and yield the resulting ``Bill``.

        ``url`` is the sidebar page; the "Bill History" link found on it
        leads to the page the title, sponsors and actions are read from.
        Version links are built from the sidebar's ``billVersions`` options.

        Args:
            chamber: chamber passed to ``Bill``.
            session: value passed as the bill's ``legislative_session``.
            session_id: identifier interpolated into the version URLs.
            bill_id: bill identifier; also the key into ``self._subjects``.
            url: address of the bill's sidebar page.

        Yields:
            The populated ``Bill``.  Nothing is yielded when the sidebar has
            no "Bill History" link, when the title is empty, or when the
            action table reports that no history is recorded.
        """
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        try:
            hist_url = sidebar.xpath(
                '//a[contains(., "Bill History")]')[0].attrib['href']
        except IndexError:
            # where is it?
            return

        page = lxml.html.fromstring(self.get(hist_url).text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div[not(@class)])').strip()

        if title == '':
            self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
            return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if 'HR' in bill_id or 'SR' in bill_id:
            bill_type = ['resolution']
        elif 'HJR' in bill_id or 'SJR' in bill_id:
            bill_type = ['joint resolution']
        elif 'HCR' in bill_id or 'SCR' in bill_id:
            bill_type = ['concurrent resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = 'https://www.legis.iowa.gov/docs/'\
            'publications/LG{}/{}/attachments/{}.html'
        version_pdf_url_template = 'https://www.legis.iowa.gov/docs/'\
            'publications/LG{}/{}/{}.pdf'

        # get pieces of version_link
        vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
        if vpieces:
            for version in vpieces:
                version_name = version.text
                version_abbrev = version.xpath('string(@value)')

                # Get HTML document of bill version.
                version_html_url = version_html_url_template.format(
                    version_abbrev.upper(), session_id,
                    bill_id.replace(' ', ''))

                bill.add_version_link(note=version_name,
                                      url=version_html_url,
                                      media_type='text/html')

                # Get PDF document of bill version.
                version_pdf_url = version_pdf_url_template.format(
                    version_abbrev.upper(), session_id,
                    bill_id.replace(' ', ''))

                bill.add_version_link(note=version_name,
                                      url=version_pdf_url,
                                      media_type='application/pdf')

        sponsors_str = page.xpath(
            "string(//div[@id='content']/div[@class='divideVert']/div[@class='divideVert'])"
        ).strip()
        if re.search('^By ', sponsors_str):
            sponsors = re.split(',| and ', sponsors_str.split('By ')[1])
        # for some bills sponsors listed in different format
        else:
            sponsors = re.findall(r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)',
                                  sponsors_str)

        # a few sponsors get mangled by our regex
        sponsor_fixes = {
            'Means': 'Ways & Means',
            'Iowa': 'Economic Growth/Rebuild Iowa',
            'Safety': 'Public Safety',
            'Resources': 'Human Resources',
            'Affairs': 'Veterans Affairs',
            'Protection': 'Environmental Protection',
            'Government': 'State Government',
            'Boef': 'De Boef'
        }

        for sponsor in sponsors:
            sponsor = sponsor.replace(' and', '').strip(' .,')
            sponsor = sponsor_fixes.get(sponsor, sponsor)

            # Splitting "A, B, and C" leaves an empty fragment between the
            # comma and " and "; skip it rather than index into ''.
            if not sponsor:
                continue

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        # URLs already attached as versions; built once and kept in sync as
        # amendment links are added below.
        version_urls = {
            link['url']
            for version in bill.versions
            for link in version['links']
        }

        for tr in page.xpath(
                "//table[contains(@class, 'billActionTable')]/tbody/tr"):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[2])").strip()
            action = re.sub(r'\s+', ' ', action)

            # Capture any amendment links.
            if 'amendment' in action.lower():
                for anchor in tr.xpath('td[2]/a'):
                    # anchor.text is None when the link has no leading text.
                    if anchor.text and '-' in anchor.text:
                        # These links aren't given hrefs for some reason
                        # (needs to be fixed upstream)
                        try:
                            url = anchor.attrib['href']
                        except KeyError:
                            continue

                        if url not in version_urls:
                            bill.add_version_link(note=anchor.text,
                                                  url=url,
                                                  media_type='text/html')
                            version_urls.add(url)

            # Journal / committee-substitute markers identify the chamber.
            if 'S.J.' in action or 'SCS' in action:
                actor = 'upper'
            elif 'H.J.' in action or 'HCS' in action:
                actor = 'lower'
            else:
                actor = "legislature"

            # Drop the trailing journal reference from the description.
            action = re.sub(r'(H|S)\.J\.\s+\d+\.$', '', action).strip()

            if action.startswith('Introduced'):
                atype = ['introduction']
                if ', referred to' in action:
                    atype.append('referral-committee')
            elif action.startswith('Read first time'):
                atype = 'reading-1'
            elif action.startswith('Referred to'):
                atype = 'referral-committee'
            elif action.startswith('Sent to Governor'):
                atype = 'executive-receipt'
            elif action.startswith('Reported Signed by Governor'):
                atype = 'executive-signature'
            elif action.startswith('Signed by Governor'):
                atype = 'executive-signature'
            elif action.startswith('Vetoed by Governor'):
                atype = 'executive-veto'
            elif action.startswith('Item veto'):
                atype = 'executive-veto-line-item'
            elif re.match(r'Passed (House|Senate)', action):
                atype = 'passage'
            elif re.match(r'Amendment (S|H)-\d+ filed', action):
                atype = ['amendment-introduction']
                if ', adopted' in action:
                    atype.append('amendment-passage')
            elif re.match(r'Amendment (S|H)-\d+( as amended,)? adopted',
                          action):
                atype = 'amendment-passage'
            # Same S-/H- prefixes as the filed/adopted patterns above.
            elif re.match(r'Amendment (S|H)-\d+ lost', action):
                atype = 'amendment-failure'
            elif action.startswith('Resolution filed'):
                atype = 'introduction'
            elif action.startswith('Resolution adopted'):
                atype = 'passage'
            elif (action.startswith('Committee report')
                  and action.endswith('passage.')):
                atype = 'committee-passage'
            elif action.startswith('Withdrawn'):
                atype = 'withdrawal'
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search(r'END OF \d+ ACTIONS', action):
                continue

            if '$history' not in action:
                bill.add_action(description=action,
                                date=date,
                                chamber=actor,
                                classification=atype)

        for subject in self._subjects[bill_id]:
            bill.add_subject(subject['Name'])

        yield bill
Esempio n. 41
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {
                "Senate": "upper",
                "House": "lower",
                "House of Representatives": "lower",
                "house": "lower",
                "senate": "upper"
            }

            # so presumanbly not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {
                "approved": True,
                "passed": True,
                "adopted": True,
                "true": True,
                "false": False,
                "failed": False,
                True: True,
                False: False
            }

            action_dict = {
                "ref_ctte_100": "referral-committee",
                "intro_100": "introduction",
                "pass_300": "passage",
                "intro_110": "reading-1",
                "refer_210": "referral-committee",
                "crpt_301": None,
                "crpt_317": None,
                "concur_606": "passage",
                "pass_301": "passage",
                "refer_220": "referral-committee",
                "intro_102": ["introduction", "passage"],
                "intro_105": ["introduction", "passage"],
                "intro_ref_ctte_100": "referral-committee",
                "refer_209": None,
                "intro_108": ["introduction", "passage"],
                "intro_103": ["introduction", "passage"],
                "msg_reso_503": "passage",
                "intro_107": ["introduction", "passage"],
                "imm_consid_360": "passage",
                "refer_213": None,
                "adopt_reso_100": "passage",
                "msg_507": "amendment-passage",
                "confer_713": None,
                "concur_603": None,
                "confer_712": None,
                "msg_506": "amendment-failure",
                "receive_message_100": "passage",
                "motion_920": None,
                "concur_611": None,
                "confer_735": None
            }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(
                session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(
                first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url,
                                                     "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url,
                                                      "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url,
                                                      "analysiss")

            for row in self.get_bill_rows(session):
                number_link, _ga, title, primary_sponsor, status = row.xpath(
                    'td')

                bill_id = number_link.text_content()
                title = title.text_content().strip()
                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = (
                    'http://search-prod.lis.state.oh.us/solarapi/v1/'
                    'general_assembly_{}/{}/{}/'.format(
                        session, 'bills' if 'B' in bill_id else 'resolutions',
                        bill_id.lower().replace(' ', '')))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url + version["pdfDownloadLink"]
                    bill.add_version_link(version_name,
                                          version_link,
                                          media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(sponsor_name,
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                        sponsor_name,
                        classification='cosponsor',
                        entity_type='person',
                        primary=False,
                    )

                try:
                    action_doc = self.get(base_url +
                                          bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning(
                                "Unknown action {desc} with code {code}."
                                " Add it to the action_dict"
                                ".".format(desc=action_desc,
                                           code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(
                            datetime.datetime.strptime(action["datetime"],
                                                       "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date,
                                        chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill,
                                  base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill,
                                  base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill,
                                  base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill,
                                  base_url)

                # votes
                vote_url = base_url + bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url, base_url, bill,
                                             legislators, chamber_dict,
                                             vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url, base_url, bill,
                                             legislators, chamber_dict,
                                             vote_results)

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url + bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url + bill_version["disapprove"][0][
                        "link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError(
                            "Whoa, a disapprove! We've never"
                            " gotten one before."
                            " Go write some code to deal "
                            "with it: {}".format(disapprove_url))

                yield bill
Esempio n. 42
0
    def scrape(self, session=None):
        """Scrape DC Council legislation and its votes for one council period.

        Pages through the LIMS ``GetPublicAdvancedSearch`` endpoint, fetches
        the detail record of every search hit from ``GetPublicData`` and turns
        it into a ``Bill`` carrying sponsors, alternate titles, actions and
        documents.

        Args:
            session: Council period to scrape. When falsy,
                ``self.latest_session()`` is used instead.

        Yields:
            For every bill: each truthy result of ``self.process_vote`` and
            ``self.process_committee_vote`` (with the bill's LIMS page added
            as a source), followed by the ``Bill`` itself.  Agenda items
            (ids starting with ``AG``) and bills without any usable action
            date are skipped and yield nothing.
        """
        if not session:
            session = self.latest_session()
            self.info('no session specified, using %s', session)

        # get member id matching for vote parsing
        member_ids = self.get_member_ids()[session]
        per_page = 10  # seems like it gives 10 no matter what.
        start_record = 0

        headers = {"Content-Type": "application/json"}
        url = ("http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/"
               "GetPublicAdvancedSearch")
        bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
        params = {
            "request": {
                "sEcho": 2,
                "iColumns": 4,
                "sColumns": "",
                "iDisplayStart": 0,
                "iDisplayLength": per_page,
                "mDataProp_0": "ShortTitle",
                "mDataProp_1": "Title",
                "mDataProp_2": "LegislationCategories",
                "mDataProp_3": "Modified",
                "iSortCol_0": 0,
                "sSortDir_0": "asc",
                "iSortingCols": 0,
                "bSortable_0": "true",
                "bSortable_1": "true",
                "bSortable_2": "true",
                "bSortable_3": "true"
            },
            "criteria": {
                "Keyword": "",
                "Category": "",
                "SubCategoryId": "",
                "RequestOf": "",
                "CouncilPeriod": str(session),
                "Introducer": "",
                "CoSponsor": "",
                "CommitteeReferral": "",
                "CommitteeReferralComments": "",
                "StartDate": "",
                "EndDate": "",
                "QueryLimit": 100,
                "FilterType": "",
                "Phases": "",
                "LegislationStatus": "0",
                "IncludeDocumentSearch": "false"
            }
        }

        # they messed up Phil Mendelson's name
        name_fixes = {"Phil Pmendelson": "Phil Mendelson"}

        # Module-level list that is reset for every search hit below.
        # NOTE(review): presumably read by add_documents to avoid attaching
        # the same version twice -- confirm against that helper.
        global bill_versions

        while True:
            # fetch the current page of search results
            params["request"]["iDisplayStart"] = start_record
            response = self.post(url, headers=headers, data=json.dumps(params))
            # the response is a terrible string-of-nested-json-strings. Yuck.
            data = decode_json(response.json()["d"])["aaData"]
            if not data:
                break

            for row in data:
                # sometimes they're in there more than once, so we'll keep track
                bill_versions = []

                bill_id = row["Title"]
                if bill_id.startswith("AG"):
                    # actually an agenda, skip
                    continue
                bill_params = {"legislationId": bill_id}
                bill_info = self.post(bill_url,
                                      headers=headers,
                                      data=json.dumps(bill_params))
                bill_info = decode_json(bill_info.json()["d"])["data"]
                bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

                legislation_info = bill_info["Legislation"][0]
                title = legislation_info["ShortTitle"]

                if bill_id.startswith("R") or bill_id.startswith("CER"):
                    bill_type = "resolution"
                else:
                    bill_type = "bill"

                bill = Bill(bill_id,
                            legislative_session=session,
                            title=title,
                            classification=bill_type)

                # sponsors and cosponsors
                if "Introducer" in legislation_info:
                    introducers = legislation_info["Introducer"]
                    intro_date = self.date_format(
                        legislation_info["IntroductionDate"])
                    bill.add_action("Introduced",
                                    intro_date,
                                    classification="introduction")
                else:
                    # sometimes there are introducers, sometimes not.
                    # Set Introducers to empty array to avoid downstream breakage,
                    # but log bills without introducers
                    self.logger.warning("No Introducer: {0}".format(
                        bill.identifier))
                    introducers = []

                # sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info.get("CoSponsor", [])

                for introducer in introducers:
                    name = introducer["Name"]
                    bill.add_sponsorship(name_fixes.get(name, name),
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)

                for cosponsor in cosponsors:
                    name = cosponsor["Name"]
                    bill.add_sponsorship(name=name_fixes.get(name, name),
                                         classification="cosponsor",
                                         entity_type='person',
                                         primary=False)

                # if it's become law, add the law number as an alternate title;
                # also sometimes it's got an act number
                for number_key in ("LawNumber", "ActNumber"):
                    number = legislation_info.get(number_key)
                    if number:
                        bill.add_title(number)

                # sometimes AdditionalInformation has a previous bill name
                if "AdditionalInformation" in legislation_info:
                    add_info = legislation_info["AdditionalInformation"]
                    if "previously" in add_info.lower():
                        prev_title = add_info.lower().replace(
                            "previously", "").strip().replace(" ", "")
                        bill.add_title(prev_title.upper())
                    elif add_info:
                        bill.extras["additional_information"] = add_info

                if "WithDrawnDate" in legislation_info:
                    withdrawn_date = self.date_format(
                        legislation_info["WithDrawnDate"])
                    withdrawn_by = legislation_info["WithdrawnBy"][0][
                        "Name"].strip()
                    if withdrawn_by == "the Mayor":
                        bill.add_action("withdrawn",
                                        withdrawn_date,
                                        chamber="executive",
                                        classification="withdrawal")
                    else:
                        # anything mentioning a committee is an organization,
                        # everything else is treated as a person
                        if "committee" in withdrawn_by.lower():
                            withdrawer_type = 'organization'
                        else:
                            withdrawer_type = 'person'
                        a = bill.add_action("withdrawn",
                                            withdrawn_date,
                                            classification="withdrawal")
                        a.add_related_entity(withdrawn_by,
                                             entity_type=withdrawer_type)

                # deal with actions involving the mayor
                mayor = bill_info["MayorReview"]
                if mayor:
                    mayor = mayor[0]

                    if "TransmittedDate" in mayor:
                        transmitted_date = self.date_format(
                            mayor["TransmittedDate"])

                        bill.add_action("transmitted to mayor",
                                        transmitted_date,
                                        chamber="executive",
                                        classification="executive-receipt")

                    if 'SignedDate' in mayor:
                        signed_date = self.date_format(mayor["SignedDate"])

                        bill.add_action("signed",
                                        signed_date,
                                        chamber="executive",
                                        classification="executive-signature")

                    # if returned but not signed, it was vetoed
                    elif 'ReturnedDate' in mayor:
                        veto_date = self.date_format(mayor["ReturnedDate"])

                        bill.add_action("vetoed",
                                        veto_date,
                                        chamber="executive",
                                        classification="executive-veto")

                        # if it was returned and enacted but not signed, there was a veto override
                        if 'EnactedDate' in mayor:
                            override_date = self.date_format(
                                mayor["EnactedDate"])

                            bill.add_action(
                                "veto override",
                                override_date,
                                classification="veto-override-passage")

                    if 'AttachmentPath' in mayor:
                        # documents relating to the mayor's review
                        self.add_documents(mayor["AttachmentPath"], bill)

                congress = bill_info["CongressReview"]
                if congress:
                    congress = congress[0]
                    if "TransmittedDate" in congress:
                        transmitted_date = self.date_format(
                            congress["TransmittedDate"])

                        bill.add_action("Transmitted to Congress for review",
                                        transmitted_date)

                # deal with committee actions; they are dated by the reading
                # date when there is one, else by the introduction date
                if "DateRead" in legislation_info:
                    date = legislation_info["DateRead"]
                elif "IntroductionDate" in legislation_info:
                    date = legislation_info["IntroductionDate"]
                else:
                    self.logger.warning(
                        "we can't find anything that looks like an "
                        "action date. Skipping")
                    continue
                date = self.date_format(date)
                if "CommitteeReferral" in legislation_info:
                    committees = []
                    for committee in legislation_info["CommitteeReferral"]:
                        if committee["Name"].lower() == "retained by the council":
                            # not a real referral: drop anything collected so far
                            committees = []
                            break
                        committees.append(committee["Name"])
                    if committees:
                        a = bill.add_action(
                            "referred to committee",
                            date,
                            classification="referral-committee")
                        for com in committees:
                            a.add_related_entity(com,
                                                 entity_type='organization')

                if "CommitteeReferralComments" in legislation_info:
                    a = bill.add_action("comments from committee", date)
                    for committee in legislation_info[
                            "CommitteeReferralComments"]:
                        a.add_related_entity(committee["Name"],
                                             entity_type='organization')

                # deal with random docs floating around
                for doc in bill_info["OtherDocuments"]:
                    if "AttachmentPath" in doc:
                        self.add_documents(doc["AttachmentPath"], bill)
                    else:
                        self.logger.warning(
                            "Document path missing from 'Other Documents'")

                if "MemoLink" in legislation_info:
                    self.add_documents(legislation_info["MemoLink"], bill)

                if "AttachmentPath" in legislation_info:
                    self.add_documents(legislation_info["AttachmentPath"],
                                       bill)

                # full council votes
                for vote in bill_info["VotingSummary"]:
                    v = self.process_vote(vote, bill, member_ids)
                    if v:
                        v.add_source(bill_source_url)
                        yield v

                # deal with committee votes
                for committee_action in bill_info.get("CommitteeMarkup") or []:
                    v = self.process_committee_vote(committee_action, bill)
                    if v:
                        v.add_source(bill_source_url)
                        yield v
                    # Attach the documents of this committee action itself.
                    # The key has to be looked up on the individual action,
                    # not on the enclosing list or on a full-council vote.
                    if "AttachmentPath" in committee_action:
                        self.add_documents(committee_action["AttachmentPath"],
                                           bill)

                bill.add_source(bill_source_url)
                yield bill

            # get next page
            start_record += per_page
Esempio n. 43
0
    def scrape_bill_type(
            self,
            chamber,
            session,
            bill_type,
            type_abbr,
            committee_abbr_regex=get_committee_name_regex(),
    ):
        bills = (self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr))

        for bill in bills:
            bill_session = session
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title="", chamber=chamber)
            if (bill_id.startswith("S")
                    and chamber == "lower") or (bill_id.startswith("A")
                                                and chamber == "upper"):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ("http://leginfo.legislature.ca.gov/faces/"
                          "billNavClient.xhtml?bill_id=%s") % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type="text/html")

            title = ""
            type_ = ["bill"]
            subject = ""
            all_titles = set()

            # Get digest text (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = "//caml:DigestText/xhtml:p"
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r"\s+", " ", t)
                    t = re.sub(r"\)(\S)", lambda m: ") %s" % m.group(1), t)
                    chunks.append(t)
                summary = "\n\n".join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime("%m/%d/%y")
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type="application/pdf",
                    date=version_date.date(),
                )

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ("AB", "SB"):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(
                            version.short_title) and not version.title.lower(
                            ).startswith("an act"):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == "Yes":
                    type_.append("appropriation")

                tags = []
                if version.fiscal_committee == "Yes":
                    tags.append("fiscal committee")
                if version.local_program == "Yes":
                    tags.append("local program")
                if version.urgency == "Yes":
                    tags.append("urgency")
                if version.taxlevy == "Yes":
                    tags.append("tax levy")

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note="summary")
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras["impact_clause"] = impact_clause
            fsbill.extras["tags"] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == "Y",
                    entity_type="person",
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {
                        "Assembly": "lower",
                        "Senate": "upper"
                    }[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                "Assembly": "lower",
                                "Senate": "upper"
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r"^(Assembly|Senate)", replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r"\s+", " ", act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r"Com[s]?. on",
                             action.action) and not matched_abbrs:
                    msg = "Failed to extract committee abbr from %r."
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ("Mapping contains no committee name for "
                                   "abbreviation %r. Action text was %r.")
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs["committees"] = committees

                    code = re.search(r"C[SXZ]\d+", actor)
                    if code is not None:
                        code = code.group()
                        kwargs["actor_info"] = {"committee_code": code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace("Coms. on ", "")
                        act_str = act_str.replace("Com. on " + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith("."):
                            act_str = act_str + "."

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ["upper", "lower", "legislature"]:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = "legislature"

                if actor != action.actor:
                    actor_info = kwargs.get("actor_info", {})
                    actor_info["details"] = action.actor
                    kwargs["actor_info"] = actor_info

                # Add strings for related legislators, if any.
                rgx = r"(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+"
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs["legislators"] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime("%Y-%m-%d"),
                    chamber=actor,
                    classification=kwargs["classification"],
                )
                for committee in kwargs.get("committees", []):
                    action.add_related_entity(committee,
                                              entity_type="organization")
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ""
                else:
                    motion = ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r"(\w+)( Extraordinary)? Session$",
                                    re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ",
                                    re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ", "",
                                motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "",
                                motion)
                motion = re.sub(
                    r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? "
                    r"Urgency Clause$",
                    "(Urgency Clause)",
                    motion,
                )
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result="pass" if result else "fail",
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {"threshold": vote.threshold}

                source_url = ("http://leginfo.legislature.ca.gov/faces"
                              "/billVotesClient.xhtml?bill_id={}").format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + "#" + str(vote_num)

                rc = {"yes": [], "no": [], "other": []}
                for record in vote.votes:
                    if record.vote_code == "AYE":
                        rc["yes"].append(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        rc["no"].append(record.legislator_name)
                    else:
                        rc["other"].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Esempio n. 44
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Scrape one bill detail page and yield the objects built from it.

        A ``Bill`` is assembled from the page (type, summary, subjects,
        sponsors, versions and actions).  When the page links to a vote
        history, everything produced by ``scrape_vote_history`` is yielded
        first; the bill itself is always yielded last.  Nothing is yielded
        when the page reports an invalid bill number.

        :param bill_detail_url: URL of the bill detail page to fetch
        :param session: value handed to ``Bill`` as ``legislative_session``
        :param chamber: value handed to ``Bill`` as ``chamber``
        :param bill_id: bill identifier; also the key into ``self._subjects``
        :return: generator of vote-history items followed by the ``Bill``
        :raises ValueError: if the bill type text on the page is not one of
            the recognised kinds
        """
        html_text = self.get(bill_detail_url).text

        if 'INVALID BILL NUMBER' in html_text:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(html_text)
        doc.make_links_absolute(bill_detail_url)

        container = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        # Checked in order: the bare 'Resolution' marker has to come last
        # because it is a substring of the two more specific markers.
        type_markers = (
            ('General Bill', 'bill'),
            ('Concurrent Resolution', 'concurrent resolution'),
            ('Joint Resolution', 'joint resolution'),
            ('Resolution', 'resolution'),
        )
        raw_type = container.xpath('span/text()')[0]
        for marker, bill_type in type_markers:
            if marker in raw_type:
                break
        else:
            raise ValueError('unknown bill type: %s' % raw_type)

        # Fragile: the summary is the tail text of whichever element follows
        # the bold "Summary:" label.
        summary_label = container.xpath('./b[text()="Summary:"]')[0]
        summary = summary_label.getnext().tail.strip()

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=summary,
                    classification=bill_type)

        for subject in list(self._subjects[bill_id]):
            bill.add_subject(subject)

        # sponsors: links to member pages are people, links to committee
        # pages are organizations
        for member_name in doc.xpath(
                '//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(name=member_name,
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')
        for committee_name in doc.xpath(
                '//a[contains(@href, "committee.php")]/text()'):
            # committee names carry non-breaking spaces; normalise them
            committee_name = committee_name.replace(u'\xa0', ' ').strip()
            bill.add_sponsorship(name=committee_name,
                                 classification='primary',
                                 primary=True,
                                 entity_type='organization')

        # versions live on a separate "full text" page
        full_text_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        versions_page = lxml.html.fromstring(self.get(full_text_url).text)
        versions_page.make_links_absolute(full_text_url)
        for link in versions_page.xpath('//a[contains(@href, "/prever/")]'):
            # the same version can be listed more than once; keep the first
            bill.add_version_link(note=link.text,
                                  url=link.get('href'),
                                  on_duplicate='ignore',
                                  media_type='text/html')

        # actions: one table row per action (date, chamber, description)
        chamber_by_label = {
            'Senate': 'upper',
            'House': 'lower',
            None: 'legislature',
        }
        for tr in container.xpath('table/tr'):
            date_cell, chamber_cell, action_cell = tr.xpath('td')

            action_date = datetime.datetime.strptime(date_cell.text,
                                                     "%m/%d/%y")
            action_chamber = chamber_by_label[chamber_cell.text]

            # drop the trailing journal reference, if any
            description = action_cell.text_content()
            for journal_marker in ('(House Journal', '(Senate Journal'):
                description = description.split(journal_marker)[0]
            description = description.strip()

            atype = action_type(description)

            bill.add_action(description=description,
                            date=action_date.strftime('%Y-%m-%d'),
                            chamber=action_chamber,
                            classification=atype)

        # votes
        vote_links = doc.xpath('//a[text()="View Vote History"]/@href')
        if vote_links:
            yield from self.scrape_vote_history(bill, vote_links[0])

        bill.add_source(bill_detail_url)
        yield bill
Esempio n. 45
0
    def scrape_matter(self, matter_link, sess):
        """
        Scrape a single matter page and yield it as a ``Bill``.

        The page's info table (via ``self.matter_table_to_dict``) supplies
        the identifier, title, file type, sponsors, index subjects and
        abstract; ``self.process_action_table`` is then called with the page
        and the bill.  Nothing is yielded when the matter has neither a
        "File Name" nor a non-blank "Title".

        :param matter_link: URL of the matter page
        :param sess: mapping whose ``"identifier"`` entry is used as the
            bill's legislative session
        :return: generator yielding at most one ``Bill``
        :raises KeyError: if the page's "File Type" is not a key of the
            ``matter_types`` table below
        """
        # Maps the site's "File Type" values to bill classifications.
        # 'other' means "create the Bill without a classification".
        matter_types = {
            "Additions": "other",
            "Administrative Order": "order",
            "Annual Evaluation": "other",
            "Bid Advertisement": "other",
            "Bid Awards": "other",
            "Bid Contract": "contract",
            "Bid Protest": "other",
            "Bid Rejection": "other",
            "Birthday Scroll": "commemoration",
            "Certificate of Appreciation": "commemoration",
            "Change Order": "order",
            "Citizen's Presentation": "other",
            "Commendation": "commemoration",
            "Conflict Waiver": "other",
            "Congratulatory Certificate": "commemoration",
            "Deferrals": "other",
            "Discussion Item": "other",
            "Distinguished Visitor": "other",
            "Joint Meeting/Workshop": "other",
            "Mayoral Veto": "other",
            "Miscellaneous": "other",
            "Nomination": "nomination",
            "Oath of Office": "other",
            "Omnibus Reserve": "bill",
            "Ordinance": "ordinance",
            "Plaque": "commemoration",
            "Presentation": "other",
            "Proclamation": "proclamation",
            "Professional Service Agreement": "contract",
            "Public Hearing": "other",
            "Report": "other",
            "Request for Proposals": "other",
            "Request for Qualifications": "other",
            "Request to Advertise": "other",
            "Resolution": "resolution",
            "Resolution of Sympathy": "resolution",
            "Service Awards": "commemoration",
            "Special Item": "other",
            "Special Presentation": "other",
            "Supplement": "other",
            "Swearing-In": "other",
            "Time Sensitive Items": "other",
            "Withdrawals": "other",
            "Workshop Item": "other",
            "Zoning": "other",
            "Zoning Resolution": "resolution"
        }
        matter_doc = self.lxmlize(matter_link)
        info_dict = self.matter_table_to_dict(matter_doc)
        session = sess["identifier"]

        # A matter without a file type is treated as 'other'; an unknown
        # file type is deliberately left to raise KeyError.
        if "File Type" in info_dict:
            category = matter_types[info_dict["File Type"]]
        else:
            category = 'other'

        if 'File Name' in info_dict:
            title = info_dict["File Name"]
        elif "Title" in info_dict and info_dict["Title"].strip():
            title = info_dict["Title"].strip()
        else:
            self.warning("bill has no title")
            return

        bill_kwargs = {
            "identifier": info_dict["File Number"],
            "legislative_session": session,
            "title": title,
        }
        if category != 'other':
            bill_kwargs["classification"] = category
        bill = Bill(**bill_kwargs)

        for spons in info_dict["Sponsors"]:
            if spons == "NONE":
                continue
            # Entries look like "<name>,<sponsor type>"; an entry without a
            # comma is a plain sponsor.
            try:
                name, spons_type = spons.rsplit(",", 1)
            except ValueError:
                name = spons
                spons_type = "Sponsor"
            primary = "Prime Sponsor" in spons_type
            # Committee sponsors are organizations, everyone else a person.
            # NOTE(review): the match is case-sensitive, so a name spelled
            # "... Committee" is still treated as a person -- confirm
            # whether that is intended.
            entity = "organization" if "committee" in name else "person"
            bill.add_sponsorship(name, spons_type, entity, primary)

        if "Indexes" in info_dict:
            for subj in info_dict["Indexes"]:
                subj = subj.strip()
                if subj and subj != "NONE":
                    bill.add_subject(subj)

        if "Title" in info_dict and info_dict["Title"].strip():
            # The page's "Note", when present and non-blank, replaces the
            # default note attached to the abstract.
            note = "bill's long title'"
            if "Note" in info_dict and info_dict["Note"].strip():
                note = info_dict["Note"]
            bill.add_abstract(abstract=info_dict["Title"], note=note)

        self.process_action_table(matter_doc, bill)
        bill.add_source(matter_link, note='web')

        yield bill
Esempio n. 46
0
    def parse_bill(self, chamber, session, bill_id, url):
        """
        Scrape a single bill page and yield the resulting ``Bill``.

        Collects the title, text versions, fiscal notes, sponsors, actions
        and the vote-history document link from the page at ``url``.
        Nothing is yielded when the page has no ``bill.doc``/``bill.pdf``
        link, which is treated as a withdrawn bill.

        :param chamber: chamber handed to ``Bill``; also the fallback actor
            for actions that do not name a chamber
        :param session: legislative session; on the pre-2016 page layout its
            first four characters are used as the year of every action date
        :param bill_id: bill identifier, also the key into ``self._subjects``
        :param url: URL of the bill page
        :return: generator yielding at most one ``Bill``
        """

        def media_type_for(link_url):
            # Media type implied by a link's extension; '' when unknown so
            # that an odd URL never reuses a previous link's type.
            if link_url.endswith('.doc'):
                return 'application/msword'
            if link_url.endswith('.pdf'):
                return 'application/pdf'
            return ''

        page = self.lxmlize(url)

        short_bill_id = re.sub(r'(H|S)([JC])R', r'\1\2', bill_id)
        version_link_node = self.get_node(
            page,
            '//a[contains(@href, "{bill_id}/bill.doc") or contains(@href,'
            '"{bill_id}/bill.pdf")]'.format(bill_id=short_bill_id))

        if version_link_node is None:
            # Bill withdrawn
            self.logger.warning('Bill withdrawn.')
            return

        source_url = version_link_node.attrib['href']
        mimetype = media_type_for(source_url)

        if self._is_post_2016:
            title_texts = self.get_nodes(
                page, '//div[@class="StandardText leftDivMargin"]/text()')
            title_texts = list(
                filter(None, [text.strip() for text in title_texts]))
            title_texts = [
                s for s in title_texts if s != ',' and not s.startswith('(BR ')
            ]
            title = ' '.join(title_texts)

            actions = self.get_nodes(
                page, '//div[@class="StandardText leftDivMargin"]/'
                'div[@class="StandardText"][last()]//text()[normalize-space()]'
            )
        else:
            pars = version_link_node.xpath("following-sibling::p")

            if len(pars) == 2:
                title = pars[0].xpath("string()")
                action_p = pars[1]
            else:
                title = pars[0].getprevious().tail
                if not title:
                    self.warning(
                        'walking backwards to get bill title, error prone!')
                    title = pars[0].getprevious().getprevious()
                    while not title.tail:
                        title = title.getprevious()
                    title = title.tail
                    self.warning('got title the dangerous way: %s' % title)
                action_p = pars[0]

            title = re.sub(r'[\s\xa0]+', ' ', title).strip()
            actions = action_p.xpath("string()").split("\n")

        # Order matters: 'R' alone also matches the CR and JR identifiers.
        if 'CR' in bill_id:
            bill_type = 'concurrent resolution'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.subject = self._subjects[bill_id]
        bill.add_source(url)

        bill.add_version_link("Most Recent Version",
                              source_url,
                              media_type=mimetype)

        other_versions = page.xpath(
            '//a[contains(@href, "/recorddocuments/bill/") and'
            ' not(contains(@href, "/bill.pdf")) and'
            ' not(contains(@href, "/bill.doc")) and'
            ' not(contains(@href, "/LM.pdf"))]')

        for version_link in other_versions:
            version_url = version_link.attrib['href']
            version_title = version_link.xpath('text()')[0]
            bill.add_version_link(version_title,
                                  version_url,
                                  media_type=media_type_for(version_url))

        # LM is "Locally Mandated fiscal impact"
        fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
        for fiscal_note in fiscal_notes:
            note_url = fiscal_note.attrib['href']
            bill.add_document_link("Fiscal Note",
                                   note_url,
                                   media_type=media_type_for(note_url))

        for link in page.xpath("//a[contains(@href, 'legislator/')]"):
            bill.add_sponsorship(link.text.strip(),
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*'
                        r'( and \([a-z\d\-]+\))? filed')

        # Most recent successfully parsed date; reused for lines whose date
        # prefix cannot be parsed (e.g. veto document links).
        cached_action_date = None

        for line in actions:
            line_actions = line.strip().split(';')

            for index, action in enumerate(line_actions):
                action = action.strip()
                if not action:
                    continue

                # Every action on a line shares the date that prefixes it.
                action_date_text = line.split('-')[0].strip()
                if self._is_post_2016:
                    action_date_string = action_date_text.replace(',', '')
                else:
                    action_date_string = '{} {}'.format(
                        action_date_text, session[0:4])

                # This patch is super hacky, but allows us to better
                # capture actions that screw up the formatting such as
                # veto document links.
                try:
                    action_date = datetime.datetime.strptime(
                        action_date_string, '%b %d %Y')
                    cached_action_date = action_date
                    used_cached_action_date = False
                except ValueError:
                    action_date = cached_action_date
                    used_cached_action_date = True

                if action_date is None:
                    # No date on this line and none seen earlier: there is
                    # nothing sensible to attach the action to.
                    self.warning(
                        'skipping action with no usable date: %s' % action)
                    continue

                # Separate the date prefix from the first action on the
                # line.
                if index == 0 and not used_cached_action_date:
                    action = action.partition('-')[2].strip()
                    if not action:
                        continue

                if action.endswith(('House', '(H)')):
                    actor = 'lower'
                elif action.endswith(('Senate', '(S)')):
                    actor = 'upper'
                else:
                    actor = chamber

                # For chamber passage,
                # the only way to determine chamber correctly is
                # how many total people voted on it
                if action.startswith('3rd reading'):
                    votes = re.search(r'(\d+)\-(\d+)', action)
                    if votes:
                        total_votes = int(votes.group(1)) + int(votes.group(2))
                        # 50 is the quorum for the house,
                        # and more than the number of senators
                        if total_votes > 50:
                            actor = 'lower'
                        elif 20 < total_votes < 50:
                            actor = 'upper'

                atype = []
                if 'introduced in' in action:
                    atype.append('introduction')
                    if 'to ' in action:
                        atype.append('referral-committee')
                elif 'signed by Governor' in action:
                    atype.append('executive-signature')
                elif 'vetoed' in action:
                    atype.append('executive-veto')

                    # Get the accompanying veto message document. There
                    # should only be one.
                    veto_document_link = self.get_node(
                        page, '//div[@class="StandardText leftDivMargin"]/'
                        'div[@class="StandardText"][last()]/a[contains(@href,'
                        '"veto.pdf")]')

                    if veto_document_link is not None:
                        bill.add_document_link(
                            "Veto Message",
                            veto_document_link.attrib['href'],
                            on_duplicate='ignore')
                elif re.match(r'^to [A-Z]', action):
                    atype.append('referral-committee')
                elif action == 'adopted by voice vote':
                    atype.append('passage')

                if '1st reading' in action:
                    atype.append('reading-1')
                if '3rd reading' in action:
                    atype.append('reading-3')
                    if 'passed' in action:
                        atype.append('passage')
                if '2nd reading' in action:
                    atype.append('reading-2')
                if 'delivered to secretary of state' in action.lower():
                    atype.append('became-law')

                if 'veto overridden' in action.lower():
                    atype.append('veto-override-passage')

                # The exact-match branch above may already have recorded
                # the passage; do not list it twice.
                if ('R' in bill_id and 'adopted by voice vote' in action
                        and 'passage' not in atype):
                    atype.append('passage')

                if re.search(amendment_re, action):
                    atype.append('amendment-introduction')

                if not atype:
                    atype = None

                # Capitalize the first letter of the action for nicer
                # display. capitalize() won't work for this because it
                # lowercases all other letters.
                action = action[0].upper() + action[1:]

                action_date = timezone('America/Kentucky/Louisville').localize(
                    action_date)
                action_date = action_date.strftime('%Y-%m-%d')

                bill.add_action(action,
                                action_date,
                                chamber=actor,
                                classification=atype)

        vote_links = page.xpath("//a[contains(@href, 'vote_history.pdf')]")
        if vote_links:
            bill.add_document_link("Vote History",
                                   vote_links[0].attrib['href'])
        else:
            # No votes
            self.logger.warning(u'No votes found for {}'.format(title))

        # Ugly Hack Alert!
        # On the pre-2016 layout the year of every action is taken from the
        # session, so actions listed before the introduction but dated
        # after it really belong to the previous year.  Dates are
        # 'YYYY-MM-DD' strings here, so they compare chronologically.
        if not self._is_post_2016:
            intro_index = None
            for i, act in enumerate(bill.actions):
                if 'introduction' in act['classification']:
                    intro_index = i
                    break

            if intro_index is not None:
                intro_date = bill.actions[intro_index]['date']
                for act in bill.actions[:intro_index]:
                    if act['date'] > intro_date:
                        year, rest = act['date'][:4], act['date'][4:]
                        act['date'] = '{:04d}{}'.format(int(year) - 1, rest)
                        self.debug('corrected year for %s',
                                   act['description'])

        yield bill
Esempio n. 47
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """
        Scrape a single bill page and yield what is found on it.

        Builds a ``Bill`` with its sponsors, text versions, related
        documents and subjects, yields everything produced by
        ``self.parse_status`` for the page's status table, and finally
        yields the bill itself.

        :param chamber: chamber handed to ``Bill`` and to ``parse_status``
        :param session: legislative session handed to ``Bill``
        :param bill_id: identifier as shown on the site (e.g. containing
            ``'.B. '`` or ``'.C.R. '``); any ``SUB_BLACKLIST`` flags are
            removed before it is stored on the bill
        :param url: URL of the bill page
        :return: generator of ``parse_status`` items followed by the ``Bill``
        :raises ValueError: if ``bill_id`` matches none of the known bill
            type patterns
        :raises AssertionError: if the sponsor sections of the page do not
            have the expected labels or shape
        """
        page = self.lxmlize(url)

        (header, ) = page.xpath('//h3[@class="heading"]/text()')
        title = header.replace(bill_id, "").strip()

        if '.B. ' in bill_id:
            bill_type = 'bill'
        elif bill_id.startswith(('H.R. ', 'S.R. ')):
            bill_type = 'resolution'
        elif '.C.R. ' in bill_id:
            bill_type = 'concurrent resolution'
        elif '.J.R. ' in bill_id:
            bill_type = 'joint resolution'
        else:
            # Fail with a clear message instead of an unbound local below.
            raise ValueError('unknown bill type for %s' % bill_id)

        # Drop the blacklisted flags from the identifier, then collapse the
        # whitespace left behind.
        for flag in SUB_BLACKLIST:
            if flag in bill_id:
                bill_id = bill_id.replace(flag, " ")
        bill_id = re.sub(r"\s+", " ", bill_id).strip()

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        primary_info = page.xpath('//div[@id="billsponsordiv"]')
        for info in primary_info:
            # Each sponsor div holds exactly two non-blank texts: the label
            # and the sponsor's name.
            (label,
             name) = [x.strip() for x in info.xpath('.//text()') if x.strip()]
            if label != "Bill Sponsor:":
                raise AssertionError("Unexpected bill sponsor HTML found")
            name = name.replace("Sen. ", "").replace("Rep. ", "")
            bill.add_sponsorship(name,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
        floor_info = [x.strip() for x in floor_info if x.strip()]
        if len(floor_info) in (0, 1):
            # This indicates that no floor sponsor was found
            pass
        elif len(floor_info) == 2:
            if floor_info[0] != "Floor Sponsor:":
                raise AssertionError("Unexpected floor sponsor HTML found")
            floor_sponsor = floor_info[1].replace("Sen. ",
                                                  "").replace("Rep. ", "")
            bill.add_sponsorship(floor_sponsor,
                                 classification='cosponsor',
                                 entity_type='person',
                                 primary=False)
        else:
            raise AssertionError("Unexpected floor sponsor HTML found")

        versions = page.xpath(
            '//b[text()="Bill Text"]/following-sibling::ul/li/'
            'a[text() and not(text()=" ")]')

        for version in versions:
            # sometimes the href is on the following <a> tag and the tag we
            # have has an onclick
            version_url = version.get('href')
            if not version_url:
                version_url = version.xpath(
                    'following-sibling::a[1]/@href')[0]

            bill.add_version_link(version.xpath('text()')[0].strip(),
                                  version_url,
                                  media_type='application/pdf')

        for related in page.xpath(
                '//b[text()="Related Documents "]/following-sibling::ul/li/'
                'a[contains(@class,"nlink")]'):
            href = related.xpath('@href')[0]
            # '.fn.pdf' links get a fixed name; everything else keeps its
            # link text.
            if '.fn.pdf' in href:
                doc_name = "Fiscal Note"
            else:
                doc_name = related.xpath('text()')[0]
            bill.add_document_link(doc_name,
                                   href,
                                   media_type='application/pdf')

        bill.subject = [
            link.text.strip()
            for link in page.xpath("//a[contains(@href, 'RelatedBill')]")
        ]

        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

        yield bill
Esempio n. 48
0
    def scrape(self, window=28) :
        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
        for matter in self.matters(n_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Board of Directors"})
            
            legistar_web = matter['legistar_url']
            
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(body_name,
                                           'organization',
                                           entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill. 
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(identifier=identifier,
                                          legislative_session=related_bill_session,
                                          relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link('Board Report',
                                  'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
                                   media_type="application/pdf")

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_document_link(attachment['MatterAttachmentName'],
                                           attachment['MatterAttachmentHyperlink'],
                                           media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill
Esempio n. 49
0
    def scrape_bill(self, session, history_url):
        """Scrape a single bill from its XML history document.

        Fetches ``history_url``, builds a ``Bill`` from the ``caption``
        element and the ``bill`` attribute of the root element, then attaches
        subjects, version/document links, actions, sponsorships and (when the
        ``companions`` element has text) companions.

        :param session: legislative session passed through to ``Bill``.
        :param history_url: URL of the bill's XML history file.
        :raises ScrapeError: if the bill id is not recognised as a bill or a
            (concurrent/joint) resolution.

        Yields the populated ``Bill``. Nothing is yielded when the history
        has no caption or contains the text "Bill does not exist".
        """
        history_xml = self.get(history_url).text
        root = etree.fromstring(history_xml)

        bill_title = root.findtext("caption")
        if bill_title is None or "Bill does not exist" in history_xml:
            self.warning("Bill does not appear to exist")
            return
        # Drop the first space-separated token of the 'bill' attribute; the
        # remainder is used as the identifier.
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

        chamber = self.CHAMBERS[bill_id[0]]

        if bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        elif bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=bill_title,
                    classification=bill_type)

        bill.add_source(history_url)

        for subject in root.iterfind('subjects/subject'):
            bill.add_subject(subject.text.strip())

        # Each entry is indexed as (bill id, url); the character five from the
        # end of the url is the key into NAME_SLUGS.
        for version in self.versions:
            if version[0] != bill_id:
                continue
            bill.add_version_link(note=self.NAME_SLUGS[version[1][-5]],
                                  url=version[1],
                                  media_type='text/html')

        # Supporting documents share one shape; only the source collection
        # and the note prefix differ. Order matches the original emission
        # order (analyses, fiscal notes, witness lists).
        supporting_documents = (
            (self.analyses, "Analysis ({})"),
            (self.fiscal_notes, "Fiscal Note ({})"),
            (self.witnesses, "Witness List ({})"),
        )
        for collection, note_template in supporting_documents:
            for entry in collection:
                if entry[0] != bill_id:
                    continue
                bill.add_document_link(
                    note=note_template.format(self.NAME_SLUGS[entry[1][-5]]),
                    url=entry[1],
                    media_type='text/html')

        # First character of the action number identifies the acting body.
        actor_by_prefix = {
            'H': 'lower',
            'S': 'upper',
            'E': 'executive'
        }

        # Must persist across actions: it records that a 'Read first time' /
        # 'Read & adopted' action already carried the 'introduction'
        # classification, so a later 'Read & adopted' does not repeat it.
        introduced = False

        for action in root.findall('actions/action'):
            act_date = datetime.datetime.strptime(action.findtext('date'),
                                                  "%m/%d/%Y").date()

            action_number = action.find('actionNumber').text
            actor = actor_by_prefix[action_number[0]]

            desc = action.findtext('description').strip()

            if desc == 'Scheduled for public hearing on . . .':
                self.warning("Skipping public hearing action with no date")
                continue

            # NOTE: atype is a str for most branches and a list for the
            # 'Read first time' / 'Read & adopted' branches.
            if desc == 'Amended':
                atype = 'amendment-passage'
            elif desc == 'Amendment(s) offered':
                atype = 'amendment-introduction'
            elif desc == 'Amendment amended':
                atype = 'amendment-amendment'
            elif desc == 'Amendment withdrawn':
                atype = 'amendment-withdrawal'
            elif desc == 'Passed' or desc == 'Adopted':
                atype = 'passage'
            elif re.match(r'^Received (by|from) the', desc):
                if 'Secretary of the Senate' not in desc:
                    atype = 'introduction'
                else:
                    atype = 'filing'
            elif desc.startswith('Sent to the Governor'):
                # But what if it gets lost in the mail?
                atype = 'executive-receipt'
            elif desc.startswith('Signed by the Governor'):
                atype = 'executive-signature'
            elif desc == 'Vetoed by the Governor':
                atype = 'executive-veto'
            elif desc == 'Read first time':
                atype = ['introduction', 'reading-1']
                introduced = True
            elif desc == 'Read & adopted':
                atype = ['passage']
                if not introduced:
                    introduced = True
                    atype.append('introduction')
            elif desc == "Passed as amended":
                atype = 'passage'
            elif (desc.startswith('Referred to')
                  or desc.startswith("Recommended to be sent to ")):
                atype = 'referral-committee'
            elif desc == "Reported favorably w/o amendment(s)":
                atype = 'committee-passage'
            elif desc == "Filed":
                atype = 'filing'
            elif desc == 'Read 3rd time':
                atype = 'reading-3'
            elif desc == 'Read 2nd time':
                atype = 'reading-2'
            elif desc.startswith('Reported favorably'):
                atype = 'committee-passage-favorable'
            else:
                atype = None

            # The unstripped description is what gets stored on the action.
            act = bill.add_action(action.findtext('description'),
                                  act_date,
                                  chamber=actor,
                                  classification=atype)

            if atype and 'referral-committee' in atype:
                # Whatever remains after removing the verb phrase is taken
                # to be the committee name.
                ctty = desc
                for prefix in ('Referred to', "Recommended to be sent to "):
                    ctty = ctty.replace(prefix, "").strip()
                act.add_related_entity(name=ctty, entity_type='organization')

        # (element name, classification, primary flag), in emission order.
        sponsor_fields = (
            ('authors', 'primary', True),
            ('coauthors', 'cosponsor', False),
            ('sponsors', 'primary', True),
            ('cosponsors', 'cosponsor', False),
        )
        for tag, classification, is_primary in sponsor_fields:
            # findtext() returns None when the element is missing entirely;
            # treat that the same as an empty list of names.
            names = root.findtext(tag) or ''
            for name in names.split(' | '):
                if name != "":
                    bill.add_sponsorship(name,
                                         classification=classification,
                                         entity_type='person',
                                         primary=is_primary)

        if root.findtext('companions'):
            self._get_companion(bill)

        yield bill
Esempio n. 50
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        :param chamber: chamber the bill is filed under; also the initial
            actor for actions until an action names a chamber.
        :param session: session string; its first four characters are used
            as the year when parsing action dates.
        :param bill_id: bill identifier; spaces are removed for the URL and
            the subject lookup.
        :param short_title: optional short title, added only when it differs
            (case-insensitively) from the scraped title.

        Yields whatever ``parse_vote`` yields for rows mentioning AYES/NAYS,
        and finally the populated ``Bill``.
        """
        compact_id = bill_id.replace(' ', '')
        url = BILL_URL % (session, compact_id)
        # NOTE(review): verify=False presumably disables TLS certificate
        # verification for this request -- confirm this is still required.
        bill_page = self.get(url, verify=False).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        # Table 0 holds the sponsor text, table 1 the title, table 2 the
        # action rows.
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title,
                    classification=bill_type)
        bill.add_source(url)
        for subject in self._subjects[compact_id]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, 'short title')

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(note=name, url=href, media_type="application/pdf")

        def _split(string):
            # Split a list of names on commas and on the standalone word
            # "AND" (optionally preceded by a comma). The previous pattern
            # used a character class ``[,|AND]``, which swallowed the name
            # before each separator and split on any word ending in A/N/D.
            return re.split(r"\s*,\s*(?:AND\s+)?|\s+AND\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if 'COMMITTEE' in sponsors.upper():
                    bill.add_sponsorship(name=sponsors.strip(), entity_type="organization",
                                         primary=True, classification='primary')
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(classification='primary', name=person,
                                                 entity_type="person", primary=True)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            # Rows without a date inherit the date of the last dated row.
            if date:
                last_date = date
            elif last_date is not None:
                date = last_date
            else:
                # No dated row seen yet, so there is nothing to inherit;
                # skip instead of failing on None + str below.
                self.warning('skipping undated action on %s: %s' % (bill_id, action))
                continue
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y").strftime('%Y-%m-%d')
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url)
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        yield bill
Esempio n. 51
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        """Scrape one bill from its sidebar page and its bill-history page.

        :param chamber: chamber passed through to ``Bill``.
        :param session: legislative session passed through to ``Bill``.
        :param session_id: general-assembly number used in history,
            version and amendment URLs.
        :param bill_id: bill identifier, e.g. as shown in ``billName=``.
        :param url: URL of the page holding the ``billVersions`` selector.

        Yields the populated ``Bill``. Nothing is yielded when the history
        request returns HTTP 500, when no title can be found, or when the
        action table reports "No history is recorded at this time.".
        """
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        hist_url = (f"https://www.legis.iowa.gov/legislation/billTracking/"
                    f"billHistory?billName={bill_id}&ga={session_id}")
        # NOTE(review): the session is only handed to scrape_subjects below;
        # the history page itself is fetched with a plain requests.get.
        req_session = requests.Session()
        req = requests.get(hist_url)
        if req.status_code == 500:
            self.warning("500 error on {}, skipping".format(hist_url))
            return

        page = lxml.html.fromstring(req.text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath('string(//div[@id="content"]/div[@class='
                           '"divideVert"]/div/div[4]/div[2])').strip()

        if title == "":
            # Sometimes the title is moved, see
            # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
            title = page.xpath('string(//div[@id="content"]/div[@class='
                               '"divideVert"]/div[4]/div[2])').strip()
            if title == "":
                self.warning("URL: %s gives us an *EMPTY* bill. Aborting." %
                             url)
                return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if "HR" in bill_id or "SR" in bill_id:
            bill_type = ["resolution"]
        elif "HJR" in bill_id or "SJR" in bill_id:
            bill_type = ["joint resolution"]
        elif "HCR" in bill_id or "SCR" in bill_id:
            bill_type = ["concurrent resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = (
            "https://www.legis.iowa.gov/docs/"
            "publications/LG{}/{}/attachments/{}.html")
        version_pdf_url_template = ("https://www.legis.iowa.gov/docs/"
                                    "publications/LG{}/{}/{}.pdf")
        compact_id = bill_id.replace(" ", "")

        # get pieces of version_link
        for version in sidebar.xpath('//select[@id="billVersions"]/option'):
            version_name = version.text
            version_abbrev = version.xpath("string(@value)").upper()

            # Get HTML document of bill version.
            bill.add_version_link(
                note=version_name,
                url=version_html_url_template.format(
                    version_abbrev, session_id, compact_id),
                media_type="text/html")

            # Get PDF document of bill version.
            version_pdf_url = version_pdf_url_template.format(
                version_abbrev, session_id, compact_id)

            # "Marked Up" versions take their PDF url from the embedded
            # document frame instead of the template.
            if "Marked Up" in version_name:
                version_pdf_url = sidebar.xpath(
                    "//iframe[@id='bbContextDoc']/@src")[0]

            bill.add_version_link(note=version_name,
                                  url=version_pdf_url,
                                  media_type="application/pdf")

        sponsors_str = page.xpath('string(//div[@id="content"]/div[@class='
                                  '"divideVert"]/div/div[4]/div[1])').strip()

        if re.search("^By ", sponsors_str):
            sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
        # for some bills sponsors listed in different format
        else:
            sponsors = re.findall(r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)",
                                  sponsors_str)

        # a few sponsors get mangled by our regex
        mangled_sponsor_fixes = {
            "Means": "Ways & Means",
            "Iowa": "Economic Growth/Rebuild Iowa",
            "Safety": "Public Safety",
            "Resources": "Human Resources",
            "Affairs": "Veterans Affairs",
            "Protection": "Environmental Protection",
            "Government": "State Government",
            "Boef": "De Boef",
        }

        for sponsor in sponsors:
            sponsor = sponsor.replace(" and", "").strip(" .,")
            sponsor = mangled_sponsor_fixes.get(sponsor, sponsor)

            # Empty fragments come from lists like "A, B, and C", where the
            # split leaves nothing between "," and " and ". Lower-case
            # fragments are cruft:
            # SSBs catch cruft in it ('charges', 'overpayments')
            # https://sunlight.atlassian.net/browse/DATA-286
            if not sponsor or sponsor[0].islower():
                continue

            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # Every link url already attached to the bill; kept in sync as
        # amendment links are added so the same url is never added twice.
        version_urls = {
            link["url"]
            for version in bill.versions
            for link in version["links"]
        }

        for tr in page.xpath(
                "//table[contains(@class, 'billActionTable')][1]/tbody/tr"):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                # Ends the generator without yielding the bill.
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[3])").strip()
            action = re.sub(r"\s+", " ", action)

            # Capture any amendment links.
            if "amendment" in action.lower():
                for anchor in tr.xpath(".//a[1]"):
                    # anchor.text is None when the link has no direct text.
                    anchor_text = anchor.text or ""
                    if "-" in anchor_text:
                        # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                        amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                        amd_id = anchor_text.replace("-", "").strip()
                        amd_url = amd_pattern.format(session_id, amd_id)
                        amd_name = "Amendment {}".format(anchor_text.strip())

                        if amd_url not in version_urls:
                            bill.add_version_link(note=amd_name,
                                                  url=amd_url,
                                                  media_type="application/pdf")
                            version_urls.add(amd_url)
                        else:
                            self.info(
                                "Already Added {}, skipping".format(amd_url))

            # Journal / committee-substitute markers identify the chamber.
            if "S.J." in action or "SCS" in action:
                actor = "upper"
            elif "H.J." in action or "HCS" in action:
                actor = "lower"
            else:
                actor = "legislature"

            # Drop a trailing journal page reference such as "H.J. 123."
            action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

            if action == "":
                continue

            if re.search(r"END OF \d+ ACTIONS", action):
                continue

            if action.startswith("Introduced"):
                atype = ["introduction"]
                if ", referred to" in action:
                    atype.append("referral-committee")
            elif action.startswith("Read first time"):
                atype = "reading-1"
            elif action.startswith("Referred to"):
                atype = "referral-committee"
            elif action.startswith("Sent to Governor"):
                atype = "executive-receipt"
            elif action.startswith("Reported Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Vetoed by Governor"):
                atype = "executive-veto"
            elif action.startswith("Item veto"):
                atype = "executive-veto-line-item"
            elif re.match(r"Passed (House|Senate)", action):
                atype = "passage"
            elif re.match(r"Amendment (S|H)-\d+ filed", action):
                atype = ["amendment-introduction"]
                if ", adopted" in action:
                    atype.append("amendment-passage")
            elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted",
                          action):
                atype = "amendment-passage"
            elif re.match(r"Amendment (S|H)-\d+ lost", action):
                # Was "(S|N)", which never matched House ("H-") amendments.
                atype = "amendment-failure"
            elif action.startswith("Resolution filed"):
                atype = "introduction"
            elif action.startswith("Resolution adopted"):
                atype = "passage"
            elif action.startswith("Committee report") and action.endswith(
                    "passage."):
                atype = "committee-passage"
            elif action.startswith("Withdrawn"):
                atype = "withdrawal"
            else:
                atype = None

            if "$history" not in action:
                bill.add_action(description=action,
                                date=date,
                                chamber=actor,
                                classification=atype)

        self.scrape_subjects(bill, bill_id, session, req_session)

        yield bill
Esempio n. 52
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supersedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape

        Yields each VoteEvent as it is built from a matter's actions, then
        the Bill for that matter. Restricted matters yield only a stripped
        Bill.
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        else:
            # :window is only interpreted here, so it cannot interfere when
            # :matter_ids is given.
            window_days = float(window)  # Support for partial days, i.e., window=0.15
            if window_days:
                n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                    window_days)
                matters = self.matters(n_days_ago)
            else:
                # Scrape all matters, including those without a last-modified date
                matters = self.matters()

        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            restricted = self._is_restricted(matter)

            # Do not scrape private bills introduced before this timestamp.
            if restricted and date < self.START_DATE_PRIVATE_SCRAPE:
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            # A leading 'S' is dropped from the primary identifier; the
            # original form is kept as an alternate identifier.
            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about private bills,
            # other than the value of the helper function `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously imported once-public bills,
            # and include only data *required* by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view': restricted}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if restricted:
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for alternate in alternate_identifiers:
                bill.add_identifier(alternate)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for ballot in votes:
                        # A vote value without .lower() (e.g. None) is
                        # looked up, and recorded, as None.
                        try:
                            raw_option = ballot['VoteValueName'].lower()
                        except AttributeError:
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        ballot['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, the request raises an error, and we continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    related_date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(
                        self.toTime(related_date))
                    # Currently, the relation type for bills can be one of a few possibilities: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.
                    bill.add_related_bill(
                        identifier=related_bill['MatterFile'],
                        legislative_session=related_bill_session,
                        relation_type='companion')

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName'] and self._show_attachment(
                        attachment):
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'].strip(),
                        media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    # Strip NUL characters from the RTF text before storing.
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Esempio n. 53
0
    def scrape_bill(self, session, bill_id, chamber):
        """Scrape a single bill detail page and yield the resulting ``Bill``.

        Generator. It finishes early (with a ``False`` return value and
        nothing yielded) when the page cannot be fetched, when the expected
        heading is absent, or when the page carries no title.

        :param session: session identifier; its digits are used in the URL
        :param bill_id: bill identifier as it appears in the URL, e.g. ``SD2739``
        :param chamber: chamber value passed through to ``Bill``
        """
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(
            session_for_url, bill_id)

        try:
            response = requests.get(bill_url)
            self.info("GET (with `requests`) - {}".format(bill_url))
        except requests.exceptions.RequestException:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        page = lxml.html.fromstring(response.text)

        # A page without the "followable" heading is treated as an error page.
        if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        # The state website will periodically miss a few bills' titles for a few days
        # These titles will be extant on the bill list page, but missing on the bill detail page
        # The titles are eventually populated
        try:
            bill_title = page.xpath(
                '//div[@id="contentContainer"]/div/div/h2/text()')[0]
        except IndexError:
            self.warning(
                "Couldn't find title for {}; skipping".format(bill_id))
            return False

        # Keep only the chamber/docket letters and digits.  Inside a character
        # class "|" is a literal, so the previous pattern `[^S|H|D|\d]` let
        # pipe characters through unintentionally.
        bill_id = re.sub(r'[^SHD\d]', '', bill_id)

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=bill_title,
                    classification='bill')

        # Evaluate the pinslip lookup once and reuse the result.
        pinslip = page.xpath('//p[@id="pinslip"]/text()')
        if pinslip and pinslip[0]:
            bill.add_abstract(pinslip[0], 'summary')

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dd following the Sponsor or Presenter
        # dt, including any child link text.
        sponsor = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]'
        )
        if sponsor:
            bill.add_sponsorship(sponsor[0].strip(),
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')

        self.scrape_cosponsors(bill, bill_url)

        # Only an enabled "Download PDF" button counts as a bill text version.
        version = page.xpath(
            "//div[contains(@class, 'modalBtnGroup')]/"
            "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version_link('Bill Text',
                                  version_url,
                                  media_type='application/pdf')

        # NOTE(review): the original comment says "XXX yield from"; if
        # scrape_actions is a generator its results are currently dropped --
        # confirm against its definition.
        self.scrape_actions(bill, bill_url, session)
        yield bill
Esempio n. 54
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
        """Scrape a bill history page; yield its votes and then the ``Bill``.

        Generator. House votes linked from the page and senate votes linked
        from "passed senate" actions are yielded as they are found; the bill
        itself is yielded last.

        :param session: legislative session passed through to ``Bill``
        :param chamber: chamber of origin passed through to ``Bill``
        :param bill_id: identifier such as ``"HB 1234"``; the letters after the
            chamber letter of the first word select the bill type
        :param title: bill title
        :param url: address of the bill history page
        :param strip_sponsors: bound ``sub`` of a compiled pattern, used to
            drop short parenthesised fragments from sponsor strings
        """
        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        # "HB 1234" -> "B": drop the chamber letter of the first word.
        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill.subject = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version_link(**version)

        # Resolution pages have different html.
        # Collect "HEADING: value" rows of the history table into a dict.
        values = {}
        for tr in page.xpath('//div[@id="bhistcontent"]/table/tr'):
            heading = tr.xpath('td/strong/text()')
            if not heading:
                continue
            heading = heading[0]
            values[heading] = tr.text_content().replace(heading, '').strip()

        # summary was always same as title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsorship(name=primary,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            # Split on ", " unless it is followed by an initial ("D. Hall").
            # Raw strings avoid the invalid "\." escape of the old literals.
            for raw_name in re.split(r', (?![A-Z]\.)', sponsors):
                raw_name = raw_name.strip(', \n\r')
                if not raw_name:
                    continue
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search(r'(.+?), ([DM]\. Hall)', raw_name)
                cosponsors = match.groups() if match else (raw_name,)
                for cosponsor in cosponsors:
                    bill.add_sponsorship(name=cosponsor,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False)

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            yield from self.scrape_house_vote(bill, link.attrib['href'])

        # Skip the first row and walk the rest of the action table in reverse.
        action_rows = page.xpath(
            "//table[@class='tabborder']/descendant::tr")[1:]
        for tr in reversed(action_rows):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            # Use dedicated names so the `chamber` parameter and the `values`
            # dict above are not overwritten inside the loop.
            chamber_letter = tds[0].text_content()
            action_chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date_text = tds[2].text_content().strip()
            action_date = datetime.datetime.strptime(
                date_text, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    yield from self.scrape_senate_vote(bill, href, action_date)

            categorized = self.categorizer.categorize(action)
            # Every key other than 'classification' names an entity type
            # whose values are the related entity names.
            related_entities = [
                {"type": entity_type, "name": entity_name}
                for entity_type, entity_names in categorized.items()
                if entity_type != 'classification'
                for entity_name in entity_names
            ]
            bill.add_action(chamber=action_chamber,
                            description=action,
                            date=action_date.strftime("%Y-%m-%d"),
                            classification=categorized['classification'],
                            related_entities=related_entities)

        yield bill
Esempio n. 55
0
    def _parse_senate_billpage(self, bill_url, year):
        """Build and yield the ``Bill`` described by a senate bill page.

        Reads the identifier, title and brief description from the page, then
        attaches subjects, the primary sponsor, cosponsors, actions, versions
        and adopted amendments. Yields nothing for the placeholder bill whose
        compacted identifier is ``XXXXXX``. ``year`` is not used here.
        """
        bill_page = self.lxmlize(bill_url)

        def first_text(xpath):
            # Text content of the first node matching the expression.
            return bill_page.xpath(xpath)[0].text_content()

        # get all the info needed to record the bill
        # TODO probably still needs to be fixed
        bill_id = first_text('//*[@id="lblBillNum"]')
        bill_title = first_text('//*[@id="lblBillTitle"]')
        bill_desc = first_text('//*[@id="lblBriefDesc"]')
        # bill_lr = first_text('//*[@id="lblLRNum"]')

        # The first three characters of the id choose the classification.
        prefix = bill_id[:3]
        bill_type = bill_types[prefix] if prefix in bill_types else "bill"

        compact_id = bill_id.replace(" ", "")
        has_subjects = compact_id in self._subjects
        subjects = self._subjects[compact_id] if has_subjects else []
        if has_subjects:
            self.info("With subjects for this bill")

        self.info(compact_id)

        if compact_id == 'XXXXXX':
            self.info("Skipping Junk Bill")
            return

        bill = Bill(
            bill_id,
            title=bill_desc,
            chamber='upper',
            legislative_session=self._session_id,
            classification=bill_type,
        )
        bill.subject = subjects
        bill.add_abstract(bill_desc, note='abstract')
        bill.add_source(bill_url)

        if bill_title:
            bill.add_title(bill_title)

        # Get the primary sponsor: prefer the link, fall back to the span.
        sponsor_links = bill_page.xpath('//a[@id="hlSponsor"]')
        if sponsor_links:
            sponsor_node = sponsor_links[0]
        else:
            sponsor_node = bill_page.xpath('//span[@id="lSponsor"]')[0]

        # bill_sponsor_link = sponsor_node.attrib.get('href')
        bill.add_sponsorship(
            sponsor_node.text_content(),
            entity_type='person',
            classification='primary',
            primary=True,
        )

        # cosponsors show up on their own page, if they exist
        cosponsor_links = bill_page.xpath('//a[@id="hlCoSponsors"]')
        if cosponsor_links and cosponsor_links[0].attrib.get('href'):
            self._parse_senate_cosponsors(
                bill, cosponsor_links[0].attrib['href'])

        # get the actions
        action_links = bill_page.xpath('//a[@id="hlAllActions"]')
        if action_links:
            self._parse_senate_actions(bill, action_links[0].attrib['href'])

        # versions are stored on a separate page
        version_links = bill_page.xpath('//a[@id="hlFullBillText"]')
        if version_links and version_links[0].attrib.get('href'):
            self._parse_senate_bill_versions(
                bill, version_links[0].attrib['href'])

        # Adopted amendments are recorded as additional versions.
        for link in bill_page.xpath('//a[contains(@href,"ShowAmendment.asp")]'):
            link_text = link.xpath('string(.)').strip()
            if 'adopted' not in link_text.lower():
                continue
            bill.add_version_link(link_text,
                                  link.xpath('@href')[0],
                                  media_type='application/pdf',
                                  on_duplicate='ignore')

        yield bill
Esempio n. 56
0
    def old_scrape(self, session=None):
        """Scrape bills from the Excel status reports of one session.

        Generator. For every data row of every report it yields whatever
        ``scrape_votes_old`` yields for the bill, followed by the ``Bill``.

        :param session: session name used to locate the report table; when
            falsy, ``self.latest_session()`` is used instead
        """
        status_report_url = "http://www.legislature.ohio.gov/legislation/status-reports"

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        # ssl verification off due Ohio not correctly implementing SSL
        doc = self.get(status_report_url, verify=False).text
        doc = lxml.html.fromstring(doc)
        doc.make_links_absolute(status_report_url)
        xpath = "//div[contains(text(),'{}')]/following-sibling::table"
        status_table = doc.xpath(xpath.format(session))[0]
        status_links = status_table.xpath(
            ".//a[contains(text(),'Excel')]/@href")

        for url in status_links:

            try:
                fname, resp = self.urlretrieve(url)
            except scrapelib.HTTPError as report:
                self.logger.warning("Missing report {}".format(report))
                continue

            sh = xlrd.open_workbook(fname).sheet_by_index(0)

            # once workbook is open, we can remove tempfile
            os.remove(fname)
            # Row 0 holds the column headers; data starts at row 1.
            for rownum in range(1, sh.nrows):
                bill_id = sh.cell(rownum, 0).value

                bill_type = "resolution" if "R" in bill_id else "bill"
                chamber = "lower" if "H" in bill_id else "upper"

                bill_title = str(sh.cell(rownum, 3).value)

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=bill_title,
                            classification=bill_type)
                bill.add_source(url)
                bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

                # add cosponsor
                if sh.cell(rownum, 2).value:
                    bill.add_sponsor('cosponsor',
                                     str(sh.cell(rownum, 2).value))

                # The actor carries over from column to column until a header
                # names a different one.
                actor = ""

                # Actions start column after bill title
                for colnum in range(4, sh.ncols - 1):
                    action = str(sh.cell(0, colnum).value)
                    date = sh.cell(rownum, colnum).value

                    # A whitespace-only header has no words; leave actor as is.
                    words = action.split()
                    if words:
                        if words[0] == 'House':
                            actor = "lower"
                        elif words[0] == 'Senate':
                            actor = "upper"
                        elif words[-1] == 'Governor':
                            actor = "executive"
                        elif words[0] == 'Gov.':
                            actor = "executive"
                        elif words[-1] == 'Gov.':
                            actor = "executive"

                    if action in ('House Intro. Date', 'Senate Intro. Date'):
                        atype = ['bill:introduced']
                        action = action.replace('Intro. Date', 'Introduced')
                    elif action == '3rd Consideration':
                        atype = ['bill:reading:3', 'bill:passed']
                    elif action == 'Sent to Gov.':
                        atype = ['governor:received']
                    elif action == 'Signed By Governor':
                        atype = ['governor:signed']
                    else:
                        atype = ['other']

                    # Only cells holding a float (an Excel serial date)
                    # produce an action.
                    if isinstance(date, float):
                        date = datetime.datetime(
                            *xlrd.xldate_as_tuple(date, 0))
                        date = self._tz.localize(date)
                        date = "{:%Y-%m-%d}".format(date)
                        bill.add_action(actor, action, date, type=atype)

                # Insert an underscore before the first digit ("HB12" ->
                # "HB_12").  Reset per row so a bill id without digits can
                # never reuse the previous row's value.
                underscore_bill = None
                for idx, char in enumerate(bill_id):
                    try:
                        int(char)
                    except ValueError:
                        continue

                    underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                    break

                if underscore_bill is None:
                    self.logger.warning(
                        "No number found in bill id {!r}; skipping votes "
                        "and versions".format(bill_id))
                else:
                    yield from self.scrape_votes_old(bill, underscore_bill,
                                                     session)
                    self.scrape_versions_old(bill, underscore_bill, session)
                yield bill
Esempio n. 57
0
def test_full_bill():
    """Import a fully populated scraped bill and check every imported part."""
    create_jurisdiction()
    adam = Person.objects.create(id='person-id', name='Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(name='Arbitrary Committee',
                                   classification='committee',
                                   parent_id=house._id)

    # Bill from the earlier session, referenced below as a related bill.
    prior_bill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                            classification='tax bill',
                            from_organization=house._id)

    scraped = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                         classification='tax bill',
                         from_organization=house._id)
    scraped.subject = ['taxes', 'axes']
    scraped.add_identifier('SB 9')
    scraped.add_title('Tack & Axe Tax Act')
    scraped.add_action('introduced in house', '1900-04-01', chamber='lower')
    referral = scraped.add_action('sent to arbitrary committee', '1900-04-04',
                                  chamber='lower')
    referral.add_related_entity('arbitrary committee', 'organization',
                                committee._id)
    scraped.add_related_bill("HB 99", legislative_session="1899",
                             relation_type="prior-session")
    scraped.add_sponsorship('Adam Smith', classification='extra sponsor',
                            entity_type='person', primary=False,
                            entity_id=adam.id)
    scraped.add_sponsorship('Jane Smith', classification='lead sponsor',
                            entity_type='person', primary=True)
    scraped.add_abstract('This is an act about axes and taxes and tacks.',
                         note="official")
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                              media_type='application/pdf')
    scraped.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                              media_type='text/html')
    scraped.add_version_link('Fiscal Note', 'http://example.com/v/1',
                             media_type='text/html')
    scraped.add_source('http://example.com/source')

    # Import the organizations first, then the bills.
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])

    # The person was created behind the back of the import transaction, so
    # fake the json-id to db-id mapping (the two ids match in this case).
    # This leans on an implementation detail, but it is the cleanest way to
    # short-circuit the json id lookup.
    person_importer = PersonImporter('jid')
    person_importer.json_to_db_id['person-id'] = 'person-id'

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([prior_bill.as_dict(), scraped.as_dict()])

    # Fetch the bill from the database and check the core fields.
    imported = Bill.objects.get(identifier='HB 1')
    assert imported.from_organization.classification == 'lower'
    assert imported.identifier == scraped.identifier
    assert imported.title == scraped.title
    assert imported.classification == scraped.classification
    assert imported.subject == ['taxes', 'axes']
    assert imported.abstracts.get().note == 'official'

    # The alternate title and identifier were stored.
    assert imported.other_titles.get().title == 'Tack & Axe Tax Act'
    assert imported.other_identifiers.get().identifier == 'SB 9'

    # Actions: both present and in order (a failure here would be
    # intermittent).
    stored_actions = list(imported.actions.all())
    assert len(stored_actions) == 2
    first_action, second_action = stored_actions[0], stored_actions[1]
    assert first_action.organization == Organization.objects.get(classification='lower')
    assert first_action.description == "introduced in house"
    assert second_action.description == "sent to arbitrary committee"
    assert (second_action.related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # The related bill was recorded and resolved to the imported HB 99.
    related = imported.related_bills.get()
    assert related.identifier == 'HB 99'
    assert related.related_bill.identifier == 'HB 99'

    # Sponsors: the primary one stays unlinked, the other links to the person.
    stored_sponsorships = imported.sponsorships.all()
    assert len(stored_sponsorships) == 2
    for sponsorship in stored_sponsorships:
        if not sponsorship.primary:
            assert sponsorship.person == adam
        else:
            assert sponsorship.person is None
            assert sponsorship.organization is None

    # One version with one link; one document with two links.
    stored_versions = imported.versions.all()
    assert len(stored_versions) == 1
    assert stored_versions[0].links.count() == 1
    stored_documents = imported.documents.all()
    assert len(stored_documents) == 1
    assert stored_documents[0].links.count() == 2

    # Exactly one source.
    assert imported.sources.count() == 1
Esempio n. 58
0
    def get_bill_info(self, chamber, session, bill_detail_url,
                      version_list_url):
        """
        Extracts all the requested info for a given bill.

        Generator: yields the populated ``Bill``, or nothing when the page
        says the bill has not been transmitted to the other chamber.

        :param chamber: chamber name; ``House``/``Senate`` (any case) are
            mapped to ``lower``/``upper``, other values pass through unchanged
        :param session: legislative session passed through to ``Bill``
        :param bill_detail_url: address of the bill detail page
        :param version_list_url: passed through to ``extract_versions``
        """
        chamber = {'house': 'lower', 'senate': 'upper'}.get(
            chamber.lower(), chamber)

        # Get html and parse
        doc = self.lxmlize(bill_detail_url)

        # Check if bill hasn't been transmitted to the other chamber yet
        transmit_check = self.get_node(
            doc,
            '//h1[text()[contains(.,"Bills")]]/following-sibling::ul/li/text()'
        )
        if (transmit_check is not None
                and 'has not been transmitted' in transmit_check.strip()):
            self.logger.debug('Bill has not been transmitted to other chamber '
                              '... skipping {0}'.format(bill_detail_url))
            return

        # Get the basic parts of the bill
        bill_id = self.get_node(doc, '//h1/text()')
        self.logger.debug(bill_id)
        bill_title_text = self.get_node(
            doc,
            '//h2[text()[contains(.,"Description")]]/following-sibling::p/text()'
        )
        if bill_title_text is not None:
            bill_title = bill_title_text.strip()
        else:
            # No inline description: fall back to the long description page.
            long_desc_url = self.get_node(
                doc, '//a[text()[contains(.,"Long Description")]]/@href')
            long_desc_text = None
            # Only fetch the page when the link exists; a missing link used
            # to be passed to lxmlize as None.
            if long_desc_url is not None:
                long_desc_page = self.lxmlize(long_desc_url)
                long_desc_text = self.get_node(
                    long_desc_page, '//h1/following-sibling::p/text()')
            if long_desc_text is not None:
                bill_title = long_desc_text.strip()
            else:
                bill_title = 'No title found.'
                self.logger.warning('No title found for {}.'.format(bill_id))
        self.logger.debug(bill_title)

        # The second character of the bill id selects the classification.
        bill_type = {
            'F': 'bill',
            'R': 'resolution',
            'C': 'concurrent resolution'
        }[bill_id[1]]
        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=bill_title,
                    classification=bill_type)

        # Add source
        bill.add_source(bill_detail_url)

        for subject in self._subject_mapping[bill_id]:
            bill.add_subject(subject)

        # Get companion bill.
        companion_links = doc.xpath(
            '//table[@class="status_info"]//tr[1]/td[2]'
            '/a[starts-with(@href, "?")]/text()')
        companion = (self.make_bill_id(companion_links[0])
                     if companion_links else None)
        if companion is not None:
            # Resolve the chamber only when a companion actually exists.
            companion_chamber = self.chamber_from_bill(companion)
            bill.add_companion(companion, chamber=companion_chamber)

        # Grab sponsors
        bill = self.extract_sponsors(bill, doc, chamber)

        # Add Actions performed on the bill.
        bill = self.extract_actions(bill, doc, chamber)

        # Get all versions of the bill.
        bill = self.extract_versions(bill, doc, chamber, version_list_url)

        yield bill
Esempio n. 59
0
    def scrape(self):
        """Scrape every bill of the latest session through the API client.

        Generator. For each bill it yields whatever ``deal_with_version``
        yields for each of the bill's versions, followed by the ``Bill``.
        Bills whose detail record cannot be fetched are skipped with a warning.
        """
        session_name = self.latest_session()
        session = session_name[0:5]
        self._bill_prefix_map = {
            'HB':  {
                'type': 'bill',
                'url_segment': 'bills/house',
            },
            'HR':  {
                'type': 'resolution',
                'url_segment': 'resolutions/house/simple',
            },
            'HCR': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/house/concurrent',
            },
            'HJR': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/house/joint'
            },
            'HC': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/house/concurrent',
            },
            'HJ': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/house/joint',
            },
            'SB': {
                'type': 'bill',
                'url_segment': 'bills/senate',
            },
            'SR': {
                'type': 'resolution',
                'url_segment': 'resolutions/senate/simple',
            },
            'SCR': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/senate/concurrent',
            },
            'SJR': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/senate/joint',
            },
            'SC': {
                'type': 'concurrent resolution',
                'url_segment': 'resolutions/senate/concurrent',
            },
            'SJ': {
                'type': 'joint resolution',
                'url_segment': 'resolutions/senate/joint',
            },
        }

        api_base_url = "https://api.iga.in.gov"
        proxy = {"url": "http://in-proxy.openstates.org"}

        # ah, indiana. it's really, really hard to find
        # pdfs in their web interface. Super easy with
        # the api, but a key needs to be passed
        # in the headers. To make these documents
        # viewable to the public and our scrapers,
        # sunlight's put up a proxy service at this link
        # using our api key for pdf document access.

        client = ApiClient(self)
        r = client.get("bills", session=session)
        all_pages = client.unpaginate(r)
        for b in all_pages:
            bill_id = b["billName"]

            # Display form: prefix, a space, then the number without leading
            # zeros ("HB1001" -> "HB 1001").  Start from the raw id so an id
            # without digits never reuses the previous bill's display id.
            disp_bill_id = bill_id
            for idx, char in enumerate(bill_id):
                try:
                    int(char)
                except ValueError:
                    continue
                disp_bill_id = bill_id[:idx]+" "+str(int(bill_id[idx:]))
                break

            bill_link = b["link"]
            api_source = api_base_url + bill_link
            try:
                bill_json = client.get("bill", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning('Bill could not be accessed. Skipping.')
                continue

            title = bill_json["title"]
            if title == "NoneNone":
                title = None
            # sometimes title is blank
            # if that's the case, we can check to see if
            # the latest version has a short description
            if not title:
                title = bill_json["latestVersion"]["shortDescription"]

            # and if that doesn't work, use the bill_id but throw a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")

            bill_prefix = self._get_bill_id_components(bill_id)[0]

            original_chamber = ("lower" if bill_json["originChamber"].lower() == "house"
                                else "upper")
            bill_type = self._bill_prefix_map[bill_prefix]['type']
            bill = Bill(disp_bill_id,
                        legislative_session=session,
                        chamber=original_chamber,
                        title=title,
                        classification=bill_type)

            bill.add_source(self._get_bill_url(session, bill_id))
            bill.add_source(api_source)

            # sponsors: (json key, classification, primary flag)
            sponsor_groups = (
                ("authors", "author", True),
                ("coauthors", "coauthor", False),
                ("sponsors", "sponsor", True),
                ("cosponsors", "cosponsor", False),
            )
            for json_key, classification, is_primary in sponsor_groups:
                for s in bill_json[json_key]:
                    bill.add_sponsorship(classification=classification,
                                         name=self._get_name(s),
                                         entity_type='person',
                                         primary=is_primary)

            # actions
            try:
                actions = client.get("bill_actions", session=session, bill_id=bill_id.lower())
            except scrapelib.HTTPError:
                self.logger.warning("Could not find bill actions page")
                actions = {"items": []}

            for item in actions["items"]:
                action_desc = item["description"]
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
                elif item["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
                date = item["date"]

                if not date:
                    self.logger.warning("Action has no date, skipping")
                    continue

                # convert time to pupa fuzzy time
                date = date.replace('T', ' ')
                # TODO: if we update pupa to accept datetimes we can drop this line
                date = date.split()[0]

                action_type = []
                d = action_desc.lower()
                committee = None

                reading = False
                if "first reading" in d:
                    action_type.append("reading-1")
                    reading = True

                if "second reading" in d or "reread second time" in d:
                    action_type.append("reading-2")
                    reading = True

                if "third reading" in d or "reread third time" in d:
                    action_type.append("reading-3")
                    if "passed" in d:
                        action_type.append("passage")
                    if "failed" in d:
                        action_type.append("failure")
                    reading = True

                if "adopted" in d and reading:
                    action_type.append("passage")

                if "committee on" in d and ("referred" in d or "reassigned" in d):
                    # Everything after the last "committee on" is the name.
                    committee = d.split("committee on")[-1].strip()
                    action_type.append("referral-committee")

                if "committee report" in d:
                    if "pass" in d:
                        action_type.append("committee-passage")
                    if "fail" in d:
                        action_type.append("committee-failure")

                if "amendment" in d and "without amendment" not in d:
                    if "pass" in d or "prevail" in d or "adopted" in d:
                        action_type.append("amendment-passage")
                    # Test each keyword against the description.  The previous
                    # `"fail" or "out of order" in d` was always true, so every
                    # amendment action was tagged as a failure.
                    if "fail" in d or "out of order" in d:
                        action_type.append("amendment-failure")
                    if "withdraw" in d:
                        action_type.append("amendment-withdrawal")

                if "signed by the governor" in d:
                    action_type.append("executive-signature")

                if not action_type:
                    # calling it other and moving on with a warning
                    self.logger.warning("Could not recognize an action in '{}'".format(
                        action_desc))
                    action_type = None

                action = bill.add_action(chamber=action_chamber,
                                         description=action_desc,
                                         date=date,
                                         classification=action_type)
                if committee:
                    action.add_related_entity(committee, entity_type='organization')

            # subjects
            for subject_entry in bill_json["latestVersion"]["subjects"]:
                bill.add_subject(subject_entry["entry"])

            # versions and votes, walking the API's version list in reverse
            for version in bill_json["versions"][::-1]:
                try:
                    version_json = client.get("bill_version",
                                              session=session,
                                              bill_id=version["billName"],
                                              version_id=version["printVersionName"])
                except scrapelib.HTTPError:
                    self.logger.warning("Bill version does not seem to exist.")
                    continue

                yield from self.deal_with_version(version_json, bill, bill_id,
                                                  original_chamber, session, proxy)

            yield bill
Esempio n. 60
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.

        The bill page is fetched and its ``bill-table`` tables are read
        positionally: the first holds the sponsor text, the second the
        title, and the third the action history.

        This is a generator: it yields whatever ``self.parse_vote`` yields
        for each action row mentioning AYES/NAYS, and finally the bill.

        :param chamber: chamber the bill is filed under; also used as the
            initial actor for actions.
        :param session: session identifier; it is interpolated into the
            URLs and its first four characters are used as the year of
            every action date.
        :param bill_id: bill identifier; spaces are stripped when building
            the URL and when looking up subjects in ``self._subjects``.
        :param short_title: optional short title, added to the bill only
            when it differs (case-insensitively) from the page title.
        """
        url = BILL_URL % (session, bill_id.replace(" ", ""))
        bill_page = self.get(url).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            "http://legislature.idaho.gov/legislation/%s/" % session
        )
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(
            legislative_session=session,
            chamber=chamber,
            identifier=bill_id,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(" ", "")]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, "short title")

        # documents: bill text, engrossments and amendments are versions,
        # every other link is attached as a plain document
        doc_links = html.xpath('//div[contains(@class,"insert-page")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get("href")
            if "Engrossment" in name or "Bill Text" in name or "Amendment" in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(
                    note=name, url=href, media_type="application/pdf"
                )

        def _split(string):
            # Split a sponsor list on commas and on the standalone word
            # "AND". Only the separator itself is matched, so the names on
            # both sides are preserved (the previous pattern used a
            # character class and consumed the word before the separator,
            # silently dropping sponsors).
            return re.split(r"\s*(?:,|\bAND\b)\s*", string)

        # sponsors range from a committee to one legislator to a group of legs
        # NOTE(review): this is a plain substring split, so a name that
        # itself contains "by" would also be split -- confirm against the
        # page markup before tightening it.
        sponsor_lists = bill_tables[0].text_content().split("by")
        # everything before the first "by" is not sponsor text
        for sponsors in sponsor_lists[1:]:
            if "COMMITTEE" in sponsors.upper():
                bill.add_sponsorship(
                    name=sponsors.strip(),
                    entity_type="organization",
                    primary=True,
                    classification="primary",
                )
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person:
                        bill.add_sponsorship(
                            classification="primary",
                            name=person,
                            entity_type="person",
                            primary=True,
                        )

        actor = chamber
        last_date = None
        # if a bill has passed a chamber or been 'received from'
        # then the next committee passage is in the opposite chamber
        has_moved_chambers = False
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            # rows without a date inherit the date of the previous dated row
            # NOTE(review): if the very first row has no date, ``date`` is
            # None here and the concatenation below raises TypeError.
            if date:
                last_date = date
            else:
                date = last_date
            # the page only gives month/day; the year comes from the session
            date = datetime.datetime.strptime(
                date + "/" + session[0:4], "%m/%d/%Y"
            ).strftime("%Y-%m-%d")
            if action.startswith("House"):
                actor = "lower"
            elif action.startswith("Senate"):
                actor = "upper"

            # votes
            if "AYES" in action or "NAYS" in action:
                yield from self.parse_vote(
                    actor, date, row[2], session, bill_id, chamber, url
                )
            # some td's text is separated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace("\xa0", " ").strip()
            atype = get_action(actor, action)
            if atype and "passage" in atype:
                has_moved_chambers = True

            if atype and "committee-passage" in atype and has_moved_chambers:
                actor = _OTHER_CHAMBERS[actor]

            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if "to House" in action:
                actor = "lower"
            elif "to Senate" in action:
                actor = "upper"
        yield bill